BlendMMM committed on
Commit bb080e9
1 Parent(s): 45dc323

Upload 28 files

Files changed (6)
  1. Data_Import.py +211 -846
  2. README.md +1 -1
  3. classes.py +106 -130
  4. upf_data_converted.csv +0 -0
  5. upf_data_converted.xlsx +2 -2
  6. utilities.py +263 -534
Data_Import.py CHANGED
@@ -1,58 +1,79 @@
1
  # Importing necessary libraries
2
  import streamlit as st
 
3
 
4
  st.set_page_config(
5
- page_title="Data Import",
6
  page_icon=":shark:",
7
  layout="wide",
8
  initial_sidebar_state="collapsed",
9
  )
10
 
11
- import pickle
 
12
  import pandas as pd
13
  from utilities import set_header, load_local_css
14
- import streamlit_authenticator as stauth
15
- import yaml
16
- from yaml import SafeLoader
17
 
18
  load_local_css("styles.css")
19
  set_header()
20
 
21
 
22
  for k, v in st.session_state.items():
23
- if k not in ["logout", "login", "config"] and not k.startswith(
24
- "FormSubmitter"
25
- ):
26
  st.session_state[k] = v
27
- with open("config.yaml") as file:
28
- config = yaml.load(file, Loader=SafeLoader)
29
- st.session_state["config"] = config
30
- authenticator = stauth.Authenticate(
31
- config["credentials"],
32
- config["cookie"]["name"],
33
- config["cookie"]["key"],
34
- config["cookie"]["expiry_days"],
35
- config["preauthorized"],
36
- )
37
- st.session_state["authenticator"] = authenticator
38
- name, authentication_status, username = authenticator.login("Login", "main")
39
- auth_status = st.session_state.get("authentication_status")
40
 
41
- if auth_status == True:
42
- authenticator.logout("Logout", "main")
43
- is_state_initiaized = st.session_state.get("initialized", False)
 
 
 
44
 
 
 
45
  if not is_state_initiaized:
46
-
47
- if 'session_name' not in st.session_state:
48
- st.session_state['session_name']=None
49
 
 
50
 
51
- # Function to validate date column in dataframe
 
52
  def validate_date_column(df):
53
  try:
54
  # Attempt to convert the 'Date' column to datetime
55
- df["date"] = pd.to_datetime(df["date"], format="%d-%m-%Y")
56
  return True
57
  except:
58
  return False
@@ -70,786 +91,196 @@ if auth_status == True:
70
  return "irregular"
71
 
72
 
73
- # Function to read each uploaded Excel file into a pandas DataFrame and store them in a dictionary
74
- st.cache_resource(show_spinner=False)
75
-
 
 
76
 
77
- def files_to_dataframes(uploaded_files):
78
- df_dict = {}
79
- for uploaded_file in uploaded_files:
80
- # Extract file name without extension
81
- file_name = uploaded_file.name.rsplit(".", 1)[0]
 
 
82
 
83
- # Check for duplicate file names
84
- if file_name in df_dict:
85
- st.warning(
86
- f"Duplicate File: {file_name}. This file will be skipped.",
87
- icon="⚠️",
88
- )
89
- continue
90
-
91
- # Read the file into a DataFrame
92
- df = pd.read_excel(uploaded_file)
93
-
94
- # Convert all column names to lowercase
95
- df.columns = df.columns.str.lower().str.strip()
96
-
97
- # Separate numeric and non-numeric columns
98
- numeric_cols = list(df.select_dtypes(include=["number"]).columns)
99
- non_numeric_cols = [
100
- col
101
- for col in df.select_dtypes(exclude=["number"]).columns
102
- if col.lower() != "date"
103
- ]
104
-
105
- # Check for 'Date' column
106
- if not (validate_date_column(df) and len(numeric_cols) > 0):
107
- st.warning(
108
- f"File Name: {file_name} ➜ Please upload data with Date column in 'DD-MM-YYYY' format and at least one media/exogenous column. This file will be skipped.",
109
- icon="⚠️",
110
- )
111
- continue
112
-
113
- # Check for interval
114
- common_freq = common_freq = (
115
- pd.Series(df["date"].unique()).diff().dt.days.dropna().mode()[0]
116
- )
117
- # Calculate the data interval (daily, weekly, monthly or irregular)
118
- interval = determine_data_interval(common_freq)
119
- if interval == "irregular":
120
- st.warning(
121
- f"File Name: {file_name} ➜ Please upload data in daily, weekly or monthly interval. This file will be skipped.",
122
- icon="⚠️",
123
- )
124
- continue
125
-
126
- # Store both DataFrames in the dictionary under their respective keys
127
- df_dict[file_name] = {
128
- "numeric": numeric_cols,
129
- "non_numeric": non_numeric_cols,
130
- "interval": interval,
131
- "df": df,
132
- }
133
-
134
- return df_dict
135
-
136
-
137
- # Function to adjust dataframe granularity
138
- def adjust_dataframe_granularity(df, current_granularity, target_granularity):
139
- # Set index
140
- df.set_index("date", inplace=True)
141
-
142
- # Define aggregation rules for resampling
143
- aggregation_rules = {
144
- col: "sum" if pd.api.types.is_numeric_dtype(df[col]) else "first"
145
- for col in df.columns
146
- }
147
-
148
- # Initialize resampled_df
149
- resampled_df = df
150
- if current_granularity == "daily" and target_granularity == "weekly":
151
- resampled_df = df.resample("W-MON", closed="left", label="left").agg(
152
- aggregation_rules
153
- )
154
-
155
- elif current_granularity == "daily" and target_granularity == "monthly":
156
- resampled_df = df.resample("MS", closed="left", label="left").agg(
157
- aggregation_rules
158
- )
159
-
160
- elif current_granularity == "daily" and target_granularity == "daily":
161
- resampled_df = df.resample("D").agg(aggregation_rules)
162
-
163
- elif current_granularity in ["weekly", "monthly"] and target_granularity == "daily":
164
- # For higher to lower granularity, distribute numeric and replicate non-numeric values equally across the new period
165
- expanded_data = []
166
  for _, row in df.iterrows():
167
- if current_granularity == "weekly":
168
- period_range = pd.date_range(start=row.name, periods=7)
169
- elif current_granularity == "monthly":
170
- period_range = pd.date_range(
171
- start=row.name, periods=row.name.days_in_month
172
- )
173
-
174
- for date in period_range:
175
- new_row = {}
176
  for col in df.columns:
177
- if pd.api.types.is_numeric_dtype(df[col]):
178
- if current_granularity == "weekly":
179
- new_row[col] = row[col] / 7
180
- elif current_granularity == "monthly":
181
- new_row[col] = row[col] / row.name.days_in_month
182
- else:
183
- new_row[col] = row[col]
184
- expanded_data.append((date, new_row))
185
-
186
- resampled_df = pd.DataFrame(
187
- [data for _, data in expanded_data],
188
- index=[date for date, _ in expanded_data],
189
- )
190
 
191
- # Reset index
192
- resampled_df = resampled_df.reset_index().rename(columns={"index": "date"})
 
 
 
 
 
193
 
194
- return resampled_df
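For reference, the resampling in the removed adjust_dataframe_granularity helper hinges on an aggregation map that sums numeric columns and keeps the first value of anything else. A minimal, self-contained sketch of that pattern (toy column names, not from this repo):

```python
import pandas as pd

df = pd.DataFrame({
    "date": pd.date_range("2023-01-02", periods=14, freq="D"),
    "spend": [10.0] * 14,
    "region": ["north"] * 14,
}).set_index("date")

# Sum numeric columns, carry the first value of non-numeric ones,
# mirroring the aggregation_rules dict comprehension above
aggregation_rules = {
    col: "sum" if pd.api.types.is_numeric_dtype(df[col]) else "first"
    for col in df.columns
}
weekly = df.resample("W-MON", closed="left", label="left").agg(aggregation_rules)
print(weekly)  # weekly 'spend' totals with 'region' carried through
```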
 
 
 
195
 
 
196
 
197
- # Function to clean and extract unique values of Panel_1 and Panel_2
198
- st.cache_resource(show_spinner=False)
199
 
 
 
 
 
200
 
201
- def clean_and_extract_unique_values(files_dict, selections):
202
- all_panel1_values = set()
203
- all_panel2_values = set()
 
204
 
205
- for file_name, file_data in files_dict.items():
206
- df = file_data["df"]
 
 
 
 
 
207
 
208
- # 'Panel_1' and 'Panel_2' selections
209
- selected_panel1 = selections[file_name].get("Panel_1")
210
- selected_panel2 = selections[file_name].get("Panel_2")
211
 
212
- # Clean and standardize Panel_1 column if it exists and is selected
213
- if (
214
- selected_panel1
215
- and selected_panel1 != "N/A"
216
- and selected_panel1 in df.columns
217
- ):
218
- df[selected_panel1] = (
219
- df[selected_panel1].str.lower().str.strip().str.replace("_", " ")
220
- )
221
- all_panel1_values.update(df[selected_panel1].dropna().unique())
222
 
223
- # Clean and standardize Panel_2 column if it exists and is selected
224
- if (
225
- selected_panel2
226
- and selected_panel2 != "N/A"
227
- and selected_panel2 in df.columns
228
- ):
229
- df[selected_panel2] = (
230
- df[selected_panel2].str.lower().str.strip().str.replace("_", " ")
231
- )
232
- all_panel2_values.update(df[selected_panel2].dropna().unique())
233
 
234
- # Update the processed DataFrame back in the dictionary
235
- files_dict[file_name]["df"] = df
 
 
 
 
 
236
 
237
- return all_panel1_values, all_panel2_values
 
 
 
238
 
 
 
239
 
240
- # Function to format values for display
241
- st.cache_resource(show_spinner=False)
242
 
 
 
 
 
243
 
244
- def format_values_for_display(values_list):
245
- # Capitalize the first letter of each word and replace underscores with spaces
246
- formatted_list = [value.replace("_", " ").title() for value in values_list]
247
- # Join values with commas and 'and' before the last value
248
- if len(formatted_list) > 1:
249
- return ", ".join(formatted_list[:-1]) + ", and " + formatted_list[-1]
250
- elif formatted_list:
251
- return formatted_list[0]
252
- return "No values available"
253
 
 
254
 
255
- # Function to normalize all data within files_dict to a daily granularity
256
- st.cache(show_spinner=False, allow_output_mutation=True)
257
 
 
 
 
 
258
 
259
- def standardize_data_to_daily(files_dict, selections):
260
- # Normalize all data to a daily granularity using a provided function
261
- files_dict = apply_granularity_to_all(files_dict, "daily", selections)
 
 
 
 
262
 
263
- # Update the "interval" attribute for each dataset to indicate the new granularity
264
- for files_name, files_data in files_dict.items():
265
- files_data["interval"] = "daily"
266
 
267
- return files_dict
 
 
268
 
 
 
269
 
270
- # Function to apply granularity transformation to all DataFrames in files_dict
271
- st.cache_resource(show_spinner=False)
272
 
 
 
 
 
 
 
273
 
274
- def apply_granularity_to_all(files_dict, granularity_selection, selections):
275
- for file_name, file_data in files_dict.items():
276
- df = file_data["df"].copy()
 
277
 
278
- # Handling when Panel_1 or Panel_2 might be 'N/A'
279
- selected_panel1 = selections[file_name].get("Panel_1")
280
- selected_panel2 = selections[file_name].get("Panel_2")
281
 
282
- # Correcting the segment selection logic & handling 'N/A'
283
- if selected_panel1 != "N/A" and selected_panel2 != "N/A":
284
- unique_combinations = df[
285
- [selected_panel1, selected_panel2]
286
- ].drop_duplicates()
287
- elif selected_panel1 != "N/A":
288
- unique_combinations = df[[selected_panel1]].drop_duplicates()
289
- selected_panel2 = None # Ensure Panel_2 is ignored if N/A
290
- elif selected_panel2 != "N/A":
291
- unique_combinations = df[[selected_panel2]].drop_duplicates()
292
- selected_panel1 = None # Ensure Panel_1 is ignored if N/A
293
  else:
294
- # If both are 'N/A', process the entire dataframe as is
295
- df = adjust_dataframe_granularity(
296
- df, file_data["interval"], granularity_selection
297
- )
298
- files_dict[file_name]["df"] = df
299
- continue # Skip to the next file
300
-
301
- transformed_segments = []
302
- for _, combo in unique_combinations.iterrows():
303
- if selected_panel1 and selected_panel2:
304
- segment = df[
305
- (df[selected_panel1] == combo[selected_panel1])
306
- & (df[selected_panel2] == combo[selected_panel2])
307
- ]
308
- elif selected_panel1:
309
- segment = df[df[selected_panel1] == combo[selected_panel1]]
310
- elif selected_panel2:
311
- segment = df[df[selected_panel2] == combo[selected_panel2]]
312
-
313
- # Adjust granularity of the segment
314
- transformed_segment = adjust_dataframe_granularity(
315
- segment, file_data["interval"], granularity_selection
316
  )
317
- transformed_segments.append(transformed_segment)
318
-
319
- # Combine all transformed segments into a single DataFrame for this file
320
- transformed_df = pd.concat(transformed_segments, ignore_index=True)
321
- files_dict[file_name]["df"] = transformed_df
322
-
323
- return files_dict
324
-
325
-
326
- # Function to create main dataframe structure
327
- st.cache_resource(show_spinner=False)
328
-
329
-
330
- def create_main_dataframe(
331
- files_dict, all_panel1_values, all_panel2_values, granularity_selection
332
- ):
333
- # Determine the global start and end dates across all DataFrames
334
- global_start = min(df["df"]["date"].min() for df in files_dict.values())
335
- global_end = max(df["df"]["date"].max() for df in files_dict.values())
336
-
337
- # Adjust the date_range generation based on the granularity_selection
338
- if granularity_selection == "weekly":
339
- # Generate a weekly range, with weeks starting on Monday
340
- date_range = pd.date_range(start=global_start, end=global_end, freq="W-MON")
341
- elif granularity_selection == "monthly":
342
- # Generate a monthly range, starting from the first day of each month
343
- date_range = pd.date_range(start=global_start, end=global_end, freq="MS")
344
- else: # Default to daily if not weekly or monthly
345
- date_range = pd.date_range(start=global_start, end=global_end, freq="D")
346
-
347
- # Collect all unique Panel_1 and Panel_2 values, excluding 'N/A'
348
- all_panel1s = all_panel1_values
349
- all_panel2s = all_panel2_values
350
-
351
- # Dynamically build the list of dimensions (Panel_1, Panel_2) to include in the main DataFrame based on availability
352
- dimensions, merge_keys = [], []
353
- if all_panel1s:
354
- dimensions.append(all_panel1s)
355
- merge_keys.append("Panel_1")
356
- if all_panel2s:
357
- dimensions.append(all_panel2s)
358
- merge_keys.append("Panel_2")
359
-
360
- dimensions.append(date_range) # Date range is always included
361
- merge_keys.append("date") # Date range is always included
362
-
363
- # Create a main DataFrame template with the dimensions
364
- main_df = pd.MultiIndex.from_product(
365
- dimensions,
366
- names=[name for name, _ in zip(merge_keys, dimensions)],
367
- ).to_frame(index=False)
368
-
369
- return main_df.reset_index(drop=True)
370
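The main-frame template built by create_main_dataframe is simply the cross product of the selected panel values and the date range. A small illustration with made-up panel names:

```python
import pandas as pd

panels = ["north", "south"]                        # hypothetical Panel_1 values
dates = pd.date_range("2023-01-01", periods=3, freq="D")

# Cross product of panel values and dates, as in create_main_dataframe,
# before each uploaded file is merged onto this template
main_df = pd.MultiIndex.from_product(
    [panels, dates], names=["Panel_1", "date"]
).to_frame(index=False)

print(main_df)  # 2 panels x 3 dates = 6 rows
```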
-
371
-
372
- # Function to prepare and merge dataFrames
373
- st.cache_resource(show_spinner=False)
374
-
375
-
376
- def merge_into_main_df(main_df, files_dict, selections):
377
- for file_name, file_data in files_dict.items():
378
- df = file_data["df"].copy()
379
-
380
- # Rename selected Panel_1 and Panel_2 columns if not 'N/A'
381
- selected_panel1 = selections[file_name].get("Panel_1", "N/A")
382
- selected_panel2 = selections[file_name].get("Panel_2", "N/A")
383
- if selected_panel1 != "N/A":
384
- df.rename(columns={selected_panel1: "Panel_1"}, inplace=True)
385
- if selected_panel2 != "N/A":
386
- df.rename(columns={selected_panel2: "Panel_2"}, inplace=True)
387
-
388
- # Merge current DataFrame into main_df based on 'date', and where applicable, 'Panel_1' and 'Panel_2'
389
- merge_keys = ["date"]
390
- if "Panel_1" in df.columns:
391
- merge_keys.append("Panel_1")
392
- if "Panel_2" in df.columns:
393
- merge_keys.append("Panel_2")
394
- main_df = pd.merge(main_df, df, on=merge_keys, how="left")
395
-
396
- # After all merges, sort by 'date' and reset index for cleanliness
397
- sort_by = ["date"]
398
- if "Panel_1" in main_df.columns:
399
- sort_by.append("Panel_1")
400
- if "Panel_2" in main_df.columns:
401
- sort_by.append("Panel_2")
402
- main_df.sort_values(by=sort_by, inplace=True)
403
- main_df.reset_index(drop=True, inplace=True)
404
-
405
- return main_df
406
-
407
-
408
- # Function to categorize column
409
- def categorize_column(column_name):
410
- # Define keywords for each category
411
- internal_keywords = [
412
- "Price",
413
- "Discount",
414
- "product_price",
415
- "cost",
416
- "margin",
417
- "inventory",
418
- "sales",
419
- "revenue",
420
- "turnover",
421
- "expense",
422
- ]
423
- exogenous_keywords = [
424
- "GDP",
425
- "Tax",
426
- "Inflation",
427
- "interest_rate",
428
- "employment_rate",
429
- "exchange_rate",
430
- "consumer_spending",
431
- "retail_sales",
432
- "oil_prices",
433
- "weather",
434
- ]
435
-
436
- # Check if the column name matches any of the keywords for Internal or Exogenous categories
437
- for keyword in internal_keywords:
438
- if keyword.lower() in column_name.lower():
439
- return "Internal"
440
- for keyword in exogenous_keywords:
441
- if keyword.lower() in column_name.lower():
442
- return "Exogenous"
443
-
444
- # Default to Media if no match found
445
- return "Media"
446
 
447
-
448
- # Function to calculate missing stats and prepare for editable DataFrame
449
- st.cache_resource(show_spinner=False)
450
 
451
 
 
452
  def prepare_missing_stats_df(df):
453
  missing_stats = []
454
  for column in df.columns:
455
  if (
456
- column == "date" or column == "Panel_2" or column == "Panel_1"
457
- ): # Skip Date, Panel_1 and Panel_2 column
458
  continue
459
 
460
  missing = df[column].isnull().sum()
461
  pct_missing = round((missing / len(df)) * 100, 2)
462
-
463
- # Dynamically assign category based on column name
464
- category = categorize_column(column)
465
- # category = "Media" # Keep default bin as Media
466
-
467
  missing_stats.append(
468
  {
469
  "Column": column,
470
  "Missing Values": missing,
471
  "Missing Percentage": pct_missing,
472
  "Impute Method": "Fill with 0", # Default value
473
- "Category": category,
474
  }
475
  )
476
  stats_df = pd.DataFrame(missing_stats)
477
-
478
  return stats_df
479
 
480
 
481
- # Function to add API DataFrame details to the files dictionary
482
- st.cache_resource(show_spinner=False)
483
-
484
-
485
- def add_api_dataframe_to_dict(main_df, files_dict):
486
- files_dict["API"] = {
487
- "numeric": list(main_df.select_dtypes(include=["number"]).columns),
488
- "non_numeric": [
489
- col
490
- for col in main_df.select_dtypes(exclude=["number"]).columns
491
- if col.lower() != "date"
492
- ],
493
- "interval": determine_data_interval(
494
- pd.Series(main_df["date"].unique()).diff().dt.days.dropna().mode()[0]
495
- ),
496
- "df": main_df,
497
- }
498
-
499
- return files_dict
500
-
501
-
502
- # Function to read API data into a DataFrame, parsing specified columns as datetime
503
- @st.cache_resource(show_spinner=False)
504
- def read_API_data():
505
- return pd.read_excel("upf_data_converted_randomized_resp_metrics.xlsx", parse_dates=["Date"])
506
-
507
-
508
- # Function to set the 'Panel_1_Panel_2_Selected' session state variable to False
509
- def set_Panel_1_Panel_2_Selected_false():
510
- st.session_state["Panel_1_Panel_2_Selected"] = False
511
-
512
-
513
- # Function to serialize and save the objects into a pickle file
514
- @st.cache_resource(show_spinner=False)
515
- def save_to_pickle(file_path, final_df, bin_dict):
516
- # Open the file in write-binary mode and dump the objects
517
- with open(file_path, "wb") as f:
518
- pickle.dump({"final_df": final_df, "bin_dict": bin_dict}, f)
519
- # Data is now saved to file
520
-
521
-
522
- # Function to processes the merged_df DataFrame based on operations defined in edited_df
523
- @st.cache_resource(show_spinner=False)
524
- def process_dataframes(merged_df, edited_df, edited_stats_df):
525
- # Ensure there are operations defined by the user
526
- if edited_df.empty:
527
- return merged_df, edited_stats_df # No operations to apply
528
-
529
- # Perform operations as defined by the user
530
- for index, row in edited_df.iterrows():
531
- result_column_name = f"{row['Column 1']}{row['Operator']}{row['Column 2']}"
532
- col1 = row["Column 1"]
533
- col2 = row["Column 2"]
534
- op = row["Operator"]
535
-
536
- # Apply the specified operation
537
- if op == "+":
538
- merged_df[result_column_name] = merged_df[col1] + merged_df[col2]
539
- elif op == "-":
540
- merged_df[result_column_name] = merged_df[col1] - merged_df[col2]
541
- elif op == "*":
542
- merged_df[result_column_name] = merged_df[col1] * merged_df[col2]
543
- elif op == "/":
544
- merged_df[result_column_name] = merged_df[col1] / merged_df[col2].replace(
545
- 0, 1e-9
546
- )
547
-
548
- # Add summary of operation to edited_stats_df
549
- new_row = {
550
- "Column": result_column_name,
551
- "Missing Values": None,
552
- "Missing Percentage": None,
553
- "Impute Method": None,
554
- "Category": row["Category"],
555
- }
556
- new_row_df = pd.DataFrame([new_row])
557
-
558
- # Use pd.concat to add the new_row_df to edited_stats_df
559
- edited_stats_df = pd.concat(
560
- [edited_stats_df, new_row_df], ignore_index=True, axis=0
561
- )
562
-
563
- # Combine column names from edited_df for cleanup
564
- combined_columns = set(edited_df["Column 1"]).union(set(edited_df["Column 2"]))
565
-
566
- # Filter out rows in edited_stats_df and drop columns from merged_df
567
- edited_stats_df = edited_stats_df[~edited_stats_df["Column"].isin(combined_columns)]
568
- merged_df.drop(columns=list(combined_columns), errors="ignore", inplace=True)
569
-
570
- return merged_df, edited_stats_df
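A compact sketch of the derived-column step performed by process_dataframes, using hypothetical column names: the new column is named "<col1><op><col2>" and the source columns are dropped afterwards.

```python
import pandas as pd

merged_df = pd.DataFrame({"tv_spend": [1.0, 2.0], "radio_spend": [3.0, 4.0]})
ops = [{"Column 1": "tv_spend", "Operator": "+", "Column 2": "radio_spend"}]

# Build each derived column, then drop the columns it consumed
for row in ops:
    name = f"{row['Column 1']}{row['Operator']}{row['Column 2']}"
    merged_df[name] = merged_df[row["Column 1"]] + merged_df[row["Column 2"]]

used = {r["Column 1"] for r in ops} | {r["Column 2"] for r in ops}
merged_df.drop(columns=list(used), errors="ignore", inplace=True)
print(merged_df)  # single column: tv_spend+radio_spend
```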
571
-
572
-
573
- # Function to prepare a list of numeric column names and initialize an empty DataFrame with predefined structure
574
- st.cache_resource(show_spinner=False)
575
-
576
-
577
- def prepare_numeric_columns_and_default_df(merged_df, edited_stats_df):
578
- # Get columns categorized as 'Response Metrics'
579
- columns_response_metrics = edited_stats_df[
580
- edited_stats_df["Category"] == "Response Metrics"
581
- ]["Column"].tolist()
582
-
583
- # Filter numeric columns, excluding those categorized as 'Response Metrics'
584
- numeric_columns = [
585
- col
586
- for col in merged_df.select_dtypes(include=["number"]).columns
587
- if col not in columns_response_metrics
588
- ]
589
-
590
- # Define the structure of the empty DataFrame
591
- data = {
592
- "Column 1": pd.Series([], dtype="str"),
593
- "Operator": pd.Series([], dtype="str"),
594
- "Column 2": pd.Series([], dtype="str"),
595
- "Category": pd.Series([], dtype="str"),
596
- }
597
- default_df = pd.DataFrame(data)
598
-
599
- return numeric_columns, default_df
600
-
601
-
602
- # Initialize 'final_df' in session state
603
- if "final_df" not in st.session_state:
604
- st.session_state["final_df"] = pd.DataFrame()
605
-
606
- # Initialize 'bin_dict' in session state
607
- if "bin_dict" not in st.session_state:
608
- st.session_state["bin_dict"] = {}
609
-
610
- # Initialize 'Panel_1_Panel_2_Selected' in session state
611
- if "Panel_1_Panel_2_Selected" not in st.session_state:
612
- st.session_state["Panel_1_Panel_2_Selected"] = False
613
-
614
-
615
- # Page Title
616
- st.write("") # Top padding
617
- st.title("Data Import")
618
-
619
-
620
- #########################################################################################################################################################
621
- # Create a dictionary to hold all DataFrames and collect user input to specify "Panel_2" and "Panel_1" columns for each file
622
- #########################################################################################################################################################
623
-
624
-
625
- # Read the Excel file, parsing 'Date' column as datetime
626
- main_df = read_API_data()
627
-
628
- # Convert all column names to lowercase
629
- main_df.columns = main_df.columns.str.lower().str.strip()
630
-
631
- # File uploader
632
- uploaded_files = st.file_uploader(
633
- "Upload additional data",
634
- type=["xlsx"],
635
- accept_multiple_files=True,
636
- on_change=set_Panel_1_Panel_2_Selected_false,
637
- )
638
-
639
- # Custom HTML for upload instructions
640
- recommendation_html = f"""
641
- <div style="text-align: justify;">
642
- <strong>Recommendation:</strong> For optimal processing, please ensure that all uploaded datasets including panel, media, internal, and exogenous data adhere to the following guidelines: Each dataset must include a <code>Date</code> column formatted as <code>DD-MM-YYYY</code>, be free of missing values.
643
- </div>
644
- """
645
- st.markdown(recommendation_html, unsafe_allow_html=True)
646
-
647
- # Choose Desired Granularity
648
- st.markdown("#### Choose Desired Granularity")
649
- # Granularity Selection
650
- granularity_selection = st.selectbox(
651
- "Choose Date Granularity",
652
- ["Daily", "Weekly", "Monthly"],
653
- label_visibility="collapsed",
654
- on_change=set_Panel_1_Panel_2_Selected_false,
655
- )
656
- granularity_selection = str(granularity_selection).lower()
657
-
658
- # Convert files to dataframes
659
- files_dict = files_to_dataframes(uploaded_files)
660
-
661
- # Add API Dataframe
662
- if main_df is not None:
663
- files_dict = add_api_dataframe_to_dict(main_df, files_dict)
664
-
665
- # Display a warning message if no files have been uploaded and halt further execution
666
- if not files_dict:
667
- st.warning(
668
- "Please upload at least one file to proceed.",
669
- icon="⚠️",
670
- )
671
- st.stop() # Halts further execution until file is uploaded
672
-
673
-
674
- # Select Panel_1 and Panel_2 columns
675
- st.markdown("#### Select Panel columns")
676
- selections = {}
677
- with st.expander("Select Panel columns", expanded=False):
678
- count = 0 # Initialize counter to manage the visibility of labels and keys
679
- for file_name, file_data in files_dict.items():
680
- # Determine visibility of the label based on the count
681
- if count == 0:
682
- label_visibility = "visible"
683
- else:
684
- label_visibility = "collapsed"
685
-
686
- # Extract non-numeric columns
687
- non_numeric_cols = file_data["non_numeric"]
688
-
689
- # Prepare Panel_1 and Panel_2 values for dropdown, adding "N/A" as an option
690
- panel1_values = non_numeric_cols + ["N/A"]
691
- panel2_values = non_numeric_cols + ["N/A"]
692
-
693
- # Skip if only one option is available
694
- if len(panel1_values) == 1 and len(panel2_values) == 1:
695
- selected_panel1, selected_panel2 = "N/A", "N/A"
696
- # Update the selections for Panel_1 and Panel_2 for the current file
697
- selections[file_name] = {
698
- "Panel_1": selected_panel1,
699
- "Panel_2": selected_panel2,
700
- }
701
- continue
702
-
703
- # Create layout columns for File Name, Panel_2, and Panel_1 selections
704
- file_name_col, Panel_1_col, Panel_2_col = st.columns([2, 4, 4])
705
-
706
- with file_name_col:
707
- # Display "File Name" label only for the first file
708
- if count == 0:
709
- st.write("File Name")
710
- else:
711
- st.write("")
712
- st.write(file_name) # Display the file name
713
-
714
- with Panel_1_col:
715
- # Display a selectbox for Panel_1 values
716
- selected_panel1 = st.selectbox(
717
- "Select Panel Level 1",
718
- panel2_values,
719
- on_change=set_Panel_1_Panel_2_Selected_false,
720
- label_visibility=label_visibility, # Control visibility of the label
721
- key=f"Panel_1_selectbox{count}", # Ensure unique key for each selectbox
722
- )
723
-
724
- with Panel_2_col:
725
- # Display a selectbox for Panel_2 values
726
- selected_panel2 = st.selectbox(
727
- "Select Panel Level 2",
728
- panel1_values,
729
- on_change=set_Panel_1_Panel_2_Selected_false,
730
- label_visibility=label_visibility, # Control visibility of the label
731
- key=f"Panel_2_selectbox{count}", # Ensure unique key for each selectbox
732
- )
733
-
734
- # Skip processing if the same column is selected for both Panel_1 and Panel_2 due to potential data integrity issues
735
- if selected_panel2 == selected_panel1 and not (
736
- selected_panel2 == "N/A" and selected_panel1 == "N/A"
737
- ):
738
- st.warning(
739
- f"File: {file_name} → The same column cannot serve as both Panel_1 and Panel_2. Please adjust your selections.",
740
- )
741
- selected_panel1, selected_panel2 = "N/A", "N/A"
742
- st.stop()
743
-
744
- # Update the selections for Panel_1 and Panel_2 for the current file
745
- selections[file_name] = {
746
- "Panel_1": selected_panel1,
747
- "Panel_2": selected_panel2,
748
- }
749
-
750
- count += 1 # Increment the counter after processing each file
751
-
752
- # Accept Panel_1 and Panel_2 selection
753
- if st.button("Accept and Process", use_container_width=True):
754
-
755
- # Normalize all data to a daily granularity. This initial standardization simplifies subsequent conversions to other levels of granularity
756
- with st.spinner("Processing..."):
757
- files_dict = standardize_data_to_daily(files_dict, selections)
758
-
759
- # Convert all data to daily level granularity
760
- files_dict = apply_granularity_to_all(
761
- files_dict, granularity_selection, selections
762
- )
763
-
764
- # Update the 'files_dict' in the session state
765
- st.session_state["files_dict"] = files_dict
766
-
767
- # Set a flag in the session state to indicate that selection has been made
768
- st.session_state["Panel_1_Panel_2_Selected"] = True
769
-
770
-
771
- #########################################################################################################################################################
772
- # Display unique Panel_1 and Panel_2 values
773
- #########################################################################################################################################################
774
-
775
-
776
- # Halts further execution until Panel_1 and Panel_2 columns are selected
777
- if "files_dict" in st.session_state and st.session_state["Panel_1_Panel_2_Selected"]:
778
- files_dict = st.session_state["files_dict"]
779
- else:
780
- st.stop()
781
-
782
- # Set to store unique values of Panel_1 and Panel_2
783
- with st.spinner("Fetching Panel values..."):
784
- all_panel1_values, all_panel2_values = clean_and_extract_unique_values(
785
- files_dict, selections
786
- )
787
-
788
- # List of Panel_1 and Panel_2 columns unique values
789
- list_of_all_panel1_values = list(all_panel1_values)
790
- list_of_all_panel2_values = list(all_panel2_values)
791
-
792
- # Format Panel_1 and Panel_2 values for display
793
- formatted_panel1_values = format_values_for_display(list_of_all_panel1_values)
794
- formatted_panel2_values = format_values_for_display(list_of_all_panel2_values)
795
-
796
- # Unique Panel_1 and Panel_2 values
797
- st.markdown("#### Unique Panel values")
798
- # Display Panel_1 and Panel_2 values
799
- with st.expander("Unique Panel values"):
800
- st.write("")
801
- st.markdown(
802
- f"""
803
- <style>
804
- .justify-text {{
805
- text-align: justify;
806
- }}
807
- </style>
808
- <div class="justify-text">
809
- <strong>Panel Level 1 Values:</strong> {formatted_panel1_values}<br>
810
- <strong>Panel Level 2 Values:</strong> {formatted_panel2_values}
811
- </div>
812
- """,
813
- unsafe_allow_html=True,
814
- )
815
-
816
- # Display total Panel_1 and Panel_2
817
- st.write("")
818
- st.markdown(
819
- f"""
820
- <div style="text-align: justify;">
821
- <strong>Number of Level 1 Panels detected:</strong> {len(list_of_all_panel1_values)}<br>
822
- <strong>Number of Level 2 Panels detected:</strong> {len(list_of_all_panel2_values)}
823
- </div>
824
- """,
825
- unsafe_allow_html=True,
826
- )
827
- st.write("")
828
-
829
-
830
- #########################################################################################################################################################
831
- # Merge all DataFrames
832
- #########################################################################################################################################################
833
-
834
-
835
- # Merge all DataFrames selected
836
- main_df = create_main_dataframe(
837
- files_dict, all_panel1_values, all_panel2_values, granularity_selection
838
- )
839
- merged_df = merge_into_main_df(main_df, files_dict, selections)
840
-
841
-
842
- #########################################################################################################################################################
843
- # Categorize Variables and Impute Missing Values
844
- #########################################################################################################################################################
845
-
846
 
847
  # Create an editable DataFrame in Streamlit
848
  st.markdown("#### Select Variables Category & Impute Missing Values")
849
 
850
- # Prepare missing stats DataFrame for editing
851
- missing_stats_df = prepare_missing_stats_df(merged_df)
852
-
853
  edited_stats_df = st.data_editor(
854
  missing_stats_df,
855
  column_config={
@@ -865,10 +296,12 @@ if auth_status == True:
865
  ),
866
  "Category": st.column_config.SelectboxColumn(
867
  options=[
 
868
  "Media",
869
  "Exogenous",
870
  "Internal",
871
- "Response Metrics",
 
872
  ],
873
  required=True,
874
  default="Media",
@@ -879,84 +312,31 @@ if auth_status == True:
879
  use_container_width=True,
880
  )
881
 
 
882
  # Apply changes based on edited DataFrame
883
  for i, row in edited_stats_df.iterrows():
884
  column = row["Column"]
885
  if row["Impute Method"] == "Drop Column":
886
- merged_df.drop(columns=[column], inplace=True)
887
 
888
  elif row["Impute Method"] == "Fill with Mean":
889
- merged_df[column].fillna(merged_df[column].mean(), inplace=True)
890
 
891
  elif row["Impute Method"] == "Fill with Median":
892
- merged_df[column].fillna(merged_df[column].median(), inplace=True)
893
 
894
  elif row["Impute Method"] == "Fill with 0":
895
- merged_df[column].fillna(0, inplace=True)
896
-
897
-
898
- #########################################################################################################################################################
899
- # Group columns
900
- #########################################################################################################################################################
901
 
902
 
903
- # Display Group columns header
904
- st.markdown("#### Feature engineering")
905
 
906
- # Prepare the numeric columns and an empty DataFrame for user input
907
- numeric_columns, default_df = prepare_numeric_columns_and_default_df(
908
- merged_df, edited_stats_df
909
- )
910
-
911
- # Display editable Dataframe
912
- edited_df = st.data_editor(
913
- default_df,
914
- column_config={
915
- "Column 1": st.column_config.SelectboxColumn(
916
- options=numeric_columns,
917
- required=True,
918
- default=numeric_columns[0],
919
- width=400,
920
- ),
921
- "Operator": st.column_config.SelectboxColumn(
922
- options=["+", "-", "*", "/"],
923
- required=True,
924
- default="+",
925
- width=100,
926
- ),
927
- "Column 2": st.column_config.SelectboxColumn(
928
- options=numeric_columns,
929
- required=True,
930
- default=numeric_columns[0],
931
- width=400,
932
- ),
933
- "Category": st.column_config.SelectboxColumn(
934
- options=[
935
- "Media",
936
- "Exogenous",
937
- "Internal",
938
- "Response Metrics",
939
- ],
940
- required=True,
941
- default="Media",
942
- width=200,
943
- ),
944
- },
945
- num_rows="dynamic",
946
- )
947
-
948
- # Process the DataFrame based on user inputs and operations specified in edited_df
949
- final_df, edited_stats_df = process_dataframes(merged_df, edited_df, edited_stats_df)
950
-
951
-
952
- #########################################################################################################################################################
953
- # Display the Final DataFrame and variables
954
- #########################################################################################################################################################
955
-
956
-
957
- # Display the Final DataFrame and variables
958
- st.markdown("#### Final DataFrame")
959
- st.dataframe(final_df, hide_index=True)
960
 
961
  # Initialize an empty dictionary to hold categories and their variables
962
  category_dict = {}
@@ -974,15 +354,8 @@ if auth_status == True:
974
  # If it exists, append the current column to the list of variables under this category
975
  category_dict[category].append(column)
976
 
977
- # Add Date, Panel_1 and Panel_12 in category dictionary
978
- category_dict.update({"Date": ["date"]})
979
- if "Panel_1" in final_df.columns:
980
- category_dict["Panel Level 1"] = ["Panel_1"]
981
- if "Panel_2" in final_df.columns:
982
- category_dict["Panel Level 2"] = ["Panel_2"]
983
-
984
  # Display the dictionary
985
- st.markdown("#### Variable Category")
986
  for category, variables in category_dict.items():
987
  # Check if there are multiple variables to handle "and" insertion correctly
988
  if len(variables) > 1:
@@ -993,27 +366,19 @@ if auth_status == True:
993
  variables_str = variables[0]
994
 
995
  # Display the category and its variables in the desired format
996
- st.markdown(
997
- f"<div style='text-align: justify;'><strong>{category}:</strong> {variables_str}</div>",
998
- unsafe_allow_html=True,
999
- )
1000
-
1001
- # Function to check if Response Metrics is selected
1002
- st.write("")
1003
- response_metrics_col = category_dict.get("Response Metrics", [])
1004
- if len(response_metrics_col) == 0:
1005
- st.warning("Please select Response Metrics column", icon="⚠️")
1006
- st.stop()
1007
- # elif len(response_metrics_col) > 1:
1008
- # st.warning("Please select only one Response Metrics column", icon="⚠️")
1009
- # st.stop()
1010
-
1011
- # Store final dataframe and bin dictionary into session state
1012
- st.session_state["final_df"], st.session_state["bin_dict"] = final_df, category_dict
1013
-
1014
- # Save the DataFrame and dictionary from the session state to the pickle file
1015
- if st.button("Accept and Save", use_container_width=True):
1016
- save_to_pickle(
1017
- "data_import.pkl", st.session_state["final_df"], st.session_state["bin_dict"]
1018
- )
1019
- st.toast("💾 Saved Successfully!")
 
1
  # Importing necessary libraries
2
  import streamlit as st
3
+ import pickle
4
 
5
  st.set_page_config(
6
+ page_title="Model Build",
7
  page_icon=":shark:",
8
  layout="wide",
9
  initial_sidebar_state="collapsed",
10
  )
11
 
12
+ from utilities import load_authenticator
13
+ import numpy as np
14
  import pandas as pd
15
  from utilities import set_header, load_local_css
 
 
 
16
 
17
  load_local_css("styles.css")
18
  set_header()
19
 
20
 
21
  for k, v in st.session_state.items():
22
+ if k not in ['logout', 'login','config'] and not k.startswith('FormSubmitter'):
 
 
23
  st.session_state[k] = v
24
 
25
+ authenticator = st.session_state.get('authenticator')
26
+ if authenticator is None:
27
+ authenticator = load_authenticator()
28
+
29
+ name, authentication_status, username = authenticator.login('Login', 'main')
30
+ auth_status = st.session_state.get('authentication_status')
31
 
32
+ if auth_status == True:
33
+ is_state_initiaized = st.session_state.get('initialized',False)
34
  if not is_state_initiaized:
35
+ a=1
36
+
37
+
38
+ # Function to expand dataframe to daily
39
+ @st.cache_resource(show_spinner=False)
40
+ def expand_to_daily(df, granularity, start_date, end_date):
41
+ # Create a new DataFrame with a row for each day
42
+ all_dates = pd.date_range(start=start_date, end=end_date, freq="D")
43
+ daily_df = pd.DataFrame(all_dates, columns=["Date"])
44
+
45
+ if granularity == "daily":
46
+ # For daily data, simply merge to fill missing dates
47
+ daily_df = daily_df.merge(df, on="Date", how="left")
48
+ else:
49
+ # For weekly or monthly, distribute values to daily rows
50
+ for column in df.columns:
51
+ if column != "Date": # Skip 'Date' column
52
+ daily_df[column] = np.nan # Initialize with NaNs
53
+
54
+ # Group by the required frequency and distribute values
55
+ freq = "W-MON" if granularity == "weekly" else "MS"
56
+ for _, group in df.groupby(pd.Grouper(key="Date", freq=freq)):
57
+ num_days = len(
58
+ pd.date_range(group["Date"].min(), group["Date"].max(), freq="D")
59
+ )
60
+ for column in group.columns:
61
+ if column == "Date": # Skip 'Date' column
62
+ continue
63
+ value = group[column].sum() / num_days
64
+ date_range = pd.date_range(
65
+ group["Date"].min(), periods=num_days, freq="D"
66
+ )
67
+ daily_df.loc[daily_df["Date"].isin(date_range), column] = value
68
 
69
+ return daily_df
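expand_to_daily spreads each weekly or monthly total evenly across the days it covers. A toy illustration of that redistribution (hypothetical spend figures):

```python
import pandas as pd

# Hypothetical weekly spend, weeks starting on Monday
weekly = pd.DataFrame({
    "Date": pd.to_datetime(["2023-01-02", "2023-01-09"]),
    "tv_spend": [700.0, 1400.0],
})

# Spread each weekly total evenly over its seven days, as expand_to_daily does
daily_rows = []
for _, row in weekly.iterrows():
    for day in pd.date_range(row["Date"], periods=7, freq="D"):
        daily_rows.append({"Date": day, "tv_spend": row["tv_spend"] / 7})

daily = pd.DataFrame(daily_rows)
print(daily.head(8))  # 100.0 per day in week 1, 200.0 per day in week 2
```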
70
 
71
+
72
+ # Function to validate date column in dataframe
73
  def validate_date_column(df):
74
  try:
75
  # Attempt to convert the 'Date' column to datetime
76
+ df["Date"] = pd.to_datetime(df["Date"], format="%d-%m-%Y")
77
  return True
78
  except:
79
  return False
 
91
  return "irregular"
92
 
93
 
94
+ # Function to convert and fill dates in dataframe
95
+ def convert_and_fill_dates(df, start_date, end_date, interval):
96
+ # Create a date range for the desired period
97
+ all_dates = pd.date_range(start=start_date, end=end_date, freq="D")
98
+ new_df = pd.DataFrame(all_dates, columns=["Date"])
99
 
100
+ # Preprocess and aggregate data based on the original interval
101
+ if interval != "daily":
102
+ # Resample to start of each week/month, then sum values for the same period
103
+ if interval == "weekly":
104
+ df = df.resample("W-MON", on="Date").sum().reset_index()
105
+ elif interval == "monthly":
106
+ df = df.resample("MS", on="Date").sum().reset_index()
107
 
108
+ # Distribute values equally across the days in each week/month
109
+ expanded_rows = []
110
  for _, row in df.iterrows():
111
+ if interval == "weekly":
112
+ period_dates = pd.date_range(row["Date"], periods=7)
113
+ elif interval == "monthly":
114
+ period_end = row["Date"] + pd.offsets.MonthEnd(1)
115
+ period_dates = pd.date_range(row["Date"], period_end)
116
+
117
+ for date in period_dates:
118
+ new_row = row.copy()
119
+ new_row["Date"] = date
120
  for col in df.columns:
121
+ if col != "Date": # Skip 'Date' column
122
+ new_row[col] = row[col] / len(period_dates)
123
+ expanded_rows.append(new_row)
124
 
125
+ # Create a DataFrame from expanded rows
126
+ expanded_df = pd.DataFrame(expanded_rows)
127
+ new_df = pd.merge(new_df, expanded_df, how="left", on="Date")
128
+ else:
129
+ # Daily data, aggregate if there are multiple entries for the same day
130
+ df = df.groupby("Date").sum().reset_index()
131
+ new_df = pd.merge(new_df, df, how="left", on="Date")
132
 
133
+ # Ensure all dates from start to end are present, filling missing values with NaN
134
+ new_df["Date"] = pd.to_datetime(new_df["Date"]) # Ensure 'Date' is datetime type
135
+ new_df = new_df.set_index("Date").reindex(all_dates).reset_index()
136
+ new_df.rename(columns={"index": "Date"}, inplace=True)
137
 
138
+ return new_df
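The final reindex in convert_and_fill_dates is what guarantees one row per calendar day. A minimal sketch with a deliberately missing date:

```python
import pandas as pd

# Hypothetical daily data with a missing calendar day (2023-01-03)
df = pd.DataFrame({
    "Date": pd.to_datetime(["2023-01-01", "2023-01-02", "2023-01-04"]),
    "clicks": [10, 12, 9],
})
all_dates = pd.date_range("2023-01-01", "2023-01-05", freq="D")

# Reindex onto the full calendar, leaving NaN where no data exists
filled = (
    df.set_index("Date")
      .reindex(all_dates)
      .reset_index()
      .rename(columns={"index": "Date"})
)
print(filled)
```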
139
 
 
 
140
 
141
+ # Function to convert a DataFrame from daily level granularity to either weekly or monthly level
142
+ def convert_to_higher_granularity(df, required_granularity):
143
+ if required_granularity == "daily":
144
+ return df
145
 
146
+ # Ensure 'Date' is the index and is in datetime format
147
+ if not pd.api.types.is_datetime64_any_dtype(df["Date"]):
148
+ df["Date"] = pd.to_datetime(df["Date"])
149
+ df.set_index("Date", inplace=True)
150
 
151
+ # Resample and aggregate
152
+ if required_granularity == "weekly":
153
+ # Resample to weekly, using 'W-MON' to indicate weeks starting on Monday
154
+ df = df.resample("W-MON").sum()
155
+ elif required_granularity == "monthly":
156
+ # Resample to monthly, using 'MS' to indicate month start
157
+ df = df.resample("MS").sum()
158
 
159
+ # Reset index to move 'Date' back to a column
160
+ df.reset_index(inplace=True)
 
161
 
162
+ return df
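Rolling daily data back up in convert_to_higher_granularity is a plain resample-and-sum. For reference (note that pandas labels "W-MON" bins by the Monday on which they close):

```python
import pandas as pd

daily = pd.DataFrame({
    "Date": pd.date_range("2023-01-02", periods=14, freq="D"),
    "spend": [100.0] * 14,
})

# Weekly totals, as produced by convert_to_higher_granularity for "weekly"
weekly = daily.set_index("Date").resample("W-MON").sum().reset_index()
print(weekly)
```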
 
 
 
 
 
 
 
 
 
163
164
 
165
+ # Read the CSV file, parsing 'Date' column as datetime
166
+ main_df = pd.read_csv("Media_data_for_model_dma_level.csv", dayfirst=True, parse_dates=["Date"])
167
+ # st.write(main_df)
168
+
169
+ # Get the start date (minimum) and end date (maximum) from the 'Date' column
170
+ api_start_date = main_df["Date"].min()
171
+ api_end_date = main_df["Date"].max()
172
 
173
+ # Infer the granularity from the most common difference between consecutive dates
174
+ date_diffs = main_df["Date"].diff().dt.days.dropna()
175
+ common_diff = date_diffs.mode()[0]
176
+ api_granularity = determine_data_interval(common_diff)
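The interval is inferred from the modal gap between consecutive dates. Assuming the usual mapping of 1 day to daily, 7 to weekly and roughly 28-31 to monthly, a quick check of the inference looks like this:

```python
import pandas as pd

dates = pd.to_datetime(["2023-01-01", "2023-01-08", "2023-01-15", "2023-01-22"])
common_diff = pd.Series(dates).diff().dt.days.dropna().mode()[0]
print(common_diff)  # 7.0 -> would be classified as "weekly" by determine_data_interval
```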
177
 
178
+ # Convert the DataFrame to daily level granularity
179
+ main_df = expand_to_daily(main_df, api_granularity, api_start_date, api_end_date)
180
 
181
+ # Page Title
182
+ st.title("Data Import")
183
 
184
+ # File uploader
185
+ uploaded_files = st.file_uploader(
186
+ "Upload additional data", type=["xlsx"], accept_multiple_files=True
187
+ )
188
 
189
+ # Custom HTML for upload instructions
190
+ recommendation_html = f"""
191
+ <div style="text-align: justify;">
192
+ <strong>Recommendation:</strong> For optimal processing, please ensure that all uploaded datasets including media, internal, and exogenous data adhere to the following guidelines: Each dataset must include a <code>Date</code> column formatted as <code>DD-MM-YYYY</code>, be free of missing values, and aggregated to a {api_granularity} level.
193
+ </div>
194
+ """
 
 
 
195
 
196
+ st.markdown(recommendation_html, unsafe_allow_html=True)
197
 
198
+ # Initialize a list to collect all processed DataFrames
199
+ all_data_dfs = []
200
 
201
+ if uploaded_files:
202
+ for uploaded_file in uploaded_files:
203
+ # Extract the file name
204
+ file_name = uploaded_file.name
205
 
206
+ # Load the file into a DataFrame
207
+ data_df = pd.read_excel(
208
+ uploaded_file,
209
+ )
210
+
211
+ # Identify numeric columns in the DataFrame
212
+ numeric_columns = data_df.select_dtypes(include="number").columns.tolist()
213
 
214
+ # Validate the 'Date' column and ensure there's at least one numeric column
215
+ if validate_date_column(data_df) and len(numeric_columns) > 0:
216
+ data_df = data_df[["Date"] + numeric_columns]
217
 
218
+ # Ensure the 'Date' column is in datetime format and sorted
219
+ data_df["Date"] = pd.to_datetime(data_df["Date"], dayfirst=True)
220
+ data_df.sort_values("Date", inplace=True)
221
 
222
+ # Calculate the most common day difference between dates to determine frequency
223
+ common_freq = data_df["Date"].diff().dt.days.dropna().mode()[0]
224
 
225
+ # Calculate the data interval (daily, weekly, monthly or irregular)
226
+ interval = determine_data_interval(common_freq)
227
 
228
+ if interval == "irregular":
229
+ # Warn the user if the 'Date' column doesn't meet the format requirements
230
+ st.warning(
231
+ f"File Name: {file_name} ➜ Please upload data in daily, weekly or monthly interval."
232
+ )
233
+ continue
234
 
235
+ # Convert data to specified interval and redistribute to daily
236
+ data_df = convert_and_fill_dates(
237
+ data_df, api_start_date, api_end_date, interval
238
+ )
239
 
240
+ # Add the processed DataFrame to the list
241
+ all_data_dfs.append(data_df)
 
242
243
  else:
244
+ # Warn the user if the 'Date' column doesn't meet the format requirements
245
+ st.warning(
246
+ f"File Name: {file_name} ➜ Please upload data with Date column in 'DD-MM-YYYY' format and at least one media/exogenous column."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
  )
248
 
249
+ # Sequentially merge each of the other DataFrames with the main DataFrame on 'Date'
250
+ for df in all_data_dfs:
251
+ main_df = pd.merge(main_df, df, on="Date", how="left")
252
 
253
 
254
+ # Function to calculate missing stats and prepare for editable DataFrame
255
  def prepare_missing_stats_df(df):
256
  missing_stats = []
257
  for column in df.columns:
258
  if (
259
+ column == "Date" or column == "Total Approved Accounts - Revenue"
260
+ ): # Skip Date and Revenue column
261
  continue
262
 
263
  missing = df[column].isnull().sum()
264
  pct_missing = round((missing / len(df)) * 100, 2)
265
  missing_stats.append(
266
  {
267
  "Column": column,
268
  "Missing Values": missing,
269
  "Missing Percentage": pct_missing,
270
  "Impute Method": "Fill with 0", # Default value
271
+ "Category": "Media", # Default value
272
  }
273
  )
274
  stats_df = pd.DataFrame(missing_stats)
 
275
  return stats_df
276
 
277
 
278
+ # Prepare missing stats DataFrame for editing
279
+ missing_stats_df = prepare_missing_stats_df(main_df)
280
 
281
  # Create an editable DataFrame in Streamlit
282
  st.markdown("#### Select Variables Category & Impute Missing Values")
283
 
 
 
 
284
  edited_stats_df = st.data_editor(
285
  missing_stats_df,
286
  column_config={
 
296
  ),
297
  "Category": st.column_config.SelectboxColumn(
298
  options=[
299
+ "Date",
300
  "Media",
301
  "Exogenous",
302
  "Internal",
303
+ "DMA/Panel",
304
+ "Response_Metric"
305
  ],
306
  required=True,
307
  default="Media",
 
312
  use_container_width=True,
313
  )
314
 
315
+
316
  # Apply changes based on edited DataFrame
317
  for i, row in edited_stats_df.iterrows():
318
  column = row["Column"]
319
  if row["Impute Method"] == "Drop Column":
320
+ main_df.drop(columns=[column], inplace=True)
321
 
322
  elif row["Impute Method"] == "Fill with Mean":
323
+ main_df[column].fillna(main_df[column].mean(), inplace=True)
324
 
325
  elif row["Impute Method"] == "Fill with Median":
326
+ main_df[column].fillna(main_df[column].median(), inplace=True)
327
 
328
  elif row["Impute Method"] == "Fill with 0":
329
+ main_df[column].fillna(0, inplace=True)
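The impute loop above dispatches on the method chosen per column in the editable table. The same dispatch on a toy frame, with hypothetical choices:

```python
import pandas as pd

df = pd.DataFrame({"tv_spend": [100.0, None, 300.0], "gdp": [None, 2.0, 2.1]})
plan = {"tv_spend": "Fill with Mean", "gdp": "Fill with 0"}  # hypothetical selections

# Same per-column dispatch as the loop over edited_stats_df above
for column, method in plan.items():
    if method == "Drop Column":
        df.drop(columns=[column], inplace=True)
    elif method == "Fill with Mean":
        df[column] = df[column].fillna(df[column].mean())
    elif method == "Fill with Median":
        df[column] = df[column].fillna(df[column].median())
    elif method == "Fill with 0":
        df[column] = df[column].fillna(0)

print(df)  # tv_spend gap filled with 200.0, gdp gap filled with 0
```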
 
 
 
 
 
330
 
331
 
332
+ # Convert the Final DataFrame to required granularity
333
+ main_df = convert_to_higher_granularity(main_df, api_granularity)
334
 
335
+ # Display the Final DataFrame and exogenous variables
336
+ st.markdown("#### Final DataFrame:")
337
+ st.dataframe(main_df)
338
+
339
+
340
 
341
  # Initialize an empty dictionary to hold categories and their variables
342
  category_dict = {}
 
354
  # If it exists, append the current column to the list of variables under this category
355
  category_dict[category].append(column)
356
357
  # Display the dictionary
358
+ st.markdown("#### Variable Category:")
359
  for category, variables in category_dict.items():
360
  # Check if there are multiple variables to handle "and" insertion correctly
361
  if len(variables) > 1:
 
366
  variables_str = variables[0]
367
 
368
  # Display the category and its variables in the desired format
369
+ st.markdown(f"**{category}:** {variables_str}\n\n", unsafe_allow_html=True)
370
+
371
+ # Storing main_df and category_dict in session_state
372
+ # st.write(main_df)
373
+
374
+
375
+ # st.session_state['Cleaned_data']=main_df
376
+
377
+ # st.session_state['category_dict']=category_dict
378
+ if st.button('Save Changes'):
379
+
380
+ with open("Pickle_files/main_df", 'wb') as f:
381
+ pickle.dump(main_df, f)
382
+ with open("Pickle_files/category_dict",'wb') as c:
383
+ pickle.dump(category_dict,c)
384
+ st.success('Changes Saved!')
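The saved pickles can be re-loaded by later pages using the same relative paths (assuming they run from the app's working directory):

```python
import pickle

# Re-load the artefacts written by the "Save Changes" button above
with open("Pickle_files/main_df", "rb") as f:
    main_df = pickle.load(f)
with open("Pickle_files/category_dict", "rb") as c:
    category_dict = pickle.load(c)

print(main_df.shape, list(category_dict.keys()))
```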
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: indigo
5
  colorTo: pink
6
  sdk: streamlit
7
  sdk_version: 1.32.1
8
- app_file: Data_Import.py
9
  pinned: false
10
  ---
11
 
 
5
  colorTo: pink
6
  sdk: streamlit
7
  sdk_version: 1.32.1
8
+ app_file: app.py
9
  pinned: false
10
  ---
11
 
classes.py CHANGED
@@ -16,15 +16,21 @@ def class_to_dict(class_instance):
16
  attr_dict["modified_spends"] = class_instance.modified_spends
17
  attr_dict["modified_sales"] = class_instance.modified_sales
18
  attr_dict["response_curve_type"] = class_instance.response_curve_type
19
- attr_dict["response_curve_params"] = class_instance.response_curve_params
 
 
20
  attr_dict["penalty"] = class_instance.penalty
21
  attr_dict["bounds"] = class_instance.bounds
22
  attr_dict["actual_total_spends"] = class_instance.actual_total_spends
23
  attr_dict["actual_total_sales"] = class_instance.actual_total_sales
24
- attr_dict["modified_total_spends"] = class_instance.modified_total_spends
 
 
25
  attr_dict["modified_total_sales"] = class_instance.modified_total_sales
26
  attr_dict["actual_mroi"] = class_instance.get_marginal_roi("actual")
27
- attr_dict["modified_mroi"] = class_instance.get_marginal_roi("modified")
 
 
28
 
29
  elif isinstance(class_instance, Scenario):
30
  attr_dict["type"] = "Scenario"
@@ -37,7 +43,9 @@ def class_to_dict(class_instance):
37
  attr_dict["correction"] = class_instance.correction
38
  attr_dict["actual_total_spends"] = class_instance.actual_total_spends
39
  attr_dict["actual_total_sales"] = class_instance.actual_total_sales
40
- attr_dict["modified_total_spends"] = class_instance.modified_total_spends
 
 
41
  attr_dict["modified_total_sales"] = class_instance.modified_total_sales
42
 
43
  return attr_dict
@@ -87,7 +95,9 @@ class Channel:
87
  self.modified_sales = self.calculate_sales()
88
  self.modified_total_spends = self.modified_spends.sum()
89
  self.modified_total_sales = self.modified_sales.sum()
90
- self.delta_spends = self.modified_total_spends - self.actual_total_spends
 
 
91
  self.delta_sales = self.modified_total_sales - self.actual_total_sales
92
 
93
  def update_penalty(self, penalty):
@@ -109,7 +119,8 @@ class Channel:
109
  x = np.where(
110
  x < self.upper_limit,
111
  x,
112
- self.upper_limit + (x - self.upper_limit) * self.upper_limit / x,
 
113
  )
114
  if self.response_curve_type == "s-curve":
115
  if self.power >= 0:
@@ -158,7 +169,9 @@ class Channel:
158
  self.modified_sales = self.calculate_sales()
159
  self.modified_total_spends = self.modified_spends.sum()
160
  self.modified_total_sales = self.modified_sales.sum()
161
- self.delta_spends = self.modified_total_spends - self.actual_total_spends
 
 
162
  self.delta_sales = self.modified_total_sales - self.actual_total_sales
163
 
164
  def intialize(self):
@@ -195,7 +208,9 @@ class Scenario:
195
  self.actual_total_sales = self.calculate_actual_total_sales()
196
  self.modified_total_sales = self.calculate_modified_total_sales()
197
  self.modified_total_spends = self.calculate_modified_total_spends()
198
- self.delta_spends = self.modified_total_spends - self.actual_total_spends
 
 
199
  self.delta_sales = self.modified_total_sales - self.actual_total_sales
200
 
201
  def update_penalty(self, value):
@@ -205,7 +220,9 @@ class Scenario:
205
  def calculate_modified_total_spends(self):
206
  total_actual_spends = 0.0
207
  for channel in self.channels.values():
208
- total_actual_spends += channel.actual_total_spends * channel.conversion_rate
 
 
209
  return total_actual_spends
210
 
211
  def calculate_modified_total_spends(self):
@@ -234,47 +251,12 @@ class Scenario:
234
  self.channels[channel_name].update(modified_spends)
235
  self.modified_total_sales = self.calculate_modified_total_sales()
236
  self.modified_total_spends = self.calculate_modified_total_spends()
237
- self.delta_spends = self.modified_total_spends - self.actual_total_spends
 
 
238
  self.delta_sales = self.modified_total_sales - self.actual_total_sales
239
 
240
- # def optimize_spends(self, sales_percent, channels_list, algo="COBYLA"):
241
- # desired_sales = self.actual_total_sales * (1 + sales_percent / 100.0)
242
-
243
- # def constraint(x):
244
- # for ch, spends in zip(channels_list, x):
245
- # self.update(ch, spends)
246
- # return self.modified_total_sales - desired_sales
247
-
248
- # bounds = []
249
- # for ch in channels_list:
250
- # bounds.append(
251
- # (1 + np.array([-50.0, 100.0]) / 100.0)
252
- # * self.channels[ch].actual_total_spends
253
- # )
254
-
255
- # initial_point = []
256
- # for bound in bounds:
257
- # initial_point.append(bound[0])
258
-
259
- # power = np.ceil(np.log(sum(initial_point)) / np.log(10))
260
-
261
- # constraints = [NonlinearConstraint(constraint, -1.0, 1.0)]
262
-
263
- # res = minimize(
264
- # lambda x: sum(x) / 10 ** (power),
265
- # bounds=bounds,
266
- # x0=initial_point,
267
- # constraints=constraints,
268
- # method=algo,
269
- # options={"maxiter": int(2e7), "catol": 1},
270
- # )
271
-
272
- # for channel_name, modified_spends in zip(channels_list, res.x):
273
- # self.update(channel_name, modified_spends)
274
-
275
- # return zip(channels_list, res.x)
276
-
277
- def optimize_spends(self, sales_percent, channels_list, algo="trust-constr"):
278
  desired_sales = self.actual_total_sales * (1 + sales_percent / 100.0)
279
 
280
  def constraint(x):
@@ -303,7 +285,7 @@ class Scenario:
303
  x0=initial_point,
304
  constraints=constraints,
305
  method=algo,
306
- options={"maxiter": int(2e7), "xtol": 100},
307
  )
308
 
309
  for channel_name, modified_spends in zip(channels_list, res.x):
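classes.py drives optimize_spends with scipy's trust-constr solver: minimise total spend subject to hitting a sales target, within per-channel bounds. A self-contained, simplified sketch with a toy response curve and made-up channel values; the real response and bounds come from the fitted Channel objects, and the real constraint updates the Scenario as a side effect:

```python
import numpy as np
from scipy.optimize import minimize, NonlinearConstraint

actual_spends = np.array([1000.0, 2000.0])           # current spends per channel

def total_sales(x):                                   # toy diminishing-returns response
    return float(np.sum(50.0 * np.sqrt(x)))

desired_sales = total_sales(actual_spends) * 1.10     # target: +10% sales
constraints = [NonlinearConstraint(lambda x: total_sales(x) - desired_sales, -1.0, 1.0)]
bounds = [(0.5 * s, 2.0 * s) for s in actual_spends]  # allow -50% / +100% per channel

res = minimize(
    lambda x: np.sum(x),                              # minimise total spend
    x0=actual_spends,
    bounds=bounds,
    constraints=constraints,
    method="trust-constr",
)
print(res.x, total_sales(res.x))
```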
@@ -335,11 +317,14 @@ class Scenario:
335
  for channel_name in channels_list:
336
  _channel_class = self.channels[channel_name]
337
  channel_bounds = _channel_class.bounds
338
- channel_actual_total_spends = _channel_class.actual_total_spends * (
339
- (1 + spends_percent / 100)
 
340
  )
341
  old_spends.append(channel_actual_total_spends)
342
- bounds.append((1 + channel_bounds / 100) * channel_actual_total_spends)
 
 
343
 
344
  def objective_function(x):
345
  for channel_name, modified_spends in zip(channels_list, x):
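The bounds built in this hunk are percentage windows around each channel's budget-adjusted spend: `channel_bounds` holds offsets in percent (for example `np.array([-10, 10])`), so multiplying `(1 + channel_bounds / 100)` by the scaled spend yields the lower and upper spend limits. A tiny worked example with made-up numbers:

```python
import numpy as np

channel_bounds = np.array([-10.0, 10.0])   # percent window stored on the channel
spends_percent = 5.0                       # overall budget change applied first
actual_total_spends = 120_000.0

scaled_spend = actual_total_spends * (1 + spends_percent / 100)   # 126000.0
lower, upper = (1 + channel_bounds / 100) * scaled_spend          # roughly 113400 and 138600
print(scaled_spend, lower, upper)
```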
@@ -347,12 +332,12 @@ class Scenario:
347
  return -1 * self.modified_total_sales
348
 
349
  res = minimize(
350
- lambda x: objective_function(x) / 1e8,
351
  method="trust-constr",
352
  x0=old_spends,
353
  constraints=constraint,
354
  bounds=bounds,
355
- options={"maxiter": int(1e7), "xtol": 100},
356
  )
357
  # res = dual_annealing(
358
  # objective_function,
@@ -376,91 +361,81 @@ class Scenario:
376
  channel_data = []
377
 
378
  summary_rows = []
379
- actual_list.append(
380
- {
381
- "name": "Total",
382
- "Spends": self.actual_total_spends,
383
- "Sales": self.actual_total_sales,
384
- }
385
- )
386
- modified_list.append(
387
- {
388
- "name": "Total",
389
- "Spends": self.modified_total_spends,
390
- "Sales": self.modified_total_sales,
391
- }
392
- )
393
  for channel in self.channels.values():
394
  name_mod = channel.name.replace("_", " ")
395
  if name_mod.lower().endswith(" imp"):
396
  name_mod = name_mod.replace("Imp", " Impressions")
397
- summary_rows.append(
398
- [
399
- name_mod,
400
- channel.actual_total_spends,
401
- channel.modified_total_spends,
402
- channel.actual_total_sales,
403
- channel.modified_total_sales,
404
- round(channel.actual_total_sales / channel.actual_total_spends, 2),
405
- round(
406
- channel.modified_total_sales / channel.modified_total_spends,
407
- 2,
408
- ),
409
- channel.get_marginal_roi("actual"),
410
- channel.get_marginal_roi("modified"),
411
- ]
412
- )
 
413
  data[channel.name] = channel.modified_spends
414
  data["Date"] = channel.dates
415
  data["Sales"] = (
416
  data.get("Sales", np.zeros((len(channel.dates),)))
417
  + channel.modified_sales
418
  )
419
- actual_list.append(
420
- {
421
- "name": channel.name,
422
- "Spends": channel.actual_total_spends,
423
- "Sales": channel.actual_total_sales,
424
- "ROI": round(
425
- channel.actual_total_sales / channel.actual_total_spends, 2
426
- ),
427
- }
428
- )
429
- modified_list.append(
430
- {
431
- "name": channel.name,
432
- "Spends": channel.modified_total_spends,
433
- "Sales": channel.modified_total_sales,
434
- "ROI": round(
435
- channel.modified_total_sales / channel.modified_total_spends,
436
- 2,
437
- ),
438
- "Marginal ROI": channel.get_marginal_roi("modified"),
439
- }
440
- )
441
-
442
- channel_data.append(
443
- {
444
- "channel": channel.name,
445
- "spends_act": channel.actual_total_spends,
446
- "spends_mod": channel.modified_total_spends,
447
- "sales_act": channel.actual_total_sales,
448
- "sales_mod": channel.modified_total_sales,
449
- }
450
- )
451
- summary_rows.append(
452
- [
453
- "Total",
454
- self.actual_total_spends,
455
- self.modified_total_spends,
456
- self.actual_total_sales,
457
- self.modified_total_sales,
458
- round(self.actual_total_sales / self.actual_total_spends, 2),
459
- round(self.modified_total_sales / self.modified_total_spends, 2),
460
- 0.0,
461
- 0.0,
462
- ]
463
- )
464
  details["Actual"] = actual_list
465
  details["Modified"] = modified_list
466
  columns_index = pd.MultiIndex.from_product(
@@ -492,7 +467,8 @@ class Scenario:
492
  def from_dict(cls, attr_dict):
493
  channels_list = attr_dict["channels"]
494
  channels = {
495
- channel["name"]: class_from_dict(channel) for channel in channels_list
 
496
  }
497
  return Scenario(
498
  name=attr_dict["name"],
 
16
  attr_dict["modified_spends"] = class_instance.modified_spends
17
  attr_dict["modified_sales"] = class_instance.modified_sales
18
  attr_dict["response_curve_type"] = class_instance.response_curve_type
19
+ attr_dict["response_curve_params"] = (
20
+ class_instance.response_curve_params
21
+ )
22
  attr_dict["penalty"] = class_instance.penalty
23
  attr_dict["bounds"] = class_instance.bounds
24
  attr_dict["actual_total_spends"] = class_instance.actual_total_spends
25
  attr_dict["actual_total_sales"] = class_instance.actual_total_sales
26
+ attr_dict["modified_total_spends"] = (
27
+ class_instance.modified_total_spends
28
+ )
29
  attr_dict["modified_total_sales"] = class_instance.modified_total_sales
30
  attr_dict["actual_mroi"] = class_instance.get_marginal_roi("actual")
31
+ attr_dict["modified_mroi"] = class_instance.get_marginal_roi(
32
+ "modified"
33
+ )
34
 
35
  elif isinstance(class_instance, Scenario):
36
  attr_dict["type"] = "Scenario"
 
43
  attr_dict["correction"] = class_instance.correction
44
  attr_dict["actual_total_spends"] = class_instance.actual_total_spends
45
  attr_dict["actual_total_sales"] = class_instance.actual_total_sales
46
+ attr_dict["modified_total_spends"] = (
47
+ class_instance.modified_total_spends
48
+ )
49
  attr_dict["modified_total_sales"] = class_instance.modified_total_sales
50
 
51
  return attr_dict
 
95
  self.modified_sales = self.calculate_sales()
96
  self.modified_total_spends = self.modified_spends.sum()
97
  self.modified_total_sales = self.modified_sales.sum()
98
+ self.delta_spends = (
99
+ self.modified_total_spends - self.actual_total_spends
100
+ )
101
  self.delta_sales = self.modified_total_sales - self.actual_total_sales
102
 
103
  def update_penalty(self, penalty):
 
119
  x = np.where(
120
  x < self.upper_limit,
121
  x,
122
+ self.upper_limit
123
+ + (x - self.upper_limit) * self.upper_limit / x,
124
  )
125
  if self.response_curve_type == "s-curve":
126
  if self.power >= 0:
 
169
  self.modified_sales = self.calculate_sales()
170
  self.modified_total_spends = self.modified_spends.sum()
171
  self.modified_total_sales = self.modified_sales.sum()
172
+ self.delta_spends = (
173
+ self.modified_total_spends - self.actual_total_spends
174
+ )
175
  self.delta_sales = self.modified_total_sales - self.actual_total_sales
176
 
177
  def intialize(self):
 
208
  self.actual_total_sales = self.calculate_actual_total_sales()
209
  self.modified_total_sales = self.calculate_modified_total_sales()
210
  self.modified_total_spends = self.calculate_modified_total_spends()
211
+ self.delta_spends = (
212
+ self.modified_total_spends - self.actual_total_spends
213
+ )
214
  self.delta_sales = self.modified_total_sales - self.actual_total_sales
215
 
216
  def update_penalty(self, value):
 
220
  def calculate_modified_total_spends(self):
221
  total_actual_spends = 0.0
222
  for channel in self.channels.values():
223
+ total_actual_spends += (
224
+ channel.actual_total_spends * channel.conversion_rate
225
+ )
226
  return total_actual_spends
227
 
228
  def calculate_modified_total_spends(self):
 
251
  self.channels[channel_name].update(modified_spends)
252
  self.modified_total_sales = self.calculate_modified_total_sales()
253
  self.modified_total_spends = self.calculate_modified_total_spends()
254
+ self.delta_spends = (
255
+ self.modified_total_spends - self.actual_total_spends
256
+ )
257
  self.delta_sales = self.modified_total_sales - self.actual_total_sales
258
 
259
+ def optimize_spends(self, sales_percent, channels_list, algo="COBYLA"):
 
 
260
  desired_sales = self.actual_total_sales * (1 + sales_percent / 100.0)
261
 
262
  def constraint(x):
 
285
  x0=initial_point,
286
  constraints=constraints,
287
  method=algo,
288
+ options={"maxiter": int(2e7), "catol": 1},
289
  )
290
 
291
  for channel_name, modified_spends in zip(channels_list, res.x):
 
317
  for channel_name in channels_list:
318
  _channel_class = self.channels[channel_name]
319
  channel_bounds = _channel_class.bounds
320
+ channel_actual_total_spends = (
321
+ _channel_class.actual_total_spends
322
+ * ((1 + spends_percent / 100))
323
  )
324
  old_spends.append(channel_actual_total_spends)
325
+ bounds.append(
326
+ (1 + channel_bounds / 100) * channel_actual_total_spends
327
+ )
328
 
329
  def objective_function(x):
330
  for channel_name, modified_spends in zip(channels_list, x):
 
332
  return -1 * self.modified_total_sales
333
 
334
  res = minimize(
335
+ lambda x : objective_function(x) / 1e8,
336
  method="trust-constr",
337
  x0=old_spends,
338
  constraints=constraint,
339
  bounds=bounds,
340
+ options={"maxiter": int(1e7), 'xtol' : 100},
341
  )
342
  # res = dual_annealing(
343
  # objective_function,
 
361
  channel_data = []
362
 
363
  summary_rows = []
364
+ actual_list.append({
365
+ "name": "Total",
366
+ "Spends": self.actual_total_spends,
367
+ "Sales": self.actual_total_sales,
368
+ })
369
+ modified_list.append({
370
+ "name": "Total",
371
+ "Spends": self.modified_total_spends,
372
+ "Sales": self.modified_total_sales,
373
+ })
 
 
 
 
374
  for channel in self.channels.values():
375
  name_mod = channel.name.replace("_", " ")
376
  if name_mod.lower().endswith(" imp"):
377
  name_mod = name_mod.replace("Imp", " Impressions")
378
+ summary_rows.append([
379
+ name_mod,
380
+ channel.actual_total_spends,
381
+ channel.modified_total_spends,
382
+ channel.actual_total_sales,
383
+ channel.modified_total_sales,
384
+ round(
385
+ channel.actual_total_sales / channel.actual_total_spends, 2
386
+ ),
387
+ round(
388
+ channel.modified_total_sales
389
+ / channel.modified_total_spends,
390
+ 2,
391
+ ),
392
+ channel.get_marginal_roi("actual"),
393
+ channel.get_marginal_roi("modified"),
394
+ ])
395
  data[channel.name] = channel.modified_spends
396
  data["Date"] = channel.dates
397
  data["Sales"] = (
398
  data.get("Sales", np.zeros((len(channel.dates),)))
399
  + channel.modified_sales
400
  )
401
+ actual_list.append({
402
+ "name": channel.name,
403
+ "Spends": channel.actual_total_spends,
404
+ "Sales": channel.actual_total_sales,
405
+ "ROI": round(
406
+ channel.actual_total_sales / channel.actual_total_spends, 2
407
+ ),
408
+ })
409
+ modified_list.append({
410
+ "name": channel.name,
411
+ "Spends": channel.modified_total_spends,
412
+ "Sales": channel.modified_total_sales,
413
+ "ROI": round(
414
+ channel.modified_total_sales
415
+ / channel.modified_total_spends,
416
+ 2,
417
+ ),
418
+ "Marginal ROI": channel.get_marginal_roi("modified"),
419
+ })
420
+
421
+ channel_data.append({
422
+ "channel": channel.name,
423
+ "spends_act": channel.actual_total_spends,
424
+ "spends_mod": channel.modified_total_spends,
425
+ "sales_act": channel.actual_total_sales,
426
+ "sales_mod": channel.modified_total_sales,
427
+ })
428
+ summary_rows.append([
429
+ "Total",
430
+ self.actual_total_spends,
431
+ self.modified_total_spends,
432
+ self.actual_total_sales,
433
+ self.modified_total_sales,
434
+ round(self.actual_total_sales / self.actual_total_spends, 2),
435
+ round(self.modified_total_sales / self.modified_total_spends, 2),
436
+ 0.0,
437
+ 0.0,
438
+ ])
 
 
439
  details["Actual"] = actual_list
440
  details["Modified"] = modified_list
441
  columns_index = pd.MultiIndex.from_product(
 
467
  def from_dict(cls, attr_dict):
468
  channels_list = attr_dict["channels"]
469
  channels = {
470
+ channel["name"]: class_from_dict(channel)
471
+ for channel in channels_list
472
  }
473
  return Scenario(
474
  name=attr_dict["name"],
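Viewed as a whole, the classes.py changes in this commit are formatting-only: `class_to_dict` still flattens a Channel or Scenario into a plain dict, and `from_dict` rebuilds the channels with `class_from_dict`, exactly as the hunk above shows. A minimal sketch of that round-trip using generic stand-ins rather than the real constructors:

```python
from dataclasses import dataclass, asdict

@dataclass
class ChannelStub:
    name: str
    actual_total_spends: float
    response_curve_params: dict

def channel_to_dict(ch):
    d = asdict(ch)
    d["type"] = "Channel"          # the real helper tags the class type in a similar way
    return d

def channel_from_dict(d):
    return ChannelStub(
        name=d["name"],
        actual_total_spends=d["actual_total_spends"],
        response_curve_params=d["response_curve_params"],
    )

serialized = [channel_to_dict(ChannelStub("paid_search", 11300.0,
                                          {"K": 1.0, "b": 0.01, "a": 1e-5, "x0": 5.0}))]
channels = {d["name"]: channel_from_dict(d) for d in serialized}
print(channels["paid_search"])
```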
upf_data_converted.csv CHANGED
The diff for this file is too large to render. See raw diff
 
upf_data_converted.xlsx CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:acafd6c7cb1d8d860d6f055632dced93b1c726f432b230504b869b3e19a5edbc
-size 1853475
+oid sha256:92716069afa2c16a8afb6494da6d5f93878558de0215b1b9334ffeb997fdc6b6
+size 1561111
utilities.py CHANGED
@@ -12,6 +12,7 @@ import io
12
  import plotly
13
  from pathlib import Path
14
  import pickle
 
15
  import yaml
16
  from yaml import SafeLoader
17
  from streamlit.components.v1 import html
@@ -23,59 +24,27 @@ import os
23
  import base64
24
 
25
 
26
- color_palette = [
27
- "#F3F3F0",
28
- "#5E7D7E",
29
- "#2FA1FF",
30
- "#00EDED",
31
- "#00EAE4",
32
- "#304550",
33
- "#EDEBEB",
34
- "#7FBEFD",
35
- "#003059",
36
- "#A2F3F3",
37
- "#E1D6E2",
38
- "#B6B6B6",
39
- ]
40
 
41
 
42
- CURRENCY_INDICATOR = "$"
43
 
44
- import streamlit_authenticator as stauth
45
 
 
46
 
47
  def load_authenticator():
48
- with open("config.yaml") as file:
49
  config = yaml.load(file, Loader=SafeLoader)
50
- st.session_state["config"] = config
51
  authenticator = stauth.Authenticate(
52
- credentials=config["credentials"],
53
- cookie_name=config["cookie"]["name"],
54
- key=config["cookie"]["key"],
55
- cookie_expiry_days=config["cookie"]["expiry_days"],
56
- preauthorized=config["preauthorized"],
57
  )
58
- st.session_state["authenticator"] = authenticator
59
  return authenticator
60
 
61
-
62
- # Authentication
63
- def authentication():
64
- with open("config.yaml") as file:
65
- config = yaml.load(file, Loader=SafeLoader)
66
-
67
- authenticator = stauth.Authenticate(
68
- config["credentials"],
69
- config["cookie"]["name"],
70
- config["cookie"]["key"],
71
- config["cookie"]["expiry_days"],
72
- config["preauthorized"],
73
- )
74
-
75
- name, authentication_status, username = authenticator.login("Login", "main")
76
- return authenticator, name, authentication_status, username
77
-
78
-
79
  def nav_page(page_name, timeout_secs=3):
80
  nav_script = """
81
  <script type="text/javascript">
@@ -98,10 +67,7 @@ def nav_page(page_name, timeout_secs=3):
98
  attempt_nav_page("%s", new Date(), %d);
99
  });
100
  </script>
101
- """ % (
102
- page_name,
103
- timeout_secs,
104
- )
105
  html(nav_script)
106
 
107
 
@@ -126,18 +92,23 @@ data_url = base64.b64encode(contents).decode("utf-8")
126
 
127
  file_.close()
128
 
 
129
 
130
- DATA_PATH = "./data"
131
 
132
- IMAGES_PATH = "./data/images_224_224"
133
 
 
134
 
135
  def load_local_css(file_name):
136
 
137
  with open(file_name) as f:
138
 
139
- st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
140
 
 
 
 
141
 
142
  # def set_header():
143
 
@@ -158,24 +129,24 @@ data_url1 = base64.b64encode(contents1).decode("utf-8")
158
 
159
  file_1.close()
160
 
 
161
 
162
- DATA_PATH1 = "./data"
163
 
164
- IMAGES_PATH1 = "./data/images_224_224"
165
 
166
 
167
  def set_header():
168
- return st.markdown(
169
- f"""<div class='main-header'>
170
  <!-- <h1></h1> -->
171
  <div >
172
  <img class='blend-logo' src="data:image;base64,{data_url1}", alt="Logo">
173
  </div>
174
  <img class='blend-logo' src="data:image;base64,{data_url}", alt="Logo">
175
- </div>""",
176
- unsafe_allow_html=True,
177
- )
178
-
179
 
180
  # def set_header():
181
  # logo_path = "./path/to/your/local/LIME_logo.png" # Replace with the actual file path
@@ -186,87 +157,51 @@ def set_header():
186
  # </div>""", unsafe_allow_html=True)
187
 
188
 
189
- def s_curve(x, K, b, a, x0):
190
- return K / (1 + b * np.exp(-a * (x - x0)))
191
-
192
-
193
- def panel_level(input_df, date_column="Date"):
194
- # Ensure 'Date' is set as the index
195
- if date_column not in input_df.index.names:
196
- input_df = input_df.set_index(date_column)
197
-
198
- # Select numeric columns only (excluding 'Date' since it's now the index)
199
- numeric_columns_df = input_df.select_dtypes(include="number")
200
 
201
- # Group by 'Date' (which is the index) and sum the numeric columns
202
- aggregated_df = numeric_columns_df.groupby(input_df.index).sum()
203
-
204
- # Reset index if you want 'Date' back as a column
205
- aggregated_df = aggregated_df.reset_index()
206
-
207
- return aggregated_df
208
-
209
-
210
- def initialize_data(
211
- panel=None, target_file="Overview_data_test.xlsx", updated_rcs=None, metrics=None
212
- ):
213
  # uopx_conv_rates = {'streaming_impressions' : 0.007,'digital_impressions' : 0.007,'search_clicks' : 0.00719,'tv_impressions' : 0.000173,
214
  # "digital_clicks":0.005,"streaming_clicks":0.004,'streaming_spends':1,"tv_spends":1,"search_spends":1,
215
  # "digital_spends":1}
216
- # print('State initialized')
217
-
218
- excel = pd.read_excel(target_file, sheet_name=None)
219
-
220
- # Extract dataframes for raw data, spend input, and contribution MMM
221
- raw_df = excel["RAW DATA MMM"]
222
- spend_df = excel["SPEND INPUT"]
223
- contri_df = excel["CONTRIBUTION MMM"]
224
-
225
- # Check if the panel is not None
226
- if panel is not None and panel != "Aggregated":
227
- raw_df = raw_df[raw_df["Panel"] == panel].drop(columns=["Panel"])
228
- spend_df = spend_df[spend_df["Panel"] == panel].drop(columns=["Panel"])
229
- contri_df = contri_df[contri_df["Panel"] == panel].drop(columns=["Panel"])
230
- elif panel == "Aggregated":
231
- raw_df = panel_level(raw_df, date_column="Date")
232
- spend_df = panel_level(spend_df, date_column="Week")
233
- contri_df = panel_level(contri_df, date_column="Date")
234
-
235
- # Revenue_df = excel['Revenue']
236
-
237
- ## remove sesonalities, indices etc ...
238
- exclude_columns = [
239
- "Date",
240
- "Region",
241
- "Controls_Grammarly_Index_SeasonalAVG",
242
- "Controls_Quillbot_Index",
243
- "Daily_Positive_Outliers",
244
- "External_RemoteClass_Index",
245
- "Intervals ON 20190520-20190805 | 20200518-20200803 | 20210517-20210802",
246
- "Intervals ON 20190826-20191209 | 20200824-20201207 | 20210823-20211206",
247
- "Intervals ON 20201005-20201019",
248
- "Promotion_PercentOff",
249
- "Promotion_TimeBased",
250
- "Seasonality_Indicator_Chirstmas",
251
- "Seasonality_Indicator_NewYears_Days",
252
- "Seasonality_Indicator_Thanksgiving",
253
- "Trend 20200302 / 20200803",
254
- ]
255
- raw_df["Date"] = pd.to_datetime(raw_df["Date"])
256
- contri_df["Date"] = pd.to_datetime(contri_df["Date"])
257
- input_df = raw_df.sort_values(by="Date")
258
- output_df = contri_df.sort_values(by="Date")
259
- spend_df["Week"] = pd.to_datetime(
260
- spend_df["Week"], format="%Y-%m-%d", errors="coerce"
261
- )
262
- spend_df.sort_values(by="Week", inplace=True)
263
 
264
  # spend_df['Week'] = pd.to_datetime(spend_df['Week'], errors='coerce')
265
  # spend_df = spend_df.sort_values(by='Week')
 
266
 
267
  channel_list = [col for col in input_df.columns if col not in exclude_columns]
268
- channel_list = list(set(channel_list) - set(["fb_level_achieved_tier_1", "ga_app"]))
269
-
270
  response_curves = {}
271
  mapes = {}
272
  rmses = {}
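Both the old and the new body of `initialize_data` fit each channel's contribution to the logistic `s_curve` defined in this file with `scipy.optimize.curve_fit`, using the same starting point, parameter bounds and MAPE check that appear in the hunks below. A compact, self-contained version of that fitting step on synthetic data (the spends/contribution arrays here are placeholders, not project data):

```python
import numpy as np
from scipy.optimize import curve_fit

def s_curve(x, K, b, a, x0):
    return K / (1 + b * np.exp(-a * (x - x0)))

rng = np.random.default_rng(0)
x = np.linspace(0, 500, 120)                                   # scaled weekly spends
y = s_curve(x, K=900, b=5, a=0.02, x0=250) + rng.normal(0, 10, x.size)

bounds = ((0, 0, 0, 0), (3 * y.max(), 1000, 1, x.max()))
params, _ = curve_fit(
    s_curve, x, y,
    p0=(2 * y.max(), 0.01, 1e-5, x.max()),
    bounds=bounds,
    maxfev=int(1e5),
)
mape = (100 * abs(1 - s_curve(x, *params) / y.clip(min=1))).mean()
print(dict(zip(["K", "b", "a", "x0"], params)), round(mape, 2))
```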
@@ -280,14 +215,14 @@ def initialize_data(
280
  dates = input_df.Date.values
281
  actual_output_dic = {}
282
  actual_input_dic = {}
283
-
284
  for inp_col in channel_list:
285
- # st.write(inp_col)
286
  spends = input_df[inp_col].values
287
  x = spends.copy()
288
- # upper limit for penalty
289
- upper_limits[inp_col] = 2 * x.max()
290
-
291
  # contribution
292
  out_col = [_col for _col in output_df.columns if _col.startswith(inp_col)][0]
293
  y = output_df[out_col].values.copy()
@@ -295,141 +230,96 @@ def initialize_data(
295
  actual_input_dic[inp_col] = x.copy()
296
  ##output cols aggregation
297
  output_cols.append(out_col)
298
-
299
  ## scale the input
300
- power = np.ceil(np.log(x.max()) / np.log(10)) - 3
301
- if power >= 0:
302
  x = x / 10**power
303
-
304
- x = x.astype("float64")
305
- y = y.astype("float64")
306
- # print('#printing yyyyyyyyy')
307
- # print(inp_col)
308
- # print(x.max())
309
- # print(y.max())
310
- bounds = ((0, 0, 0, 0), (3 * y.max(), 1000, 1, x.max()))
311
-
312
- # bounds = ((y.max(), 3*y.max()),(0,1000),(0,1),(0,x.max()))
313
- params, _ = curve_fit(
314
- s_curve,
315
- x,
316
- y,
317
- p0=(2 * y.max(), 0.01, 1e-5, x.max()),
318
- bounds=bounds,
319
- maxfev=int(1e5),
320
- )
321
  mape = (100 * abs(1 - s_curve(x, *params) / y.clip(min=1))).mean()
322
- rmse = np.sqrt(((y - s_curve(x, *params)) ** 2).mean())
323
- r2_ = r2_score(y, s_curve(x, *params))
324
-
325
- response_curves[inp_col] = {
326
- "K": params[0],
327
- "b": params[1],
328
- "a": params[2],
329
- "x0": params[3],
330
- }
331
-
332
- updated_rcs_key = f"{metrics}#@{panel}#@{inp_col}"
333
- if updated_rcs is not None and updated_rcs_key in list(updated_rcs.keys()):
334
- response_curves[inp_col] = updated_rcs[updated_rcs_key]
335
 
 
336
  mapes[inp_col] = mape
337
  rmses[inp_col] = rmse
338
  r2[inp_col] = r2_
339
  powers[inp_col] = power
340
-
 
341
  ## conversion rates
342
- spend_col = [
343
- _col
344
- for _col in spend_df.columns
345
- if _col.startswith(inp_col.rsplit("_", 1)[0])
346
- ][0]
347
-
348
- # print('#printing spendssss')
349
- # print(spend_col)
350
- conv = (
351
- spend_df.set_index("Week")[spend_col]
352
- / input_df.set_index("Date")[inp_col].clip(lower=1)
353
- ).reset_index()
354
- conv.rename(columns={"index": "Week"}, inplace=True)
355
- conv["year"] = conv.Week.dt.year
356
- conv_rates[inp_col] = list(conv.drop("Week", axis=1).mean().to_dict().values())[
357
- 0
358
- ]
359
  ##print('Before',conv_rates[inp_col])
360
  # conv_rates[inp_col] = uopx_conv_rates[inp_col]
361
  ##print('After',(conv_rates[inp_col]))
362
-
363
- channel = Channel(
364
- name=inp_col,
365
- dates=dates,
366
- spends=spends,
367
- # conversion_rate = np.mean(list(conv_rates[inp_col].values())),
368
- conversion_rate=conv_rates[inp_col],
369
- response_curve_type="s-curve",
370
- response_curve_params={
371
- "K": params[0],
372
- "b": params[1],
373
- "a": params[2],
374
- "x0": params[3],
375
- },
376
- bounds=np.array([-10, 10]),
377
- )
378
  channels[inp_col] = channel
379
  if sales is None:
380
  sales = channel.actual_sales
381
  else:
382
  sales += channel.actual_sales
383
- other_contributions = (
384
- output_df.drop([*output_cols], axis=1).sum(axis=1, numeric_only=True).values
385
- )
386
- correction = output_df.drop("Date", axis=1).sum(axis=1).values - (
387
- sales + other_contributions
388
- )
389
- scenario = Scenario(
390
- name="default",
391
- channels=channels,
392
- constant=other_contributions,
393
- correction=correction,
394
- )
395
  ## setting session variables
396
- st.session_state["initialized"] = True
397
- st.session_state["actual_df"] = input_df
398
- st.session_state["raw_df"] = raw_df
399
- st.session_state["contri_df"] = output_df
400
  default_scenario_dict = class_to_dict(scenario)
401
- st.session_state["default_scenario_dict"] = default_scenario_dict
402
- st.session_state["scenario"] = scenario
403
- st.session_state["channels_list"] = channel_list
404
- st.session_state["optimization_channels"] = {
405
- channel_name: False for channel_name in channel_list
406
- }
407
- st.session_state["rcs"] = response_curves
408
-
409
- st.session_state["powers"] = powers
410
- st.session_state["actual_contribution_df"] = pd.DataFrame(actual_output_dic)
411
- st.session_state["actual_input_df"] = pd.DataFrame(actual_input_dic)
412
-
413
  for channel in channels.values():
414
- st.session_state[channel.name] = numerize(
415
- channel.actual_total_spends * channel.conversion_rate, 1
416
- )
417
-
418
- st.session_state["xlsx_buffer"] = io.BytesIO()
419
-
420
- if Path("../saved_scenarios.pkl").exists():
421
- with open("../saved_scenarios.pkl", "rb") as f:
422
- st.session_state["saved_scenarios"] = pickle.load(f)
423
  else:
424
- st.session_state["saved_scenarios"] = OrderedDict()
425
-
426
- # st.session_state["total_spends_change"] = 0
427
- st.session_state["optimization_channels"] = {
428
- channel_name: False for channel_name in channel_list
429
- }
430
- st.session_state["disable_download_button"] = True
431
-
432
-
433
  # def initialize_data():
434
  # # fetch data from excel
435
  # output = pd.read_excel('data.xlsx',sheet_name=None)
@@ -445,17 +335,17 @@ def initialize_data(
445
  # channel_list.append(col)
446
  # else:
447
  # pass
448
-
449
  # ## NOTE : Considered only Desktop spends for all calculations
450
  # acutal_df = raw_df[raw_df.Region == 'Desktop'].copy()
451
  # ## NOTE : Considered one year of data
452
  # acutal_df = acutal_df[acutal_df.Date>'2020-12-31']
453
  # actual_df = acutal_df.drop('Region',axis=1).sort_values(by='Date')[[*channel_list,'Date']]
454
-
455
  # ##load response curves
456
  # with open('./grammarly_response_curves.json','r') as f:
457
  # response_curves = json.load(f)
458
-
459
  # ## create channel dict for scenario creation
460
  # dates = actual_df.Date.values
461
  # channels = {}
@@ -473,15 +363,15 @@ def initialize_data(
473
  # response_curve_type=response_curve_type,
474
  # response_curve_params=response_curve_params,
475
  # bounds=np.array([-30,30]))
476
-
477
  # channels[name] = channel
478
  # else:
479
  # constant = info_dict.get('value',0.) * len(dates)
480
-
481
  # ## create scenario
482
  # scenario = Scenario(name='default', channels=channels, constant=constant)
483
  # default_scenario_dict = class_to_dict(scenario)
484
-
485
 
486
  # ## setting session variables
487
  # st.session_state['initialized'] = True
@@ -495,7 +385,7 @@ def initialize_data(
495
  # for channel in channels.values():
496
  # if channel.name not in st.session_state:
497
  # st.session_state[channel.name] = float(channel.actual_total_spends)
498
-
499
  # if 'xlsx_buffer' not in st.session_state:
500
  # st.session_state['xlsx_buffer'] = io.BytesIO()
501
 
@@ -504,121 +394,51 @@ def initialize_data(
504
  # if Path('../saved_scenarios.pkl').exists():
505
  # with open('../saved_scenarios.pkl','rb') as f:
506
  # st.session_state['saved_scenarios'] = pickle.load(f)
507
-
508
  # else:
509
  # st.session_state['saved_scenarios'] = OrderedDict()
510
 
511
  # if 'total_spends_change' not in st.session_state:
512
  # st.session_state['total_spends_change'] = 0
513
-
514
  # if 'optimization_channels' not in st.session_state:
515
  # st.session_state['optimization_channels'] = {channel_name : False for channel_name in channel_list}
516
-
517
  # if 'disable_download_button' not in st.session_state:
518
  # st.session_state['disable_download_button'] = True
519
-
520
-
521
  def create_channel_summary(scenario):
522
 
523
  # Provided data
524
  data = {
525
- "Channel": [
526
- "Paid Search",
527
- "Ga will cid baixo risco",
528
- "Digital tactic others",
529
- "Fb la tier 1",
530
- "Fb la tier 2",
531
- "Paid social others",
532
- "Programmatic",
533
- "Kwai",
534
- "Indicacao",
535
- "Infleux",
536
- "Influencer",
537
- ],
538
- "Spends": [
539
- "$ 11.3K",
540
- "$ 155.2K",
541
- "$ 50.7K",
542
- "$ 125.4K",
543
- "$ 125.2K",
544
- "$ 105K",
545
- "$ 3.3M",
546
- "$ 47.5K",
547
- "$ 55.9K",
548
- "$ 632.3K",
549
- "$ 48.3K",
550
- ],
551
- "Revenue": [
552
- "558.0K",
553
- "3.5M",
554
- "5.2M",
555
- "3.1M",
556
- "3.1M",
557
- "2.1M",
558
- "20.8M",
559
- "1.6M",
560
- "728.4K",
561
- "22.9M",
562
- "4.8M",
563
- ],
564
  }
565
 
566
  # Create DataFrame
567
  df = pd.DataFrame(data)
568
 
569
  # Convert currency strings to numeric values
570
- df["Spends"] = (
571
- df["Spends"]
572
- .replace({"\$": "", "K": "*1e3", "M": "*1e6"}, regex=True)
573
- .map(pd.eval)
574
- .astype(int)
575
- )
576
- df["Revenue"] = (
577
- df["Revenue"]
578
- .replace({"\$": "", "K": "*1e3", "M": "*1e6"}, regex=True)
579
- .map(pd.eval)
580
- .astype(int)
581
- )
582
 
583
  # Calculate ROI
584
- df["ROI"] = (df["Revenue"] - df["Spends"]) / df["Spends"]
585
 
586
  # Format columns
587
  format_currency = lambda x: f"${x:,.1f}"
588
  format_roi = lambda x: f"{x:.1f}"
589
 
590
- df["Spends"] = [
591
- "$ 11.3K",
592
- "$ 155.2K",
593
- "$ 50.7K",
594
- "$ 125.4K",
595
- "$ 125.2K",
596
- "$ 105K",
597
- "$ 3.3M",
598
- "$ 47.5K",
599
- "$ 55.9K",
600
- "$ 632.3K",
601
- "$ 48.3K",
602
- ]
603
- df["Revenue"] = [
604
- "$ 536.3K",
605
- "$ 3.4M",
606
- "$ 5M",
607
- "$ 3M",
608
- "$ 3M",
609
- "$ 2M",
610
- "$ 20M",
611
- "$ 1.5M",
612
- "$ 7.1M",
613
- "$ 22M",
614
- "$ 4.6M",
615
- ]
616
- df["ROI"] = df["ROI"].apply(format_roi)
617
-
618
  return df
619
 
620
 
621
- # @st.cache(allow_output_mutation=True)
622
  # def create_contribution_pie(scenario):
623
  # #c1f7dc
624
  # colors_map = {col:color for col,color in zip(st.session_state['channels_list'],plotly.colors.n_colors(plotly.colors.hex_to_rgb('#BE6468'), plotly.colors.hex_to_rgb('#E7B8B7'),23))}
@@ -650,23 +470,23 @@ def create_channel_summary(scenario):
650
  # weekly_spends_data = []
651
  # weekly_sales_data = []
652
  # for channel_name in st.session_state['channels_list']:
653
- # weekly_spends_data.append((go.Bar(x=x,
654
  # y=scenario.channels[channel_name].actual_spends * scenario.channels[channel_name].conversion_rate,
655
- # name=channel_name_formating(channel_name),
656
  # hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}",
657
  # legendgroup=channel_name)))
658
- # weekly_sales_data.append((go.Bar(x=x,
659
  # y=scenario.channels[channel_name].actual_sales,
660
- # name=channel_name_formating(channel_name),
661
  # hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
662
  # legendgroup=channel_name, showlegend=False)))
663
  # for _d in weekly_spends_data:
664
  # weekly_contribution_fig.add_trace(_d, row=1, col=1)
665
  # for _d in weekly_sales_data:
666
  # weekly_contribution_fig.add_trace(_d, row=1, col=2)
667
- # weekly_contribution_fig.add_trace(go.Bar(x=x,
668
  # y=scenario.constant + scenario.correction,
669
- # name='Non Media',
670
  # hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}"), row=1, col=2)
671
  # weekly_contribution_fig.update_layout(barmode='stack', title='Channel contribuion by week', xaxis_title='Date')
672
  # weekly_contribution_fig.update_xaxes(showgrid=False)
@@ -704,50 +524,14 @@ def create_channel_summary(scenario):
704
 
705
 
706
  def create_contribution_pie():
707
- color_palette = [
708
- "#F3F3F0",
709
- "#5E7D7E",
710
- "#2FA1FF",
711
- "#00EDED",
712
- "#00EAE4",
713
- "#304550",
714
- "#EDEBEB",
715
- "#7FBEFD",
716
- "#003059",
717
- "#A2F3F3",
718
- "#E1D6E2",
719
- "#B6B6B6",
720
- ]
721
- total_contribution_fig = make_subplots(
722
- rows=1,
723
- cols=2,
724
- subplot_titles=["Spends", "Revenue"],
725
- specs=[[{"type": "pie"}, {"type": "pie"}]],
726
- )
727
 
728
- channels_list = [
729
- "Paid Search",
730
- "Ga will cid baixo risco",
731
- "Digital tactic others",
732
- "Fb la tier 1",
733
- "Fb la tier 2",
734
- "Paid social others",
735
- "Programmatic",
736
- "Kwai",
737
- "Indicacao",
738
- "Infleux",
739
- "Influencer",
740
- "Non Media",
741
- ]
742
 
743
  # Assign colors from the limited palette to channels
744
- colors_map = {
745
- col: color_palette[i % len(color_palette)]
746
- for i, col in enumerate(channels_list)
747
- }
748
- colors_map["Non Media"] = color_palette[
749
- 5
750
- ] # Assign fixed green color for 'Non Media'
751
 
752
  # Hardcoded values for Spends and Revenue
753
  spends_values = [0.5, 3.36, 1.1, 2.7, 2.7, 2.27, 70.6, 1, 1, 13.7, 1, 0]
@@ -758,13 +542,10 @@ def create_contribution_pie():
758
  go.Pie(
759
  labels=[channel_name for channel_name in channels_list],
760
  values=spends_values,
761
- marker=dict(
762
- colors=[colors_map[channel_name] for channel_name in channels_list]
763
- ),
764
- hole=0.3,
765
  ),
766
- row=1,
767
- col=1,
768
  )
769
 
770
  # Add trace for Revenue pie chart
@@ -772,196 +553,144 @@ def create_contribution_pie():
772
  go.Pie(
773
  labels=[channel_name for channel_name in channels_list],
774
  values=revenue_values,
775
- marker=dict(
776
- colors=[colors_map[channel_name] for channel_name in channels_list]
777
- ),
778
- hole=0.3,
779
  ),
780
- row=1,
781
- col=2,
782
- )
783
-
784
- total_contribution_fig.update_traces(
785
- textposition="inside", texttemplate="%{percent:.1%}"
786
- )
787
- total_contribution_fig.update_layout(
788
- uniformtext_minsize=12, title="Channel contribution", uniformtext_mode="hide"
789
  )
 
 
 
790
  return total_contribution_fig
791
 
792
-
793
  def create_contribuion_stacked_plot(scenario):
794
- weekly_contribution_fig = make_subplots(
795
- rows=1,
796
- cols=2,
797
- subplot_titles=["Spends", "Revenue"],
798
- specs=[[{"type": "bar"}, {"type": "bar"}]],
799
- )
800
- raw_df = st.session_state["raw_df"]
801
- df = raw_df.sort_values(by="Date")
802
  x = df.Date
803
  weekly_spends_data = []
804
  weekly_sales_data = []
805
-
806
- for i, channel_name in enumerate(st.session_state["channels_list"]):
807
  color = color_palette[i % len(color_palette)]
808
-
809
- weekly_spends_data.append(
810
- go.Bar(
811
- x=x,
812
- y=scenario.channels[channel_name].actual_spends
813
- * scenario.channels[channel_name].conversion_rate,
814
- name=channel_name_formating(channel_name),
815
- hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}",
816
- legendgroup=channel_name,
817
- marker_color=color,
818
- )
819
- )
820
-
821
- weekly_sales_data.append(
822
- go.Bar(
823
- x=x,
824
- y=scenario.channels[channel_name].actual_sales,
825
- name=channel_name_formating(channel_name),
826
- hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
827
- legendgroup=channel_name,
828
- showlegend=False,
829
- marker_color=color,
830
- )
831
- )
832
-
833
  for _d in weekly_spends_data:
834
  weekly_contribution_fig.add_trace(_d, row=1, col=1)
835
  for _d in weekly_sales_data:
836
  weekly_contribution_fig.add_trace(_d, row=1, col=2)
837
-
838
- weekly_contribution_fig.add_trace(
839
- go.Bar(
840
- x=x,
841
- y=scenario.constant + scenario.correction,
842
- name="Non Media",
843
- hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
844
- marker_color=color_palette[-1],
845
- ),
846
- row=1,
847
- col=2,
848
- )
849
-
850
- weekly_contribution_fig.update_layout(
851
- barmode="stack", title="Channel contribution by week", xaxis_title="Date"
852
- )
853
  weekly_contribution_fig.update_xaxes(showgrid=False)
854
  weekly_contribution_fig.update_yaxes(showgrid=False)
855
  return weekly_contribution_fig
856
 
857
-
858
  def create_channel_spends_sales_plot(channel):
859
  if channel is not None:
860
  x = channel.dates
861
  _spends = channel.actual_spends * channel.conversion_rate
862
  _sales = channel.actual_sales
863
  channel_sales_spends_fig = make_subplots(specs=[[{"secondary_y": True}]])
864
- channel_sales_spends_fig.add_trace(
865
- go.Bar(
866
- x=x,
867
- y=_sales,
868
- marker_color=color_palette[
869
- 3
870
- ], # You can choose a color from the palette
871
- name="Revenue",
872
- hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
873
- ),
874
- secondary_y=False,
875
- )
876
-
877
- channel_sales_spends_fig.add_trace(
878
- go.Scatter(
879
- x=x,
880
- y=_spends,
881
- line=dict(
882
- color=color_palette[2]
883
- ), # You can choose another color from the palette
884
- name="Spends",
885
- hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}",
886
- ),
887
- secondary_y=True,
888
- )
889
-
890
- channel_sales_spends_fig.update_layout(
891
- xaxis_title="Date",
892
- yaxis_title="Revenue",
893
- yaxis2_title="Spends ($)",
894
- title="Channel spends and Revenue week-wise",
895
- )
896
  channel_sales_spends_fig.update_xaxes(showgrid=False)
897
  channel_sales_spends_fig.update_yaxes(showgrid=False)
898
  else:
899
- raw_df = st.session_state["raw_df"]
900
- df = raw_df.sort_values(by="Date")
901
  x = df.Date
902
- scenario = class_from_dict(st.session_state["default_scenario_dict"])
903
  _sales = scenario.constant + scenario.correction
904
  channel_sales_spends_fig = make_subplots(specs=[[{"secondary_y": True}]])
905
- channel_sales_spends_fig.add_trace(
906
- go.Bar(
907
- x=x,
908
- y=_sales,
909
- marker_color=color_palette[
910
- 0
911
- ], # You can choose a color from the palette
912
- name="Revenue",
913
- hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
914
- ),
915
- secondary_y=False,
916
- )
917
-
918
- channel_sales_spends_fig.update_layout(
919
- xaxis_title="Date",
920
- yaxis_title="Revenue",
921
- yaxis2_title="Spends ($)",
922
- title="Channel spends and Revenue week-wise",
923
- )
924
  channel_sales_spends_fig.update_xaxes(showgrid=False)
925
  channel_sales_spends_fig.update_yaxes(showgrid=False)
926
-
927
  return channel_sales_spends_fig
928
 
929
-
930
- def format_numbers(value, n_decimals=1, include_indicator=True):
931
  if include_indicator:
932
- return f"{CURRENCY_INDICATOR} {numerize(value,n_decimals)}"
933
  else:
934
- return f"{numerize(value,n_decimals)}"
935
 
936
 
937
- def decimal_formater(num_string, n_decimals=1):
938
- parts = num_string.split(".")
939
  if len(parts) == 1:
940
- return num_string + "." + "0" * n_decimals
941
  else:
942
  to_be_padded = n_decimals - len(parts[-1])
943
- if to_be_padded > 0:
944
- return num_string + "0" * to_be_padded
945
  else:
946
  return num_string
947
-
948
-
949
  def channel_name_formating(channel_name):
950
- name_mod = channel_name.replace("_", " ")
951
- if name_mod.lower().endswith(" imp"):
952
- name_mod = name_mod.replace("Imp", "Spend")
953
- elif name_mod.lower().endswith(" clicks"):
954
- name_mod = name_mod.replace("Clicks", "Spend")
955
  return name_mod
956
 
957
 
958
- def send_email(email, message):
959
- s = smtplib.SMTP("smtp.gmail.com", 587)
960
  s.starttls()
961
  s.login("geethu4444@gmail.com", "jgydhpfusuremcol")
962
  s.sendmail("geethu4444@gmail.com", email, message)
963
  s.quit()
964
 
965
-
966
  if __name__ == "__main__":
967
  initialize_data()
 
12
  import plotly
13
  from pathlib import Path
14
  import pickle
15
+ import streamlit_authenticator as stauth
16
  import yaml
17
  from yaml import SafeLoader
18
  from streamlit.components.v1 import html
 
24
  import base64
25
 
26
 
 
27
 
28
 
29
+ color_palette = ['#F3F3F0', '#5E7D7E', '#2FA1FF', '#00EDED', '#00EAE4', '#304550', '#EDEBEB', '#7FBEFD', '#003059', '#A2F3F3', '#E1D6E2', '#B6B6B6']
30
 
 
31
 
32
+ CURRENCY_INDICATOR = '$'
33
 
34
  def load_authenticator():
35
+ with open('config.yaml') as file:
36
  config = yaml.load(file, Loader=SafeLoader)
37
+ st.session_state['config'] = config
38
  authenticator = stauth.Authenticate(
39
+ config['credentials'],
40
+ config['cookie']['name'],
41
+ config['cookie']['key'],
42
+ config['cookie']['expiry_days'],
43
+ config['preauthorized']
44
  )
45
+ st.session_state['authenticator'] = authenticator
46
  return authenticator
47
 
 
 
48
  def nav_page(page_name, timeout_secs=3):
49
  nav_script = """
50
  <script type="text/javascript">
 
67
  attempt_nav_page("%s", new Date(), %d);
68
  });
69
  </script>
70
+ """ % (page_name, timeout_secs)
 
 
 
71
  html(nav_script)
72
 
73
 
 
92
 
93
  file_.close()
94
 
95
+
96
 
97
+ DATA_PATH = './data'
98
 
99
+ IMAGES_PATH = './data/images_224_224'
100
 
101
+
102
 
103
  def load_local_css(file_name):
104
 
105
  with open(file_name) as f:
106
 
107
+ st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)
108
 
109
+
110
+
111
+
112
 
113
  # def set_header():
114
 
 
129
 
130
  file_1.close()
131
 
132
+
133
+
134
+ DATA_PATH1 = './data'
135
+
136
+ IMAGES_PATH1 = './data/images_224_224'
137
+
138
 
 
139
 
 
140
 
141
 
142
  def set_header():
143
+ return st.markdown(f"""<div class='main-header'>
 
144
  <!-- <h1></h1> -->
145
  <div >
146
  <img class='blend-logo' src="data:image;base64,{data_url1}", alt="Logo">
147
  </div>
148
  <img class='blend-logo' src="data:image;base64,{data_url}", alt="Logo">
149
+ </div>""", unsafe_allow_html=True)
 
 
 
150
 
151
  # def set_header():
152
  # logo_path = "./path/to/your/local/LIME_logo.png" # Replace with the actual file path
 
157
  # </div>""", unsafe_allow_html=True)
158
 
159
 
160
+ def s_curve(x,K,b,a,x0):
161
+ return K / (1 + b * np.exp(-a*(x-x0)))
 
 
162
 
163
+ def initialize_data():
 
 
164
  # uopx_conv_rates = {'streaming_impressions' : 0.007,'digital_impressions' : 0.007,'search_clicks' : 0.00719,'tv_impressions' : 0.000173,
165
  # "digital_clicks":0.005,"streaming_clicks":0.004,'streaming_spends':1,"tv_spends":1,"search_spends":1,
166
  # "digital_spends":1}
167
+ #print('State initialized')
168
+ excel = pd.read_excel("Overview_data_test.xlsx",sheet_name=None)
169
+ raw_df = excel['RAW DATA MMM']
170
+
171
+ spend_df = excel['SPEND INPUT']
172
+ contri_df = excel['CONTRIBUTION MMM']
173
+ #Revenue_df = excel['Revenue']
174
+
175
+ ## remove sesonalities, indices etc ...
176
+ exclude_columns = ['Date',
177
+ 'Region',
178
+ 'Controls_Grammarly_Index_SeasonalAVG',
179
+ 'Controls_Quillbot_Index',
180
+ 'Daily_Positive_Outliers',
181
+ 'External_RemoteClass_Index',
182
+ 'Intervals ON 20190520-20190805 | 20200518-20200803 | 20210517-20210802',
183
+ 'Intervals ON 20190826-20191209 | 20200824-20201207 | 20210823-20211206',
184
+ 'Intervals ON 20201005-20201019',
185
+ 'Promotion_PercentOff',
186
+ 'Promotion_TimeBased',
187
+ 'Seasonality_Indicator_Chirstmas',
188
+ 'Seasonality_Indicator_NewYears_Days',
189
+ 'Seasonality_Indicator_Thanksgiving',
190
+ 'Trend 20200302 / 20200803',
191
+ ]
192
+ raw_df['Date']=pd.to_datetime(raw_df['Date'])
193
+ contri_df['Date']=pd.to_datetime(contri_df['Date'])
194
+ input_df = raw_df.sort_values(by='Date')
195
+ output_df = contri_df.sort_values(by='Date')
196
+ spend_df['Week'] = pd.to_datetime(spend_df['Week'], format='%Y-%m-%d', errors='coerce')
197
+ spend_df.sort_values(by='Week', inplace=True)
 
 
198
 
199
  # spend_df['Week'] = pd.to_datetime(spend_df['Week'], errors='coerce')
200
  # spend_df = spend_df.sort_values(by='Week')
201
+
202
 
203
  channel_list = [col for col in input_df.columns if col not in exclude_columns]
204
+
 
205
  response_curves = {}
206
  mapes = {}
207
  rmses = {}
 
215
  dates = input_df.Date.values
216
  actual_output_dic = {}
217
  actual_input_dic = {}
218
+
219
  for inp_col in channel_list:
220
+ #st.write(inp_col)
221
  spends = input_df[inp_col].values
222
  x = spends.copy()
223
+ # upper limit for penalty
224
+ upper_limits[inp_col] = 2*x.max()
225
+
226
  # contribution
227
  out_col = [_col for _col in output_df.columns if _col.startswith(inp_col)][0]
228
  y = output_df[out_col].values.copy()
 
230
  actual_input_dic[inp_col] = x.copy()
231
  ##output cols aggregation
232
  output_cols.append(out_col)
233
+
234
  ## scale the input
235
+ power = (np.ceil(np.log(x.max()) / np.log(10) )- 3)
236
+ if power >= 0 :
237
  x = x / 10**power
238
+
239
+
240
+ x = x.astype('float64')
241
+ y = y.astype('float64')
242
+ #print('#printing yyyyyyyyy')
243
+ #print(inp_col)
244
+ #print(x.max())
245
+ #print(y.max())
246
+ bounds = ((0, 0, 0, 0), (3*y.max(), 1000, 1, x.max()))
247
+
248
+ #bounds = ((y.max(), 3*y.max()),(0,1000),(0,1),(0,x.max()))
249
+ params,_ = curve_fit(s_curve,x,y,p0=(2*y.max(),0.01,1e-5,x.max()),
250
+ bounds=bounds,
251
+ maxfev=int(1e5))
 
 
 
 
252
  mape = (100 * abs(1 - s_curve(x, *params) / y.clip(min=1))).mean()
253
+ rmse = np.sqrt(((y - s_curve(x,*params))**2).mean())
254
+ r2_ = r2_score(y, s_curve(x,*params))
 
 
255
 
256
+ response_curves[inp_col] = {'K' : params[0], 'b' : params[1], 'a' : params[2], 'x0' : params[3]}
257
  mapes[inp_col] = mape
258
  rmses[inp_col] = rmse
259
  r2[inp_col] = r2_
260
  powers[inp_col] = power
261
+
262
+
263
  ## conversion rates
264
+ spend_col = [_col for _col in spend_df.columns if _col.startswith(inp_col.rsplit('_',1)[0])][0]
265
+
266
+ #print('#printing spendssss')
267
+ #print(spend_col)
268
+ conv = (spend_df.set_index('Week')[spend_col] / input_df.set_index('Date')[inp_col].clip(lower=1)).reset_index()
269
+ conv.rename(columns={'index':'Week'},inplace=True)
270
+ conv['year'] = conv.Week.dt.year
271
+ conv_rates[inp_col] = list(conv.drop('Week',axis=1).mean().to_dict().values())[0]
 
 
272
  ##print('Before',conv_rates[inp_col])
273
  # conv_rates[inp_col] = uopx_conv_rates[inp_col]
274
  ##print('After',(conv_rates[inp_col]))
275
+
276
+
277
+ channel = Channel(name=inp_col,dates=dates,
278
+ spends=spends,
279
+ # conversion_rate = np.mean(list(conv_rates[inp_col].values())),
280
+ conversion_rate = conv_rates[inp_col],
281
+ response_curve_type='s-curve',
282
+ response_curve_params={'K' : params[0], 'b' : params[1], 'a' : params[2], 'x0' : params[3]},
283
+ bounds=np.array([-10,10]))
 
 
284
  channels[inp_col] = channel
285
  if sales is None:
286
  sales = channel.actual_sales
287
  else:
288
  sales += channel.actual_sales
289
+ other_contributions = output_df.drop([*output_cols], axis=1).sum(axis=1, numeric_only = True).values
290
+ correction = output_df.drop('Date',axis=1).sum(axis=1).values - (sales + other_contributions)
291
+ scenario = Scenario(name='default', channels=channels, constant=other_contributions, correction = correction)
 
 
292
  ## setting session variables
293
+ st.session_state['initialized'] = True
294
+ st.session_state['actual_df'] = input_df
295
+ st.session_state['raw_df'] = raw_df
296
+ st.session_state['contri_df'] = output_df
297
  default_scenario_dict = class_to_dict(scenario)
298
+ st.session_state['default_scenario_dict'] = default_scenario_dict
299
+ st.session_state['scenario'] = scenario
300
+ st.session_state['channels_list'] = channel_list
301
+ st.session_state['optimization_channels'] = {channel_name : False for channel_name in channel_list}
302
+ st.session_state['rcs'] = response_curves
303
+ st.session_state['powers'] = powers
304
+ st.session_state['actual_contribution_df'] = pd.DataFrame(actual_output_dic)
305
+ st.session_state['actual_input_df'] = pd.DataFrame(actual_input_dic)
306
+
 
 
 
307
  for channel in channels.values():
308
+ st.session_state[channel.name] = numerize(channel.actual_total_spends * channel.conversion_rate,1)
309
+
310
+ st.session_state['xlsx_buffer'] = io.BytesIO()
311
+
312
+
313
+ if Path('../saved_scenarios.pkl').exists():
314
+ with open('../saved_scenarios.pkl','rb') as f:
315
+ st.session_state['saved_scenarios'] = pickle.load(f)
 
316
  else:
317
+ st.session_state['saved_scenarios'] = OrderedDict()
318
+
319
+ st.session_state['total_spends_change'] = 0
320
+ st.session_state['optimization_channels'] = {channel_name : False for channel_name in channel_list}
321
+ st.session_state['disable_download_button'] = True
322
+
 
 
 
323
  # def initialize_data():
324
  # # fetch data from excel
325
  # output = pd.read_excel('data.xlsx',sheet_name=None)
 
335
  # channel_list.append(col)
336
  # else:
337
  # pass
338
+
339
  # ## NOTE : Considered only Desktop spends for all calculations
340
  # acutal_df = raw_df[raw_df.Region == 'Desktop'].copy()
341
  # ## NOTE : Considered one year of data
342
  # acutal_df = acutal_df[acutal_df.Date>'2020-12-31']
343
  # actual_df = acutal_df.drop('Region',axis=1).sort_values(by='Date')[[*channel_list,'Date']]
344
+
345
  # ##load response curves
346
  # with open('./grammarly_response_curves.json','r') as f:
347
  # response_curves = json.load(f)
348
+
349
  # ## create channel dict for scenario creation
350
  # dates = actual_df.Date.values
351
  # channels = {}
 
363
  # response_curve_type=response_curve_type,
364
  # response_curve_params=response_curve_params,
365
  # bounds=np.array([-30,30]))
366
+
367
  # channels[name] = channel
368
  # else:
369
  # constant = info_dict.get('value',0.) * len(dates)
370
+
371
  # ## create scenario
372
  # scenario = Scenario(name='default', channels=channels, constant=constant)
373
  # default_scenario_dict = class_to_dict(scenario)
374
+
375
 
376
  # ## setting session variables
377
  # st.session_state['initialized'] = True
 
385
  # for channel in channels.values():
386
  # if channel.name not in st.session_state:
387
  # st.session_state[channel.name] = float(channel.actual_total_spends)
388
+
389
  # if 'xlsx_buffer' not in st.session_state:
390
  # st.session_state['xlsx_buffer'] = io.BytesIO()
391
 
 
394
  # if Path('../saved_scenarios.pkl').exists():
395
  # with open('../saved_scenarios.pkl','rb') as f:
396
  # st.session_state['saved_scenarios'] = pickle.load(f)
397
+
398
  # else:
399
  # st.session_state['saved_scenarios'] = OrderedDict()
400
 
401
  # if 'total_spends_change' not in st.session_state:
402
  # st.session_state['total_spends_change'] = 0
403
+
404
  # if 'optimization_channels' not in st.session_state:
405
  # st.session_state['optimization_channels'] = {channel_name : False for channel_name in channel_list}
406
+
407
  # if 'disable_download_button' not in st.session_state:
408
  # st.session_state['disable_download_button'] = True
409
+
410
+
411
  def create_channel_summary(scenario):
412
 
413
  # Provided data
414
  data = {
415
+ 'Channel': ['Paid Search', 'Ga will cid baixo risco', 'Digital tactic others', 'Fb la tier 1', 'Fb la tier 2', 'Paid social others', 'Programmatic', 'Kwai', 'Indicacao', 'Infleux', 'Influencer'],
416
+ 'Spends': ['$ 11.3K', '$ 155.2K', '$ 50.7K', '$ 125.4K', '$ 125.2K', '$ 105K', '$ 3.3M', '$ 47.5K', '$ 55.9K', '$ 632.3K', '$ 48.3K'],
417
+ 'Revenue': ['558.0K', '3.5M', '5.2M', '3.1M', '3.1M', '2.1M', '20.8M', '1.6M', '728.4K', '22.9M', '4.8M']
 
 
418
  }
419
 
420
  # Create DataFrame
421
  df = pd.DataFrame(data)
422
 
423
  # Convert currency strings to numeric values
424
+ df['Spends'] = df['Spends'].replace({'\$': '', 'K': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(int)
425
+ df['Revenue'] = df['Revenue'].replace({'\$': '', 'K': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(int)
 
 
426
 
427
  # Calculate ROI
428
+ df['ROI'] = ((df['Revenue'] - df['Spends']) / df['Spends'])
429
 
430
  # Format columns
431
  format_currency = lambda x: f"${x:,.1f}"
432
  format_roi = lambda x: f"{x:.1f}"
433
 
434
+ df['Spends'] = ['$ 11.3K', '$ 155.2K', '$ 50.7K', '$ 125.4K', '$ 125.2K', '$ 105K', '$ 3.3M', '$ 47.5K', '$ 55.9K', '$ 632.3K', '$ 48.3K']
435
+ df['Revenue'] = ['$ 536.3K', '$ 3.4M', '$ 5M', '$ 3M', '$ 3M', '$ 2M', '$ 20M', '$ 1.5M', '$ 7.1M', '$ 22M', '$ 4.6M']
436
+ df['ROI'] = df['ROI'].apply(format_roi)
437
+
 
 
438
  return df
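The rewritten `create_channel_summary` above still parses display strings such as '$ 3.3M' into numbers by turning the K/M suffixes into arithmetic and evaluating them with `pd.eval`. A small isolated example of that trick:

```python
import pandas as pd

spends = pd.Series(["$ 11.3K", "$ 105K", "$ 1.5M"])
numeric = (
    spends.replace({r"\$": "", "K": "*1e3", "M": "*1e6"}, regex=True)
    .map(pd.eval)
    .astype(int)
)
print(numeric.tolist())  # [11300, 105000, 1500000]
```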
439
 
440
 
441
+ #@st.cache(allow_output_mutation=True)
442
  # def create_contribution_pie(scenario):
443
  # #c1f7dc
444
  # colors_map = {col:color for col,color in zip(st.session_state['channels_list'],plotly.colors.n_colors(plotly.colors.hex_to_rgb('#BE6468'), plotly.colors.hex_to_rgb('#E7B8B7'),23))}
 
470
  # weekly_spends_data = []
471
  # weekly_sales_data = []
472
  # for channel_name in st.session_state['channels_list']:
473
+ # weekly_spends_data.append((go.Bar(x=x,
474
  # y=scenario.channels[channel_name].actual_spends * scenario.channels[channel_name].conversion_rate,
475
+ # name=channel_name_formating(channel_name),
476
  # hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}",
477
  # legendgroup=channel_name)))
478
+ # weekly_sales_data.append((go.Bar(x=x,
479
  # y=scenario.channels[channel_name].actual_sales,
480
+ # name=channel_name_formating(channel_name),
481
  # hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
482
  # legendgroup=channel_name, showlegend=False)))
483
  # for _d in weekly_spends_data:
484
  # weekly_contribution_fig.add_trace(_d, row=1, col=1)
485
  # for _d in weekly_sales_data:
486
  # weekly_contribution_fig.add_trace(_d, row=1, col=2)
487
+ # weekly_contribution_fig.add_trace(go.Bar(x=x,
488
  # y=scenario.constant + scenario.correction,
489
+ # name='Non Media',
490
  # hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}"), row=1, col=2)
491
  # weekly_contribution_fig.update_layout(barmode='stack', title='Channel contribuion by week', xaxis_title='Date')
492
  # weekly_contribution_fig.update_xaxes(showgrid=False)
 
524
 
525
 
526
  def create_contribution_pie():
527
+ color_palette = ['#F3F3F0', '#5E7D7E', '#2FA1FF', '#00EDED', '#00EAE4', '#304550', '#EDEBEB', '#7FBEFD', '#003059', '#A2F3F3', '#E1D6E2', '#B6B6B6']
528
+ total_contribution_fig = make_subplots(rows=1, cols=2, subplot_titles=['Spends', 'Revenue'], specs=[[{"type": "pie"}, {"type": "pie"}]])
 
 
529
 
530
+ channels_list = ['Paid Search', 'Ga will cid baixo risco', 'Digital tactic others', 'Fb la tier 1', 'Fb la tier 2', 'Paid social others', 'Programmatic', 'Kwai', 'Indicacao', 'Infleux', 'Influencer', 'Non Media']
 
 
531
 
532
  # Assign colors from the limited palette to channels
533
+ colors_map = {col: color_palette[i % len(color_palette)] for i, col in enumerate(channels_list)}
534
+ colors_map['Non Media'] = color_palette[5] # Assign fixed green color for 'Non Media'
 
 
535
 
536
  # Hardcoded values for Spends and Revenue
537
  spends_values = [0.5, 3.36, 1.1, 2.7, 2.7, 2.27, 70.6, 1, 1, 13.7, 1, 0]
 
542
  go.Pie(
543
  labels=[channel_name for channel_name in channels_list],
544
  values=spends_values,
545
+ marker=dict(colors=[colors_map[channel_name] for channel_name in channels_list]),
546
+ hole=0.3
 
 
547
  ),
548
+ row=1, col=1
 
549
  )
550
 
551
  # Add trace for Revenue pie chart
 
553
  go.Pie(
554
  labels=[channel_name for channel_name in channels_list],
555
  values=revenue_values,
556
+ marker=dict(colors=[colors_map[channel_name] for channel_name in channels_list]),
557
+ hole=0.3
 
 
558
  ),
559
+ row=1, col=2
 
560
  )
561
+
562
+ total_contribution_fig.update_traces(textposition='inside', texttemplate='%{percent:.1%}')
563
+ total_contribution_fig.update_layout(uniformtext_minsize=12, title='Channel contribution', uniformtext_mode='hide')
564
  return total_contribution_fig
565
 
 
566
  def create_contribuion_stacked_plot(scenario):
567
+ weekly_contribution_fig = make_subplots(rows=1, cols=2, subplot_titles=['Spends', 'Revenue'], specs=[[{"type": "bar"}, {"type": "bar"}]])
568
+ raw_df = st.session_state['raw_df']
569
+ df = raw_df.sort_values(by='Date')
 
 
570
  x = df.Date
571
  weekly_spends_data = []
572
  weekly_sales_data = []
573
+
574
+ for i, channel_name in enumerate(st.session_state['channels_list']):
575
  color = color_palette[i % len(color_palette)]
576
+
577
+ weekly_spends_data.append(go.Bar(
578
+ x=x,
579
+ y=scenario.channels[channel_name].actual_spends * scenario.channels[channel_name].conversion_rate,
580
+ name=channel_name_formating(channel_name),
581
+ hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}",
582
+ legendgroup=channel_name,
583
+ marker_color=color,
584
+ ))
585
+
586
+ weekly_sales_data.append(go.Bar(
587
+ x=x,
588
+ y=scenario.channels[channel_name].actual_sales,
589
+ name=channel_name_formating(channel_name),
590
+ hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
591
+ legendgroup=channel_name,
592
+ showlegend=False,
593
+ marker_color=color,
594
+ ))
595
+
 
 
 
 
 
596
  for _d in weekly_spends_data:
597
  weekly_contribution_fig.add_trace(_d, row=1, col=1)
598
  for _d in weekly_sales_data:
599
  weekly_contribution_fig.add_trace(_d, row=1, col=2)
600
+
601
+ weekly_contribution_fig.add_trace(go.Bar(
602
+ x=x,
603
+ y=scenario.constant + scenario.correction,
604
+ name='Non Media',
605
+ hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
606
+ marker_color=color_palette[-1],
607
+ ), row=1, col=2)
608
+
609
+ weekly_contribution_fig.update_layout(barmode='stack', title='Channel contribution by week', xaxis_title='Date')
 
 
610
  weekly_contribution_fig.update_xaxes(showgrid=False)
611
  weekly_contribution_fig.update_yaxes(showgrid=False)
612
  return weekly_contribution_fig
613
 
 
614
  def create_channel_spends_sales_plot(channel):
615
  if channel is not None:
616
  x = channel.dates
617
  _spends = channel.actual_spends * channel.conversion_rate
618
  _sales = channel.actual_sales
619
  channel_sales_spends_fig = make_subplots(specs=[[{"secondary_y": True}]])
620
+ channel_sales_spends_fig.add_trace(go.Bar(
621
+ x=x,
622
+ y=_sales,
623
+ marker_color=color_palette[3], # You can choose a color from the palette
624
+ name='Revenue',
625
+ hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
626
+ ), secondary_y=False)
627
+
628
+ channel_sales_spends_fig.add_trace(go.Scatter(
629
+ x=x,
630
+ y=_spends,
631
+ line=dict(color=color_palette[2]), # You can choose another color from the palette
632
+ name='Spends',
633
+ hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}",
634
+ ), secondary_y=True)
635
+
636
+ channel_sales_spends_fig.update_layout(xaxis_title='Date', yaxis_title='Revenue', yaxis2_title='Spends ($)', title='Channel spends and Revenue week-wise')
 
 
637
  channel_sales_spends_fig.update_xaxes(showgrid=False)
638
  channel_sales_spends_fig.update_yaxes(showgrid=False)
639
  else:
640
+ raw_df = st.session_state['raw_df']
641
+ df = raw_df.sort_values(by='Date')
642
  x = df.Date
643
+ scenario = class_from_dict(st.session_state['default_scenario_dict'])
644
  _sales = scenario.constant + scenario.correction
645
  channel_sales_spends_fig = make_subplots(specs=[[{"secondary_y": True}]])
646
+ channel_sales_spends_fig.add_trace(go.Bar(
647
+ x=x,
648
+ y=_sales,
649
+ marker_color=color_palette[0], # You can choose a color from the palette
650
+ name='Revenue',
651
+ hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
652
+ ), secondary_y=False)
653
+
654
+ channel_sales_spends_fig.update_layout(xaxis_title='Date', yaxis_title='Revenue', yaxis2_title='Spends ($)', title='Channel spends and Revenue week-wise')
 
 
655
  channel_sales_spends_fig.update_xaxes(showgrid=False)
656
  channel_sales_spends_fig.update_yaxes(showgrid=False)
657
+
658
  return channel_sales_spends_fig
659
 
660
+ def format_numbers(value, n_decimals=1,include_indicator = True):
 
661
  if include_indicator:
662
+ return f'{CURRENCY_INDICATOR} {numerize(value,n_decimals)}'
663
  else:
664
+ return f'{numerize(value,n_decimals)}'
665
 
666
 
667
+ def decimal_formater(num_string,n_decimals=1):
668
+ parts = num_string.split('.')
669
  if len(parts) == 1:
670
+ return num_string+'.' + '0'*n_decimals
671
  else:
672
  to_be_padded = n_decimals - len(parts[-1])
673
+ if to_be_padded > 0 :
674
+ return num_string+'0'*to_be_padded
675
  else:
676
  return num_string
677
+
678
+
679
  def channel_name_formating(channel_name):
680
+ name_mod = channel_name.replace('_', ' ')
681
+ if name_mod.lower().endswith(' imp'):
682
+ name_mod = name_mod.replace('Imp','Spend')
683
+ elif name_mod.lower().endswith(' clicks'):
684
+ name_mod = name_mod.replace('Clicks','Spend')
685
  return name_mod
686
 
687
 
688
+ def send_email(email,message):
689
+ s = smtplib.SMTP('smtp.gmail.com', 587)
690
  s.starttls()
691
  s.login("geethu4444@gmail.com", "jgydhpfusuremcol")
692
  s.sendmail("geethu4444@gmail.com", email, message)
693
  s.quit()
694
 
 
695
  if __name__ == "__main__":
696
  initialize_data()