BlendMMM committed on
Commit
d31d208
1 Parent(s): 30c2b16

Rename pages/Data_Import.py to pages/Help.py

Files changed (2)
  1. pages/Data_Import.py +0 -891
  2. pages/Help.py +204 -0
pages/Data_Import.py DELETED
@@ -1,891 +0,0 @@
1
- # Importing necessary libraries
2
- import streamlit as st
3
-
4
- st.set_page_config(
5
- page_title="Model Build",
6
- page_icon=":shark:",
7
- layout="wide",
8
- initial_sidebar_state="collapsed",
9
- )
10
-
11
- import numpy as np
12
- import pandas as pd
13
- from utilities import set_header, load_local_css, load_authenticator
14
- import pickle
15
-
16
-
17
- load_local_css("styles.css")
18
- set_header()
19
-
20
- authenticator = st.session_state.get("authenticator")
21
- if authenticator is None:
22
- authenticator = load_authenticator()
23
-
24
- name, authentication_status, username = authenticator.login("Login", "main")
25
- auth_status = st.session_state.get("authentication_status")
26
-
27
- # Check for authentication status
28
- if auth_status != True:
29
- st.stop()
30
-
31
-
32
- # Function to validate date column in dataframe
33
- def validate_date_column(df):
34
- try:
35
- # Attempt to convert the 'Date' column to datetime
36
- df["date"] = pd.to_datetime(df["date"], format="%d-%m-%Y")
37
- return True
38
- except:
39
- return False
40
-
41
-
42
- # Function to determine data interval
43
- def determine_data_interval(common_freq):
44
- if common_freq == 1:
45
- return "daily"
46
- elif common_freq == 7:
47
- return "weekly"
48
- elif 28 <= common_freq <= 31:
49
- return "monthly"
50
- else:
51
- return "irregular"
52
-
53
-
54
- # Function to read each uploaded Excel file into a pandas DataFrame and stores them in a dictionary
55
- st.cache_resource(show_spinner=False)
56
-
57
-
58
- def files_to_dataframes(uploaded_files):
59
- df_dict = {}
60
- for uploaded_file in uploaded_files:
61
- # Extract file name without extension
62
- file_name = uploaded_file.name.rsplit(".", 1)[0]
63
-
64
- # Check for duplicate file names
65
- if file_name in df_dict:
66
- st.warning(
67
- f"Duplicate File: {file_name}. This file will be skipped.",
68
- icon="⚠️",
69
- )
70
- continue
71
-
72
- # Read the file into a DataFrame
73
- df = pd.read_excel(uploaded_file)
74
-
75
- # Convert all column names to lowercase
76
- df.columns = df.columns.str.lower().str.strip()
77
-
78
- # Separate numeric and non-numeric columns
79
- numeric_cols = list(df.select_dtypes(include=["number"]).columns)
80
- non_numeric_cols = [
81
- col
82
- for col in df.select_dtypes(exclude=["number"]).columns
83
- if col.lower() != "date"
84
- ]
85
-
86
- # Check for 'Date' column
87
- if not (validate_date_column(df) and len(numeric_cols) > 0):
88
- st.warning(
89
- f"File Name: {file_name} ➜ Please upload data with Date column in 'DD-MM-YYYY' format and at least one media/exogenous column. This file will be skipped.",
90
- icon="⚠️",
91
- )
92
- continue
93
-
94
- # Check for interval
95
- common_freq = (
96
- pd.Series(df["date"].unique()).diff().dt.days.dropna().mode()[0]
97
- )
98
- # Calculate the data interval (daily, weekly, monthly or irregular)
99
- interval = determine_data_interval(common_freq)
100
- if interval == "irregular":
101
- st.warning(
102
- f"File Name: {file_name} ➜ Please upload data in daily, weekly or monthly interval. This file will be skipped.",
103
- icon="⚠️",
104
- )
105
- continue
106
-
107
- # Store both DataFrames in the dictionary under their respective keys
108
- df_dict[file_name] = {
109
- "numeric": numeric_cols,
110
- "non_numeric": non_numeric_cols,
111
- "interval": interval,
112
- "df": df,
113
- }
114
-
115
- return df_dict
116
-
117
-
118
- # Function to adjust dataframe granularity
119
- # def adjust_dataframe_granularity(df, current_granularity, target_granularity):
120
- # # Set index
121
- # df.set_index("date", inplace=True)
122
-
123
- # # Define aggregation rules for resampling
124
- # aggregation_rules = {
125
- # col: "sum" if pd.api.types.is_numeric_dtype(df[col]) else "first"
126
- # for col in df.columns
127
- # }
128
-
129
- # resampled_df = df
130
- # if current_granularity == "daily" and target_granularity == "weekly":
131
- # resampled_df = df.resample("W-MON").agg(aggregation_rules)
132
-
133
- # elif current_granularity == "daily" and target_granularity == "monthly":
134
- # resampled_df = df.resample("MS").agg(aggregation_rules)
135
-
136
- # elif current_granularity == "daily" and target_granularity == "daily":
137
- # resampled_df = df.resample("D").agg(aggregation_rules)
138
-
139
- # elif current_granularity in ["weekly", "monthly"] and target_granularity == "daily":
140
- # # For higher to lower granularity, distribute numeric and replicate non-numeric values equally across the new period
141
- # expanded_data = []
142
- # for _, row in df.iterrows():
143
- # if current_granularity == "weekly":
144
- # period_range = pd.date_range(start=row.name, periods=7)
145
- # elif current_granularity == "monthly":
146
- # period_range = pd.date_range(
147
- # start=row.name, periods=row.name.days_in_month
148
- # )
149
-
150
- # for date in period_range:
151
- # new_row = {}
152
- # for col in df.columns:
153
- # if pd.api.types.is_numeric_dtype(df[col]):
154
- # if current_granularity == "weekly":
155
- # new_row[col] = row[col] / 7
156
- # elif current_granularity == "monthly":
157
- # new_row[col] = row[col] / row.name.days_in_month
158
- # else:
159
- # new_row[col] = row[col]
160
- # expanded_data.append((date, new_row))
161
-
162
- # resampled_df = pd.DataFrame(
163
- # [data for _, data in expanded_data],
164
- # index=[date for date, _ in expanded_data],
165
- # )
166
-
167
- # # Reset index
168
- # resampled_df = resampled_df.reset_index().rename(columns={"index": "date"})
169
-
170
- # return resampled_df
171
-
172
-
173
- def adjust_dataframe_granularity(df, current_granularity, target_granularity):
174
- # Set index
175
- df.set_index("date", inplace=True)
176
-
177
- # Define aggregation rules for resampling
178
- aggregation_rules = {
179
- col: "sum" if pd.api.types.is_numeric_dtype(df[col]) else "first"
180
- for col in df.columns
181
- }
182
-
183
- # Initialize resampled_df
184
- resampled_df = df
185
- if current_granularity == "daily" and target_granularity == "weekly":
186
- resampled_df = df.resample("W-MON", closed="left", label="left").agg(
187
- aggregation_rules
188
- )
189
-
190
- elif current_granularity == "daily" and target_granularity == "monthly":
191
- resampled_df = df.resample("MS", closed="left", label="left").agg(
192
- aggregation_rules
193
- )
194
-
195
- elif current_granularity == "daily" and target_granularity == "daily":
196
- resampled_df = df.resample("D").agg(aggregation_rules)
197
-
198
- elif current_granularity in ["weekly", "monthly"] and target_granularity == "daily":
199
- # For higher to lower granularity, distribute numeric and replicate non-numeric values equally across the new period
200
- expanded_data = []
201
- for _, row in df.iterrows():
202
- if current_granularity == "weekly":
203
- period_range = pd.date_range(start=row.name, periods=7)
204
- elif current_granularity == "monthly":
205
- period_range = pd.date_range(
206
- start=row.name, periods=row.name.days_in_month
207
- )
208
-
209
- for date in period_range:
210
- new_row = {}
211
- for col in df.columns:
212
- if pd.api.types.is_numeric_dtype(df[col]):
213
- if current_granularity == "weekly":
214
- new_row[col] = row[col] / 7
215
- elif current_granularity == "monthly":
216
- new_row[col] = row[col] / row.name.days_in_month
217
- else:
218
- new_row[col] = row[col]
219
- expanded_data.append((date, new_row))
220
-
221
- resampled_df = pd.DataFrame(
222
- [data for _, data in expanded_data],
223
- index=[date for date, _ in expanded_data],
224
- )
225
-
226
- # Reset index
227
- resampled_df = resampled_df.reset_index().rename(columns={"index": "date"})
228
-
229
- return resampled_df
230
-
231
-
232
- # Function to clean and extract unique values of DMA and Panel
233
- st.cache_resource(show_spinner=False)
234
-
235
-
236
- def clean_and_extract_unique_values(files_dict, selections):
237
- all_dma_values = set()
238
- all_panel_values = set()
239
-
240
- for file_name, file_data in files_dict.items():
241
- df = file_data["df"]
242
-
243
- # 'DMA' and 'Panel' selections
244
- selected_dma = selections[file_name].get("DMA")
245
- selected_panel = selections[file_name].get("Panel")
246
-
247
- # Clean and standardize DMA column if it exists and is selected
248
- if selected_dma and selected_dma != "N/A" and selected_dma in df.columns:
249
- df[selected_dma] = (
250
- df[selected_dma].str.lower().str.strip().str.replace("_", " ")
251
- )
252
- all_dma_values.update(df[selected_dma].dropna().unique())
253
-
254
- # Clean and standardize Panel column if it exists and is selected
255
- if selected_panel and selected_panel != "N/A" and selected_panel in df.columns:
256
- df[selected_panel] = (
257
- df[selected_panel].str.lower().str.strip().str.replace("_", " ")
258
- )
259
- all_panel_values.update(df[selected_panel].dropna().unique())
260
-
261
- # Update the processed DataFrame back in the dictionary
262
- files_dict[file_name]["df"] = df
263
-
264
- return all_dma_values, all_panel_values
265
-
266
-
267
- # Function to format values for display
268
- st.cache_resource(show_spinner=False)
269
-
270
-
271
- def format_values_for_display(values_list):
272
- # Capitalize the first letter of each word and replace underscores with spaces
273
- formatted_list = [value.replace("_", " ").title() for value in values_list]
274
- # Join values with commas and 'and' before the last value
275
- if len(formatted_list) > 1:
276
- return ", ".join(formatted_list[:-1]) + ", and " + formatted_list[-1]
277
- elif formatted_list:
278
- return formatted_list[0]
279
- return "No values available"
280
-
281
-
282
- # Function to normalizes all data within files_dict to a daily granularity
283
- st.cache(show_spinner=False, allow_output_mutation=True)
284
-
285
-
286
- def standardize_data_to_daily(files_dict, selections):
287
- # Normalize all data to a daily granularity using a provided function
288
- files_dict = apply_granularity_to_all(files_dict, "daily", selections)
289
-
290
- # Update the "interval" attribute for each dataset to indicate the new granularity
291
- for files_name, files_data in files_dict.items():
292
- files_data["interval"] = "daily"
293
-
294
- return files_dict
295
-
296
-
297
- # Function to apply granularity transformation to all DataFrames in files_dict
298
- st.cache_resource(show_spinner=False)
299
-
300
-
301
- def apply_granularity_to_all(files_dict, granularity_selection, selections):
302
- for file_name, file_data in files_dict.items():
303
- df = file_data["df"].copy()
304
-
305
- # Handling when DMA or Panel might be 'N/A'
306
- selected_dma = selections[file_name].get("DMA")
307
- selected_panel = selections[file_name].get("Panel")
308
-
309
- # Correcting the segment selection logic & handling 'N/A'
310
- if selected_dma != "N/A" and selected_panel != "N/A":
311
- unique_combinations = df[[selected_dma, selected_panel]].drop_duplicates()
312
- elif selected_dma != "N/A":
313
- unique_combinations = df[[selected_dma]].drop_duplicates()
314
- selected_panel = None # Ensure Panel is ignored if N/A
315
- elif selected_panel != "N/A":
316
- unique_combinations = df[[selected_panel]].drop_duplicates()
317
- selected_dma = None # Ensure DMA is ignored if N/A
318
- else:
319
- # If both are 'N/A', process the entire dataframe as is
320
- df = adjust_dataframe_granularity(
321
- df, file_data["interval"], granularity_selection
322
- )
323
- files_dict[file_name]["df"] = df
324
- continue # Skip to the next file
325
-
326
- transformed_segments = []
327
- for _, combo in unique_combinations.iterrows():
328
- if selected_dma and selected_panel:
329
- segment = df[
330
- (df[selected_dma] == combo[selected_dma])
331
- & (df[selected_panel] == combo[selected_panel])
332
- ]
333
- elif selected_dma:
334
- segment = df[df[selected_dma] == combo[selected_dma]]
335
- elif selected_panel:
336
- segment = df[df[selected_panel] == combo[selected_panel]]
337
-
338
- # Adjust granularity of the segment
339
- transformed_segment = adjust_dataframe_granularity(
340
- segment, file_data["interval"], granularity_selection
341
- )
342
- transformed_segments.append(transformed_segment)
343
-
344
- # Combine all transformed segments into a single DataFrame for this file
345
- transformed_df = pd.concat(transformed_segments, ignore_index=True)
346
- files_dict[file_name]["df"] = transformed_df
347
-
348
- return files_dict
349
-
350
-
351
- # Function to create main dataframe structure
352
- st.cache_resource(show_spinner=False)
353
-
354
-
355
- def create_main_dataframe(
356
- files_dict, all_dma_values, all_panel_values, granularity_selection
357
- ):
358
- # Determine the global start and end dates across all DataFrames
359
- global_start = min(df["df"]["date"].min() for df in files_dict.values())
360
- global_end = max(df["df"]["date"].max() for df in files_dict.values())
361
-
362
- # Adjust the date_range generation based on the granularity_selection
363
- if granularity_selection == "weekly":
364
- # Generate a weekly range, with weeks starting on Monday
365
- date_range = pd.date_range(start=global_start, end=global_end, freq="W-MON")
366
- elif granularity_selection == "monthly":
367
- # Generate a monthly range, starting from the first day of each month
368
- date_range = pd.date_range(start=global_start, end=global_end, freq="MS")
369
- else: # Default to daily if not weekly or monthly
370
- date_range = pd.date_range(start=global_start, end=global_end, freq="D")
371
-
372
- # Collect all unique DMA and Panel values, excluding 'N/A'
373
- all_dmas = all_dma_values
374
- all_panels = all_panel_values
375
-
376
- # Dynamically build the list of dimensions (Panel, DMA) to include in the main DataFrame based on availability
377
- dimensions, merge_keys = [], []
378
- if all_panels:
379
- dimensions.append(all_panels)
380
- merge_keys.append("Panel")
381
- if all_dmas:
382
- dimensions.append(all_dmas)
383
- merge_keys.append("DMA")
384
-
385
- dimensions.append(date_range) # Date range is always included
386
- merge_keys.append("date") # Date range is always included
387
-
388
- # Create a main DataFrame template with the dimensions
389
- main_df = pd.MultiIndex.from_product(
390
- dimensions,
391
- names=[name for name, _ in zip(merge_keys, dimensions)],
392
- ).to_frame(index=False)
393
-
394
- return main_df.reset_index(drop=True)
395
-
396
-
397
- # Function to prepare and merge dataFrames
398
- st.cache_resource(show_spinner=False)
399
-
400
-
401
- def merge_into_main_df(main_df, files_dict, selections):
402
- for file_name, file_data in files_dict.items():
403
- df = file_data["df"].copy()
404
-
405
- # Rename selected DMA and Panel columns if not 'N/A'
406
- selected_dma = selections[file_name].get("DMA", "N/A")
407
- selected_panel = selections[file_name].get("Panel", "N/A")
408
- if selected_dma != "N/A":
409
- df.rename(columns={selected_dma: "DMA"}, inplace=True)
410
- if selected_panel != "N/A":
411
- df.rename(columns={selected_panel: "Panel"}, inplace=True)
412
-
413
- # Merge current DataFrame into main_df based on 'date', and where applicable, 'Panel' and 'DMA'
414
- merge_keys = ["date"]
415
- if "Panel" in df.columns:
416
- merge_keys.append("Panel")
417
- if "DMA" in df.columns:
418
- merge_keys.append("DMA")
419
- main_df = pd.merge(main_df, df, on=merge_keys, how="left")
420
-
421
- # After all merges, sort by 'date' and reset index for cleanliness
422
- sort_by = ["date"]
423
- if "Panel" in main_df.columns:
424
- sort_by.append("Panel")
425
- if "DMA" in main_df.columns:
426
- sort_by.append("DMA")
427
- main_df.sort_values(by=sort_by, inplace=True)
428
- main_df.reset_index(drop=True, inplace=True)
429
-
430
- return main_df
431
-
432
-
433
- # Function to categorize column
434
- def categorize_column(column_name):
435
- # Define keywords for each category
436
- internal_keywords = [
437
- "Price",
438
- "Discount",
439
- "product_price",
440
- "cost",
441
- "margin",
442
- "inventory",
443
- "sales",
444
- "revenue",
445
- "turnover",
446
- "expense",
447
- ]
448
- exogenous_keywords = [
449
- "GDP",
450
- "Tax",
451
- "Inflation",
452
- "interest_rate",
453
- "employment_rate",
454
- "exchange_rate",
455
- "consumer_spending",
456
- "retail_sales",
457
- "oil_prices",
458
- "weather",
459
- ]
460
-
461
- # Check if the column name matches any of the keywords for Internal or Exogenous categories
462
- for keyword in internal_keywords:
463
- if keyword.lower() in column_name.lower():
464
- return "Internal"
465
- for keyword in exogenous_keywords:
466
- if keyword.lower() in column_name.lower():
467
- return "Exogenous"
468
-
469
- # Default to Media if no match found
470
- return "Media"
471
-
472
-
473
- # Function to calculate missing stats and prepare for editable DataFrame
474
- st.cache_resource(show_spinner=False)
475
-
476
-
477
- def prepare_missing_stats_df(df):
478
- missing_stats = []
479
- for column in df.columns:
480
- if (
481
- column == "date" or column == "DMA" or column == "Panel"
482
- ): # Skip Date, DMA and Panel column
483
- continue
484
-
485
- missing = df[column].isnull().sum()
486
- pct_missing = round((missing / len(df)) * 100, 2)
487
-
488
- # Dynamically assign category based on column name
489
- # category = categorize_column(column)
490
- category = "Media"
491
-
492
- missing_stats.append(
493
- {
494
- "Column": column,
495
- "Missing Values": missing,
496
- "Missing Percentage": pct_missing,
497
- "Impute Method": "Fill with 0", # Default value
498
- "Category": category,
499
- }
500
- )
501
- stats_df = pd.DataFrame(missing_stats)
502
-
503
- return stats_df
504
-
505
-
506
- # Function to add API DataFrame details to the files dictionary
507
- st.cache_resource(show_spinner=False)
508
-
509
-
510
- def add_api_dataframe_to_dict(main_df, files_dict):
511
- files_dict["API"] = {
512
- "numeric": list(main_df.select_dtypes(include=["number"]).columns),
513
- "non_numeric": [
514
- col
515
- for col in main_df.select_dtypes(exclude=["number"]).columns
516
- if col.lower() != "date"
517
- ],
518
- "interval": determine_data_interval(
519
- pd.Series(main_df["date"].unique()).diff().dt.days.dropna().mode()[0]
520
- ),
521
- "df": main_df,
522
- }
523
-
524
- return files_dict
525
-
526
-
527
- # Function to reads an API into a DataFrame, parsing specified columns as datetime
528
- @st.cache_resource(show_spinner=False)
529
- def read_API_data():
530
- return pd.read_excel(r"upf_data_converted.xlsx", parse_dates=["Date"])
531
-
532
-
533
- # Function to set the 'DMA_Panel_Selected' session state variable to False
534
- def set_DMA_Panel_Selected_false():
535
- st.session_state["DMA_Panel_Selected"] = False
536
-
537
-
538
- # Initialize 'final_df' in session state
539
- if "final_df" not in st.session_state:
540
- st.session_state["final_df"] = pd.DataFrame()
541
-
542
- # Initialize 'bin_dict' in session state
543
- if "bin_dict" not in st.session_state:
544
- st.session_state["bin_dict"] = {}
545
-
546
- # Initialize 'DMA_Panel_Selected' in session state
547
- if "DMA_Panel_Selected" not in st.session_state:
548
- st.session_state["DMA_Panel_Selected"] = False
549
-
550
- # Page Title
551
- st.write("") # Top padding
552
- st.title("Data Import")
553
-
554
-
555
- #########################################################################################################################################################
556
- # Create a dictionary to hold all DataFrames and collect user input to specify "DMA" and "Panel" columns for each file
557
- #########################################################################################################################################################
558
-
559
-
560
- # Read the Excel file, parsing 'Date' column as datetime
561
- main_df = read_API_data()
562
-
563
- # Convert all column names to lowercase
564
- main_df.columns = main_df.columns.str.lower().str.strip()
565
-
566
- # File uploader
567
- uploaded_files = st.file_uploader(
568
- "Upload additional data",
569
- type=["xlsx"],
570
- accept_multiple_files=True,
571
- on_change=set_DMA_Panel_Selected_false,
572
- )
573
-
574
- # Custom HTML for upload instructions
575
- recommendation_html = f"""
576
- <div style="text-align: justify;">
577
- <strong>Recommendation:</strong> For optimal processing, please ensure that all uploaded datasets including DMA, Panel, media, internal, and exogenous data adhere to the following guidelines: Each dataset must include a <code>Date</code> column formatted as <code>DD-MM-YYYY</code>, be free of missing values.
578
- </div>
579
- """
580
- st.markdown(recommendation_html, unsafe_allow_html=True)
581
-
582
- # Choose Date Granularity
583
- st.markdown("#### Choose Date Granularity")
584
- # Granularity Selection
585
- granularity_selection = st.selectbox(
586
- "Choose Date Granularity",
587
- ["Daily", "Weekly", "Monthly"],
588
- label_visibility="collapsed",
589
- on_change=set_DMA_Panel_Selected_false,
590
- )
591
- granularity_selection = str(granularity_selection).lower()
592
-
593
- # Convert files to dataframes
594
- files_dict = files_to_dataframes(uploaded_files)
595
-
596
- # Add API Dataframe
597
- if main_df is not None:
598
- files_dict = add_api_dataframe_to_dict(main_df, files_dict)
599
-
600
- # Display a warning message if no files have been uploaded and halt further execution
601
- if not files_dict:
602
- st.warning(
603
- "Please upload at least one file to proceed.",
604
- icon="⚠️",
605
- )
606
- st.stop() # Halts further execution until file is uploaded
607
-
608
-
609
- # Select DMA and Panel columns
610
- st.markdown("#### Select DMA and Panel columns")
611
- selections = {}
612
- with st.expander("Select DMA and Panel columns", expanded=False):
613
- count = 0 # Initialize counter to manage the visibility of labels and keys
614
- for file_name, file_data in files_dict.items():
615
- # Determine visibility of the label based on the count
616
- if count == 0:
617
- label_visibility = "visible"
618
- else:
619
- label_visibility = "collapsed"
620
-
621
- # Extract non-numeric columns
622
- non_numeric_cols = file_data["non_numeric"]
623
-
624
- # Prepare DMA and Panel values for dropdown, adding "N/A" as an option
625
- dma_values = non_numeric_cols + ["N/A"]
626
- panel_values = non_numeric_cols + ["N/A"]
627
-
628
- # Skip if only one option is available
629
- if len(dma_values) == 1 and len(panel_values) == 1:
630
- selected_dma, selected_panel = "N/A", "N/A"
631
- # Update the selections for DMA and Panel for the current file
632
- selections[file_name] = {
633
- "DMA": selected_dma,
634
- "Panel": selected_panel,
635
- }
636
- continue
637
-
638
- # Create layout columns for File Name, DMA, and Panel selections
639
- file_name_col, DMA_col, Panel_col = st.columns([2, 4, 4])
640
-
641
- with file_name_col:
642
- # Display "File Name" label only for the first file
643
- if count == 0:
644
- st.write("File Name")
645
- else:
646
- st.write("")
647
- st.write(file_name) # Display the file name
648
-
649
- with DMA_col:
650
- # Display a selectbox for DMA values
651
- selected_dma = st.selectbox(
652
- "Select DMA",
653
- dma_values,
654
- on_change=set_DMA_Panel_Selected_false,
655
- label_visibility=label_visibility, # Control visibility of the label
656
- key=f"DMA_selectbox{count}", # Ensure unique key for each selectbox
657
- )
658
-
659
- with Panel_col:
660
- # Display a selectbox for Panel values
661
- selected_panel = st.selectbox(
662
- "Select Panel",
663
- panel_values,
664
- on_change=set_DMA_Panel_Selected_false,
665
- label_visibility=label_visibility, # Control visibility of the label
666
- key=f"Panel_selectbox{count}", # Ensure unique key for each selectbox
667
- )
668
-
669
- # Skip processing if the same column is selected for both Panel and DMA due to potential data integrity issues
670
- if selected_panel == selected_dma and not (
671
- selected_panel == "N/A" and selected_dma == "N/A"
672
- ):
673
- st.warning(
674
- f"File: {file_name} → The same column cannot serve as both Panel and DMA. Please adjust your selections.",
675
- )
676
- selected_dma, selected_panel = "N/A", "N/A"
677
- st.stop()
678
-
679
- # Update the selections for DMA and Panel for the current file
680
- selections[file_name] = {
681
- "DMA": selected_dma,
682
- "Panel": selected_panel,
683
- }
684
-
685
- count += 1 # Increment the counter after processing each file
686
-
687
- # Accept DMA and Panel selection
688
- if st.button("Accept and Process", use_container_width=True):
689
-
690
- # Normalize all data to a daily granularity. This initial standardization simplifies subsequent conversions to other levels of granularity
691
- with st.spinner("Processing...", cache=True):
692
- files_dict = standardize_data_to_daily(files_dict, selections)
693
-
694
- # Convert all data to daily level granularity
695
- files_dict = apply_granularity_to_all(
696
- files_dict, granularity_selection, selections
697
- )
698
-
699
- st.session_state["files_dict"] = files_dict
700
- st.session_state["DMA_Panel_Selected"] = True
701
-
702
-
703
- #########################################################################################################################################################
704
- # Display unique DMA and Panel values
705
- #########################################################################################################################################################
706
-
707
-
708
- # Halts further execution until DMA and Panel columns are selected
709
- if "files_dict" in st.session_state and st.session_state["DMA_Panel_Selected"]:
710
- files_dict = st.session_state["files_dict"]
711
- else:
712
- st.stop()
713
-
714
- # Set to store unique values of DMA and Panel
715
- with st.spinner("Fetching DMA and Panel values..."):
716
- all_dma_values, all_panel_values = clean_and_extract_unique_values(
717
- files_dict, selections
718
- )
719
-
720
- # List of DMA and Panel columns unique values
721
- list_of_all_dma_values = list(all_dma_values)
722
- list_of_all_panel_values = list(all_panel_values)
723
-
724
- # Format DMA and Panel values for display
725
- formatted_dma_values = format_values_for_display(list_of_all_dma_values)
726
- formatted_panel_values = format_values_for_display(list_of_all_panel_values)
727
-
728
- # Unique DMA and Panel values
729
- st.markdown("#### Unique DMA and Panel values")
730
- # Display DMA and Panel values
731
- with st.expander("Unique DMA and Panel values"):
732
- st.write("")
733
- st.markdown(
734
- f"""
735
- <style>
736
- .justify-text {{
737
- text-align: justify;
738
- }}
739
- </style>
740
- <div class="justify-text">
741
- <strong>Panel Values:</strong> {formatted_panel_values}<br>
742
- <strong>DMA Values:</strong> {formatted_dma_values}
743
- </div>
744
- """,
745
- unsafe_allow_html=True,
746
- )
747
-
748
- # Display total DMA and Panel
749
- st.write("")
750
- st.markdown(
751
- f"""
752
- <div style="text-align: justify;">
753
- <strong>Number of DMAs detected:</strong> {len(list_of_all_dma_values)}<br>
754
- <strong>Number of Panels detected:</strong> {len(list_of_all_panel_values)}
755
- </div>
756
- """,
757
- unsafe_allow_html=True,
758
- )
759
- st.write("")
760
-
761
-
762
- #########################################################################################################################################################
763
- # Merge all DataFrames
764
- #########################################################################################################################################################
765
-
766
-
767
- # Merge all DataFrames selected
768
- main_df = create_main_dataframe(
769
- files_dict, all_dma_values, all_panel_values, granularity_selection
770
- )
771
- merged_df = merge_into_main_df(main_df, files_dict, selections)
772
-
773
- # # Display the merged DataFrame
774
- # st.markdown("#### Merged DataFrame based on selected DMA and Panel")
775
- # st.dataframe(merged_df)
776
-
777
-
778
- #########################################################################################################################################################
779
- # Categorize Variables and Impute Missing Values
780
- #########################################################################################################################################################
781
-
782
-
783
- # Create an editable DataFrame in Streamlit
784
- st.markdown("#### Select Variables Category & Impute Missing Values")
785
-
786
- # Prepare missing stats DataFrame for editing
787
- missing_stats_df = prepare_missing_stats_df(merged_df)
788
-
789
- edited_stats_df = st.data_editor(
790
- missing_stats_df,
791
- column_config={
792
- "Impute Method": st.column_config.SelectboxColumn(
793
- options=[
794
- "Drop Column",
795
- "Fill with Mean",
796
- "Fill with Median",
797
- "Fill with 0",
798
- ],
799
- required=True,
800
- default="Fill with 0",
801
- ),
802
- "Category": st.column_config.SelectboxColumn(
803
- options=[
804
- "Media",
805
- "Exogenous",
806
- "Internal",
807
- "Response_Metric"
808
- ],
809
- required=True,
810
- default="Media",
811
- ),
812
- },
813
- disabled=["Column", "Missing Values", "Missing Percentage"],
814
- hide_index=True,
815
- use_container_width=True,
816
- )
817
-
818
- # Apply changes based on edited DataFrame
819
- for i, row in edited_stats_df.iterrows():
820
- column = row["Column"]
821
- if row["Impute Method"] == "Drop Column":
822
- merged_df.drop(columns=[column], inplace=True)
823
-
824
- elif row["Impute Method"] == "Fill with Mean":
825
- merged_df[column].fillna(merged_df[column].mean(), inplace=True)
826
-
827
- elif row["Impute Method"] == "Fill with Median":
828
- merged_df[column].fillna(merged_df[column].median(), inplace=True)
829
-
830
- elif row["Impute Method"] == "Fill with 0":
831
- merged_df[column].fillna(0, inplace=True)
832
-
833
- # Display the Final DataFrame and exogenous variables
834
- st.markdown("#### Final DataFrame")
835
- final_df = merged_df
836
- st.dataframe(final_df, hide_index=True)
837
-
838
- # Initialize an empty dictionary to hold categories and their variables
839
- category_dict = {}
840
-
841
- # Iterate over each row in the edited DataFrame to populate the dictionary
842
- for i, row in edited_stats_df.iterrows():
843
- column = row["Column"]
844
- category = row["Category"] # The category chosen by the user for this variable
845
-
846
- # Check if the category already exists in the dictionary
847
- if category not in category_dict:
848
- # If not, initialize it with the current column as its first element
849
- category_dict[category] = [column]
850
- else:
851
- # If it exists, append the current column to the list of variables under this category
852
- category_dict[category].append(column)
853
-
854
- # Add Date, DMA and Panel in category dictionary
855
- category_dict.update({"Date": ["date"]})
856
- if "DMA" in final_df.columns:
857
- category_dict["DMA"] = ["DMA"]
858
-
859
- if "Panel" in final_df.columns:
860
- category_dict["Panel"] = ["Panel"]
861
-
862
- # Display the dictionary
863
- st.markdown("#### Variable Category")
864
- for category, variables in category_dict.items():
865
- # Check if there are multiple variables to handle "and" insertion correctly
866
- if len(variables) > 1:
867
- # Join all but the last variable with ", ", then add " and " before the last variable
868
- variables_str = ", ".join(variables[:-1]) + " and " + variables[-1]
869
- else:
870
- # If there's only one variable, no need for "and"
871
- variables_str = variables[0]
872
-
873
- # Display the category and its variables in the desired format
874
- st.markdown(
875
- f"<div style='text-align: justify;'><strong>{category}:</strong> {variables_str}</div>",
876
- unsafe_allow_html=True,
877
- )
878
-
879
- # Store final dataframe and bin dictionary into session state
880
- st.session_state["final_df"], st.session_state["bin_dict"] = final_df, category_dict
881
-
882
- if st.button('Save Changes'):
883
-
884
- with open("Pickle_files/main_df", 'wb') as f:
885
- pickle.dump(st.session_state["final_df"], f)
886
- with open("Pickle_files/category_dict",'wb') as c:
887
- pickle.dump(st.session_state["bin_dict"],c)
888
- st.success('Changes Saved!')
889
-
890
-
891
-
pages/Help.py ADDED
@@ -0,0 +1,204 @@
1
+ # Importing necessary libraries
2
+ import streamlit as st
3
+
4
+ st.set_page_config(
5
+ page_title="Model Build",
6
+ page_icon=":shark:",
7
+ layout="wide",
8
+ initial_sidebar_state="collapsed",
9
+ )
10
+
11
+ import numpy as np
12
+ import pandas as pd
13
+ from utilities import set_header, load_local_css, load_authenticator
14
+ import pickle
15
+ import base64
16
+
17
+ load_local_css("styles.css")
18
+ set_header()
19
+
20
+
21
+ st.header('MASTERCARD MMO TOOL')
22
+
23
+ st.subheader('Overview')
24
+
25
+ st.markdown('The tool was developed in accordance with best practices for building marketing mix models across different clients and businesses. It can be used to build various MMM models, optimize spends, and run simulations.')
26
+
27
+ st.markdown('Last Updated: 3/26/2024')
28
+ user = st.radio("Select User", ['Data Scientist', 'Media Planner'], horizontal=True, index=1)
29
+
30
+
31
+
32
+ if user == 'Data Scientist':
33
+
34
+ with st.expander('**Data Import**'):
35
+
36
+ st.markdown("""
37
+
38
+ The Data Import page allows users to bring in any additional data beyond what is fetched through APIs and processed by the Data Engineering pipelines, and to standardize both API and non-API data to the desired end granularity. It offers options for feature engineering, variable grouping, and data imputation, and it provides a comprehensive summary of all actions performed on the page, ensuring a transparent and efficient data preparation process.
39
+
40
+ **Features:**
41
+ - **Categorization:** Allows for the categorization of similar variables for streamlined analysis.
42
+ - **Feature Engineering:** Enables the creation and grouping of columns to build better models.
43
+ - **Data Imputation:** Provides methods to fill in missing or incomplete data values.
44
+ """)
45
+
46
+ with st.expander('**Data Validation**'):
47
+
48
+ st.markdown("""
49
+
50
+ This page is designed to enhance data quality and insights, focusing on the selected targets and panels. It offers Response Metric Analysis, Univariate and Bivariate Reports, and Media and Non-Media Variables Analysis. Correlation is also explored to ensure a thorough validation process.
51
+
52
+ **Features:**
53
+ - **Response Metric Analysis:** Evaluates the performance metrics and trends relevant to the selected target and panel.
54
+ - **Univariate and Bivariate Report:** Offers a detailed univariate and bivariate report.
55
+ - **Variables Analysis:** Evaluates the significance of media and non-media variables for the chosen target/panel and validates the variables to be considered for the next steps.
56
+ - **Correlation Analysis:** Utilizes correlation plots to reveal the relationships between variables.
57
+ """)
58
+
59
+ with st.expander('**Transformation**'):
60
+
61
+
62
+ st.markdown("""
63
+
64
+ Transformation capabilities include Media and Exogenous Transformations such as lag, lead, moving average, power, saturation, and adstock adjustments. This page not only applies these transformations but also summarizes the process and the order of operations, providing clarity on the data's manipulation.
65
+
66
+ **Features:**
67
+ - **Transformations:** Applies specific alterations (lag, lead, moving average, power, saturation, and adstock) to media and exogenous variables to enhance model performance.
68
+ - **Summarization of Transformation:** Provides a detailed overview of all transformations applied, including their sequence.
69
+ """)
70
+
71
+ with st.expander('**Build Model**'):
72
+
73
+ st.markdown("""
74
+
75
+ This feature integrates transformation creation with model building for both panel and non-panel levels. It streamlines the process, making it easier for users to construct and refine their models based on the transformed data. After building models, this page assists in selecting the best fit and provides a detailed summary. It includes comparisons of Actual vs. Predicted outcomes, Residual Analysis, and Variance Inflation Factor (VIF) for both test and train datasets.
76
+
77
+ **Features:**
78
+ - **Diverse Models:** Constructs models for various metrics using OLS and mixed effect models.
79
+ - **Model Selection:** Chooses the most significant model utilizing metrics such as coefficients, P-value, R Square, Adjusted R Square, and MAPE.
80
+ """)
81
+
82
+ with st.expander('**Model Tuning**'):
83
+
84
+ st.markdown("""
85
+
86
+ Model Tuning offers advanced options such as Event Flags, Trends, and Cyclical Patterns modeled with sine and cosine waves. These features help refine the model by accounting for specific events, long-term trends, and seasonal patterns.
87
+
88
+ **Features:**
89
+ - **Event Flags:** Incorporates the effect of specific events on the target.
90
+ - **Trends:** Incorporates long-term trends and seasonality.
91
+ - **Cyclical Pattern:** Utilizes sine and cosine waves to capture and adjust for seasonal variations.
92
+ - **Contributions Analysis: Calculates contributions from the tuned model for each media channel
93
+ """)
94
+
95
+ with st.expander("**Save Model Results**"):
96
+
97
+ st.markdown("""
98
+
99
+ This page saves the model's outcomes, including channel contributions, an EDA report, and a thorough Analysis of Model Results. It's designed to document and preserve the work done for future reference.
100
+
101
+ **Features:**
102
+ - **Channel Contribution:** Details the impact of each media channel on outcomes.
103
+ - **EDA Report:** Provides an exploratory data analysis summary.
104
+ - **Analysis of Model Results:** Offers a comprehensive review of the model's performance.
105
+ """)
106
+
107
+ with st.expander('**Model Results Overview**'):
108
+
109
+ st.markdown("""
110
+
111
+ This section provides a comprehensive overview of historical spending, including channel-wise spends, revenue, ROI, and weekly contributions. It also details channel spends and revenue on a week-by-week basis, offering a granular look at financial performance.
112
+
113
+ **Features:**
114
+ - **Spends Analysis:** Breaks down channel-wise spend and revenue.
115
+ - **ROI and Contributions:** Evaluates return on investment and weekly/aggregated channel performance.
116
+ """)
117
+
118
+ with st.expander('**Build Response Curves**'):
119
+
120
+ st.markdown("""
121
+
122
+ This page updates response curves and allows for testing and saving these fits. It's essential for understanding how different levels of spending affect outcomes and for refining marketing strategies.
123
+
124
+ **Features:**
125
+ - **Response Curve Update:** Allows for the modification and refinement of response curves.
126
+ - **Curve Testing and Saving:** Facilitates the evaluation of curve fits and preserves/download curve parameters.
127
+ """)
128
+
129
+ with st.expander('**Scenario Planner**'):
130
+
131
+ st.markdown("""
132
+
133
+ The Scenario Planner page enables forward and reverse optimization, allowing users to either maximize targets given certain spends or minimize spends given a target revenue. It includes responsive S curves for multiple channels and optimizes them accordingly.
134
+
135
+ **Features:**
136
+ - **Optimization:** Supports both forward and reverse financial planning, with adjustments based on percentage or actual spend values.
137
+ - **Channel Focused:** Enables optimization based on specific media channels for focused strategies.
138
+ - **Responsive S Curves:** Showcases real-time, responsive S curves that highlight regions indicating underinvestment, optimal spending, and areas of potential overinvestment.
139
+ - **Dynamic Visualization:** Provides ROI/MROI for in-depth analysis and immediate scenario feedback.
140
+ """)
141
+ with st.expander("**Saved Scenarios**"):
142
+
143
+ st.markdown("""
144
+
145
+ Users can save, load, download, and delete scenarios involving spends, ROI, and MROI for both actual and simulated cases. This feature offers flexibility in scenario management and comparison.
146
+
147
+ **Features:**
148
+ - **Optimized Results Visualization:** Displays the outcomes of optimization, highlighting effectiveness and efficiency in scenario planning.
149
+ - **Effectiveness and Efficiency Analysis:** Provides detailed visual insights into how optimization impacts campaign performance and resource utilization.
150
+ """)
151
+
152
+ with st.expander("**Optimized Result Analysis**"):
153
+
154
+ st.markdown("""
155
+
156
+ This analysis page gives an overview of optimized spends (actual, planned, and delta), budget allocation (% split and planned spends), and forecasts on response and return by media channel. It's designed to provide insights into the efficiency and effectiveness of optimized media spending strategies.
157
+
158
+ **Features:**
159
+ - **Optimized Spends Overview:** Compares actual, planned, and delta spends.
160
+ - **Budget Allocation Analysis:** Breaks down the percentage split and planned expenditures.
161
+ """)
162
+
163
+ if user == 'Media Planner':
164
+
165
+ with st.expander('**Scenario Planner**'):
166
+
167
+ st.markdown("""
168
+
169
+ The Scenario Planner page enables forward and reverse optimization, allowing users to either maximize targets given certain spends or minimize spends given a target revenue. It includes responsive S curves for multiple channels and optimizes them accordingly.
170
+
171
+ **Features:**
172
+ - **Optimization:** Supports both forward and reverse financial planning, with adjustments based on percentage or actual spend values.
173
+ - **Channel Focused:** Enables optimization based on specific media channels for focused strategies.
174
+ - **Responsive S Curves:** Showcases real-time, responsive S curves that highlight regions indicating underinvestment, optimal spending, and areas of potential overinvestment.
175
+ - **Dynamic Visualization:** Provides ROI/MROI for in-depth analysis and immediate scenario feedback.
176
+ """)
177
+ with st.expander("**Saved Scenario**"):
178
+
179
+ st.markdown("""
180
+
181
+ Users can save, load, download, and delete scenarios involving spends, ROI, and MROI for both actual and simulated cases. This feature offers flexibility in scenario management and comparison.
182
+
183
+ **Features:**
184
+ - **Optimized Results Visualization:** Displays the outcomes of optimization, highlighting effectiveness and efficiency in scenario planning.
185
+ - **Effectiveness and Efficiency Analysis:** Provides detailed visual insights into how optimization impacts campaign performance and resource utilization.
186
+ """)
187
+
188
+ with st.expander("**Optimized Result Analysis**"):
189
+
190
+ st.markdown("""
191
+
192
+ This analysis page gives an overview of optimized spends (actual, planned, and delta), budget allocation (% split and planned spends), and forecasts on response and return by media channel. It's designed to provide insights into the efficiency and effectiveness of optimized media spending strategies.
193
+
194
+ **Features:**
195
+ - **Optimized Spends Overview:** Compares actual, planned, and delta spends.
196
+ - **Budget Allocation Analysis:** Breaks down the percentage split and planned expenditures.
197
+ """)
198
+
199
+
200
+ with open("MMM Tool Description.docx", "rb") as file:
201
+ word_content = file.read()
202
+ b64 = base64.b64encode(word_content).decode()
203
+ href = f'<a href="data:application/octet-stream;base64,{b64}" download="document.docx">Download Document</a>'
204
+ st.markdown(href, unsafe_allow_html=True)