Rename pages/Data_Import.py to pages/Help.py
- pages/Data_Import.py  +0  -891
- pages/Help.py  +204  -0
pages/Data_Import.py
DELETED
@@ -1,891 +0,0 @@
# Importing necessary libraries
import streamlit as st

st.set_page_config(
    page_title="Model Build",
    page_icon=":shark:",
    layout="wide",
    initial_sidebar_state="collapsed",
)

import numpy as np
import pandas as pd
from utilities import set_header, load_local_css, load_authenticator
import pickle


load_local_css("styles.css")
set_header()

authenticator = st.session_state.get("authenticator")
if authenticator is None:
    authenticator = load_authenticator()

name, authentication_status, username = authenticator.login("Login", "main")
auth_status = st.session_state.get("authentication_status")

# Check for authentication status
if auth_status != True:
    st.stop()


# Function to validate date column in dataframe
def validate_date_column(df):
    try:
        # Attempt to convert the 'Date' column to datetime
        df["date"] = pd.to_datetime(df["date"], format="%d-%m-%Y")
        return True
    except:
        return False


# Function to determine data interval
def determine_data_interval(common_freq):
    if common_freq == 1:
        return "daily"
    elif common_freq == 7:
        return "weekly"
    elif 28 <= common_freq <= 31:
        return "monthly"
    else:
        return "irregular"


# Function to read each uploaded Excel file into a pandas DataFrame and stores them in a dictionary
st.cache_resource(show_spinner=False)


def files_to_dataframes(uploaded_files):
    df_dict = {}
    for uploaded_file in uploaded_files:
        # Extract file name without extension
        file_name = uploaded_file.name.rsplit(".", 1)[0]

        # Check for duplicate file names
        if file_name in df_dict:
            st.warning(
                f"Duplicate File: {file_name}. This file will be skipped.",
                icon="⚠️",
            )
            continue

        # Read the file into a DataFrame
        df = pd.read_excel(uploaded_file)

        # Convert all column names to lowercase
        df.columns = df.columns.str.lower().str.strip()

        # Separate numeric and non-numeric columns
        numeric_cols = list(df.select_dtypes(include=["number"]).columns)
        non_numeric_cols = [
            col
            for col in df.select_dtypes(exclude=["number"]).columns
            if col.lower() != "date"
        ]

        # Check for 'Date' column
        if not (validate_date_column(df) and len(numeric_cols) > 0):
            st.warning(
                f"File Name: {file_name} ➜ Please upload data with Date column in 'DD-MM-YYYY' format and at least one media/exogenous column. This file will be skipped.",
                icon="⚠️",
            )
            continue

        # Check for interval
        common_freq = common_freq = (
            pd.Series(df["date"].unique()).diff().dt.days.dropna().mode()[0]
        )
        # Calculate the data interval (daily, weekly, monthly or irregular)
        interval = determine_data_interval(common_freq)
        if interval == "irregular":
            st.warning(
                f"File Name: {file_name} ➜ Please upload data in daily, weekly or monthly interval. This file will be skipped.",
                icon="⚠️",
            )
            continue

        # Store both DataFrames in the dictionary under their respective keys
        df_dict[file_name] = {
            "numeric": numeric_cols,
            "non_numeric": non_numeric_cols,
            "interval": interval,
            "df": df,
        }

    return df_dict


# Function to adjust dataframe granularity
# def adjust_dataframe_granularity(df, current_granularity, target_granularity):
#     # Set index
#     df.set_index("date", inplace=True)

#     # Define aggregation rules for resampling
#     aggregation_rules = {
#         col: "sum" if pd.api.types.is_numeric_dtype(df[col]) else "first"
#         for col in df.columns
#     }

#     resampled_df = df
#     if current_granularity == "daily" and target_granularity == "weekly":
#         resampled_df = df.resample("W-MON").agg(aggregation_rules)

#     elif current_granularity == "daily" and target_granularity == "monthly":
#         resampled_df = df.resample("MS").agg(aggregation_rules)

#     elif current_granularity == "daily" and target_granularity == "daily":
#         resampled_df = df.resample("D").agg(aggregation_rules)

#     elif current_granularity in ["weekly", "monthly"] and target_granularity == "daily":
#         # For higher to lower granularity, distribute numeric and replicate non-numeric values equally across the new period
#         expanded_data = []
#         for _, row in df.iterrows():
#             if current_granularity == "weekly":
#                 period_range = pd.date_range(start=row.name, periods=7)
#             elif current_granularity == "monthly":
#                 period_range = pd.date_range(
#                     start=row.name, periods=row.name.days_in_month
#                 )

#             for date in period_range:
#                 new_row = {}
#                 for col in df.columns:
#                     if pd.api.types.is_numeric_dtype(df[col]):
#                         if current_granularity == "weekly":
#                             new_row[col] = row[col] / 7
#                         elif current_granularity == "monthly":
#                             new_row[col] = row[col] / row.name.days_in_month
#                     else:
#                         new_row[col] = row[col]
#                 expanded_data.append((date, new_row))

#         resampled_df = pd.DataFrame(
#             [data for _, data in expanded_data],
#             index=[date for date, _ in expanded_data],
#         )

#     # Reset index
#     resampled_df = resampled_df.reset_index().rename(columns={"index": "date"})

#     return resampled_df


def adjust_dataframe_granularity(df, current_granularity, target_granularity):
    # Set index
    df.set_index("date", inplace=True)

    # Define aggregation rules for resampling
    aggregation_rules = {
        col: "sum" if pd.api.types.is_numeric_dtype(df[col]) else "first"
        for col in df.columns
    }

    # Initialize resampled_df
    resampled_df = df
    if current_granularity == "daily" and target_granularity == "weekly":
        resampled_df = df.resample("W-MON", closed="left", label="left").agg(
            aggregation_rules
        )

    elif current_granularity == "daily" and target_granularity == "monthly":
        resampled_df = df.resample("MS", closed="left", label="left").agg(
            aggregation_rules
        )

    elif current_granularity == "daily" and target_granularity == "daily":
        resampled_df = df.resample("D").agg(aggregation_rules)

    elif current_granularity in ["weekly", "monthly"] and target_granularity == "daily":
        # For higher to lower granularity, distribute numeric and replicate non-numeric values equally across the new period
        expanded_data = []
        for _, row in df.iterrows():
            if current_granularity == "weekly":
                period_range = pd.date_range(start=row.name, periods=7)
            elif current_granularity == "monthly":
                period_range = pd.date_range(
                    start=row.name, periods=row.name.days_in_month
                )

            for date in period_range:
                new_row = {}
                for col in df.columns:
                    if pd.api.types.is_numeric_dtype(df[col]):
                        if current_granularity == "weekly":
                            new_row[col] = row[col] / 7
                        elif current_granularity == "monthly":
                            new_row[col] = row[col] / row.name.days_in_month
                    else:
                        new_row[col] = row[col]
                expanded_data.append((date, new_row))

        resampled_df = pd.DataFrame(
            [data for _, data in expanded_data],
            index=[date for date, _ in expanded_data],
        )

    # Reset index
    resampled_df = resampled_df.reset_index().rename(columns={"index": "date"})

    return resampled_df


# Function to clean and extract unique values of DMA and Panel
st.cache_resource(show_spinner=False)


def clean_and_extract_unique_values(files_dict, selections):
    all_dma_values = set()
    all_panel_values = set()

    for file_name, file_data in files_dict.items():
        df = file_data["df"]

        # 'DMA' and 'Panel' selections
        selected_dma = selections[file_name].get("DMA")
        selected_panel = selections[file_name].get("Panel")

        # Clean and standardize DMA column if it exists and is selected
        if selected_dma and selected_dma != "N/A" and selected_dma in df.columns:
            df[selected_dma] = (
                df[selected_dma].str.lower().str.strip().str.replace("_", " ")
            )
            all_dma_values.update(df[selected_dma].dropna().unique())

        # Clean and standardize Panel column if it exists and is selected
        if selected_panel and selected_panel != "N/A" and selected_panel in df.columns:
            df[selected_panel] = (
                df[selected_panel].str.lower().str.strip().str.replace("_", " ")
            )
            all_panel_values.update(df[selected_panel].dropna().unique())

        # Update the processed DataFrame back in the dictionary
        files_dict[file_name]["df"] = df

    return all_dma_values, all_panel_values


# Function to format values for display
st.cache_resource(show_spinner=False)


def format_values_for_display(values_list):
    # Capitalize the first letter of each word and replace underscores with spaces
    formatted_list = [value.replace("_", " ").title() for value in values_list]
    # Join values with commas and 'and' before the last value
    if len(formatted_list) > 1:
        return ", ".join(formatted_list[:-1]) + ", and " + formatted_list[-1]
    elif formatted_list:
        return formatted_list[0]
    return "No values available"


# Function to normalizes all data within files_dict to a daily granularity
st.cache(show_spinner=False, allow_output_mutation=True)


def standardize_data_to_daily(files_dict, selections):
    # Normalize all data to a daily granularity using a provided function
    files_dict = apply_granularity_to_all(files_dict, "daily", selections)

    # Update the "interval" attribute for each dataset to indicate the new granularity
    for files_name, files_data in files_dict.items():
        files_data["interval"] = "daily"

    return files_dict


# Function to apply granularity transformation to all DataFrames in files_dict
st.cache_resource(show_spinner=False)


def apply_granularity_to_all(files_dict, granularity_selection, selections):
    for file_name, file_data in files_dict.items():
        df = file_data["df"].copy()

        # Handling when DMA or Panel might be 'N/A'
        selected_dma = selections[file_name].get("DMA")
        selected_panel = selections[file_name].get("Panel")

        # Correcting the segment selection logic & handling 'N/A'
        if selected_dma != "N/A" and selected_panel != "N/A":
            unique_combinations = df[[selected_dma, selected_panel]].drop_duplicates()
        elif selected_dma != "N/A":
            unique_combinations = df[[selected_dma]].drop_duplicates()
            selected_panel = None  # Ensure Panel is ignored if N/A
        elif selected_panel != "N/A":
            unique_combinations = df[[selected_panel]].drop_duplicates()
            selected_dma = None  # Ensure DMA is ignored if N/A
        else:
            # If both are 'N/A', process the entire dataframe as is
            df = adjust_dataframe_granularity(
                df, file_data["interval"], granularity_selection
            )
            files_dict[file_name]["df"] = df
            continue  # Skip to the next file

        transformed_segments = []
        for _, combo in unique_combinations.iterrows():
            if selected_dma and selected_panel:
                segment = df[
                    (df[selected_dma] == combo[selected_dma])
                    & (df[selected_panel] == combo[selected_panel])
                ]
            elif selected_dma:
                segment = df[df[selected_dma] == combo[selected_dma]]
            elif selected_panel:
                segment = df[df[selected_panel] == combo[selected_panel]]

            # Adjust granularity of the segment
            transformed_segment = adjust_dataframe_granularity(
                segment, file_data["interval"], granularity_selection
            )
            transformed_segments.append(transformed_segment)

        # Combine all transformed segments into a single DataFrame for this file
        transformed_df = pd.concat(transformed_segments, ignore_index=True)
        files_dict[file_name]["df"] = transformed_df

    return files_dict


# Function to create main dataframe structure
st.cache_resource(show_spinner=False)


def create_main_dataframe(
    files_dict, all_dma_values, all_panel_values, granularity_selection
):
    # Determine the global start and end dates across all DataFrames
    global_start = min(df["df"]["date"].min() for df in files_dict.values())
    global_end = max(df["df"]["date"].max() for df in files_dict.values())

    # Adjust the date_range generation based on the granularity_selection
    if granularity_selection == "weekly":
        # Generate a weekly range, with weeks starting on Monday
        date_range = pd.date_range(start=global_start, end=global_end, freq="W-MON")
    elif granularity_selection == "monthly":
        # Generate a monthly range, starting from the first day of each month
        date_range = pd.date_range(start=global_start, end=global_end, freq="MS")
    else:  # Default to daily if not weekly or monthly
        date_range = pd.date_range(start=global_start, end=global_end, freq="D")

    # Collect all unique DMA and Panel values, excluding 'N/A'
    all_dmas = all_dma_values
    all_panels = all_panel_values

    # Dynamically build the list of dimensions (Panel, DMA) to include in the main DataFrame based on availability
    dimensions, merge_keys = [], []
    if all_panels:
        dimensions.append(all_panels)
        merge_keys.append("Panel")
    if all_dmas:
        dimensions.append(all_dmas)
        merge_keys.append("DMA")

    dimensions.append(date_range)  # Date range is always included
    merge_keys.append("date")  # Date range is always included

    # Create a main DataFrame template with the dimensions
    main_df = pd.MultiIndex.from_product(
        dimensions,
        names=[name for name, _ in zip(merge_keys, dimensions)],
    ).to_frame(index=False)

    return main_df.reset_index(drop=True)


# Function to prepare and merge dataFrames
st.cache_resource(show_spinner=False)


def merge_into_main_df(main_df, files_dict, selections):
    for file_name, file_data in files_dict.items():
        df = file_data["df"].copy()

        # Rename selected DMA and Panel columns if not 'N/A'
        selected_dma = selections[file_name].get("DMA", "N/A")
        selected_panel = selections[file_name].get("Panel", "N/A")
        if selected_dma != "N/A":
            df.rename(columns={selected_dma: "DMA"}, inplace=True)
        if selected_panel != "N/A":
            df.rename(columns={selected_panel: "Panel"}, inplace=True)

        # Merge current DataFrame into main_df based on 'date', and where applicable, 'Panel' and 'DMA'
        merge_keys = ["date"]
        if "Panel" in df.columns:
            merge_keys.append("Panel")
        if "DMA" in df.columns:
            merge_keys.append("DMA")
        main_df = pd.merge(main_df, df, on=merge_keys, how="left")

    # After all merges, sort by 'date' and reset index for cleanliness
    sort_by = ["date"]
    if "Panel" in main_df.columns:
        sort_by.append("Panel")
    if "DMA" in main_df.columns:
        sort_by.append("DMA")
    main_df.sort_values(by=sort_by, inplace=True)
    main_df.reset_index(drop=True, inplace=True)

    return main_df


# Function to categorize column
def categorize_column(column_name):
    # Define keywords for each category
    internal_keywords = [
        "Price",
        "Discount",
        "product_price",
        "cost",
        "margin",
        "inventory",
        "sales",
        "revenue",
        "turnover",
        "expense",
    ]
    exogenous_keywords = [
        "GDP",
        "Tax",
        "Inflation",
        "interest_rate",
        "employment_rate",
        "exchange_rate",
        "consumer_spending",
        "retail_sales",
        "oil_prices",
        "weather",
    ]

    # Check if the column name matches any of the keywords for Internal or Exogenous categories
    for keyword in internal_keywords:
        if keyword.lower() in column_name.lower():
            return "Internal"
    for keyword in exogenous_keywords:
        if keyword.lower() in column_name.lower():
            return "Exogenous"

    # Default to Media if no match found
    return "Media"


# Function to calculate missing stats and prepare for editable DataFrame
st.cache_resource(show_spinner=False)


def prepare_missing_stats_df(df):
    missing_stats = []
    for column in df.columns:
        if (
            column == "date" or column == "DMA" or column == "Panel"
        ):  # Skip Date, DMA and Panel column
            continue

        missing = df[column].isnull().sum()
        pct_missing = round((missing / len(df)) * 100, 2)

        # Dynamically assign category based on column name
        # category = categorize_column(column)
        category = "Media"

        missing_stats.append(
            {
                "Column": column,
                "Missing Values": missing,
                "Missing Percentage": pct_missing,
                "Impute Method": "Fill with 0",  # Default value
                "Category": category,
            }
        )
    stats_df = pd.DataFrame(missing_stats)

    return stats_df


# Function to add API DataFrame details to the files dictionary
st.cache_resource(show_spinner=False)


def add_api_dataframe_to_dict(main_df, files_dict):
    files_dict["API"] = {
        "numeric": list(main_df.select_dtypes(include=["number"]).columns),
        "non_numeric": [
            col
            for col in main_df.select_dtypes(exclude=["number"]).columns
            if col.lower() != "date"
        ],
        "interval": determine_data_interval(
            pd.Series(main_df["date"].unique()).diff().dt.days.dropna().mode()[0]
        ),
        "df": main_df,
    }

    return files_dict


# Function to reads an API into a DataFrame, parsing specified columns as datetime
@st.cache_resource(show_spinner=False)
def read_API_data():
    return pd.read_excel(r"upf_data_converted.xlsx", parse_dates=["Date"])


# Function to set the 'DMA_Panel_Selected' session state variable to False
def set_DMA_Panel_Selected_false():
    st.session_state["DMA_Panel_Selected"] = False


# Initialize 'final_df' in session state
if "final_df" not in st.session_state:
    st.session_state["final_df"] = pd.DataFrame()

# Initialize 'bin_dict' in session state
if "bin_dict" not in st.session_state:
    st.session_state["bin_dict"] = {}

# Initialize 'DMA_Panel_Selected' in session state
if "DMA_Panel_Selected" not in st.session_state:
    st.session_state["DMA_Panel_Selected"] = False

# Page Title
st.write("")  # Top padding
st.title("Data Import")


#########################################################################################################################################################
# Create a dictionary to hold all DataFrames and collect user input to specify "DMA" and "Panel" columns for each file
#########################################################################################################################################################


# Read the Excel file, parsing 'Date' column as datetime
main_df = read_API_data()

# Convert all column names to lowercase
main_df.columns = main_df.columns.str.lower().str.strip()

# File uploader
uploaded_files = st.file_uploader(
    "Upload additional data",
    type=["xlsx"],
    accept_multiple_files=True,
    on_change=set_DMA_Panel_Selected_false,
)

# Custom HTML for upload instructions
recommendation_html = f"""
<div style="text-align: justify;">
<strong>Recommendation:</strong> For optimal processing, please ensure that all uploaded datasets including DMA, Panel, media, internal, and exogenous data adhere to the following guidelines: Each dataset must include a <code>Date</code> column formatted as <code>DD-MM-YYYY</code>, be free of missing values.
</div>
"""
st.markdown(recommendation_html, unsafe_allow_html=True)

# Choose Date Granularity
st.markdown("#### Choose Date Granularity")
# Granularity Selection
granularity_selection = st.selectbox(
    "Choose Date Granularity",
    ["Daily", "Weekly", "Monthly"],
    label_visibility="collapsed",
    on_change=set_DMA_Panel_Selected_false,
)
granularity_selection = str(granularity_selection).lower()

# Convert files to dataframes
files_dict = files_to_dataframes(uploaded_files)

# Add API Dataframe
if main_df is not None:
    files_dict = add_api_dataframe_to_dict(main_df, files_dict)

# Display a warning message if no files have been uploaded and halt further execution
if not files_dict:
    st.warning(
        "Please upload at least one file to proceed.",
        icon="⚠️",
    )
    st.stop()  # Halts further execution until file is uploaded


# Select DMA and Panel columns
st.markdown("#### Select DMA and Panel columns")
selections = {}
with st.expander("Select DMA and Panel columns", expanded=False):
    count = 0  # Initialize counter to manage the visibility of labels and keys
    for file_name, file_data in files_dict.items():
        # Determine visibility of the label based on the count
        if count == 0:
            label_visibility = "visible"
        else:
            label_visibility = "collapsed"

        # Extract non-numeric columns
        non_numeric_cols = file_data["non_numeric"]

        # Prepare DMA and Panel values for dropdown, adding "N/A" as an option
        dma_values = non_numeric_cols + ["N/A"]
        panel_values = non_numeric_cols + ["N/A"]

        # Skip if only one option is available
        if len(dma_values) == 1 and len(panel_values) == 1:
            selected_dma, selected_panel = "N/A", "N/A"
            # Update the selections for DMA and Panel for the current file
            selections[file_name] = {
                "DMA": selected_dma,
                "Panel": selected_panel,
            }
            continue

        # Create layout columns for File Name, DMA, and Panel selections
        file_name_col, DMA_col, Panel_col = st.columns([2, 4, 4])

        with file_name_col:
            # Display "File Name" label only for the first file
            if count == 0:
                st.write("File Name")
            else:
                st.write("")
            st.write(file_name)  # Display the file name

        with DMA_col:
            # Display a selectbox for DMA values
            selected_dma = st.selectbox(
                "Select DMA",
                dma_values,
                on_change=set_DMA_Panel_Selected_false,
                label_visibility=label_visibility,  # Control visibility of the label
                key=f"DMA_selectbox{count}",  # Ensure unique key for each selectbox
            )

        with Panel_col:
            # Display a selectbox for Panel values
            selected_panel = st.selectbox(
                "Select Panel",
                panel_values,
                on_change=set_DMA_Panel_Selected_false,
                label_visibility=label_visibility,  # Control visibility of the label
                key=f"Panel_selectbox{count}",  # Ensure unique key for each selectbox
            )

        # Skip processing if the same column is selected for both Panel and DMA due to potential data integrity issues
        if selected_panel == selected_dma and not (
            selected_panel == "N/A" and selected_dma == "N/A"
        ):
            st.warning(
                f"File: {file_name} → The same column cannot serve as both Panel and DMA. Please adjust your selections.",
            )
            selected_dma, selected_panel = "N/A", "N/A"
            st.stop()

        # Update the selections for DMA and Panel for the current file
        selections[file_name] = {
            "DMA": selected_dma,
            "Panel": selected_panel,
        }

        count += 1  # Increment the counter after processing each file

# Accept DMA and Panel selection
if st.button("Accept and Process", use_container_width=True):

    # Normalize all data to a daily granularity. This initial standardization simplifies subsequent conversions to other levels of granularity
    with st.spinner("Processing...", cache=True):
        files_dict = standardize_data_to_daily(files_dict, selections)

        # Convert all data to daily level granularity
        files_dict = apply_granularity_to_all(
            files_dict, granularity_selection, selections
        )

        st.session_state["files_dict"] = files_dict
        st.session_state["DMA_Panel_Selected"] = True


#########################################################################################################################################################
# Display unique DMA and Panel values
#########################################################################################################################################################


# Halts further execution until DMA and Panel columns are selected
if "files_dict" in st.session_state and st.session_state["DMA_Panel_Selected"]:
    files_dict = st.session_state["files_dict"]
else:
    st.stop()

# Set to store unique values of DMA and Panel
with st.spinner("Fetching DMA and Panel values..."):
    all_dma_values, all_panel_values = clean_and_extract_unique_values(
        files_dict, selections
    )

# List of DMA and Panel columns unique values
list_of_all_dma_values = list(all_dma_values)
list_of_all_panel_values = list(all_panel_values)

# Format DMA and Panel values for display
formatted_dma_values = format_values_for_display(list_of_all_dma_values)
formatted_panel_values = format_values_for_display(list_of_all_panel_values)

# Unique DMA and Panel values
st.markdown("#### Unique DMA and Panel values")
# Display DMA and Panel values
with st.expander("Unique DMA and Panel values"):
    st.write("")
    st.markdown(
        f"""
        <style>
        .justify-text {{
            text-align: justify;
        }}
        </style>
        <div class="justify-text">
            <strong>Panel Values:</strong> {formatted_panel_values}<br>
            <strong>DMA Values:</strong> {formatted_dma_values}
        </div>
        """,
        unsafe_allow_html=True,
    )

    # Display total DMA and Panel
    st.write("")
    st.markdown(
        f"""
        <div style="text-align: justify;">
            <strong>Number of DMAs detected:</strong> {len(list_of_all_dma_values)}<br>
            <strong>Number of Panels detected:</strong> {len(list_of_all_panel_values)}
        </div>
        """,
        unsafe_allow_html=True,
    )
    st.write("")


#########################################################################################################################################################
# Merge all DataFrames
#########################################################################################################################################################


# Merge all DataFrames selected
main_df = create_main_dataframe(
    files_dict, all_dma_values, all_panel_values, granularity_selection
)
merged_df = merge_into_main_df(main_df, files_dict, selections)

# # Display the merged DataFrame
# st.markdown("#### Merged DataFrame based on selected DMA and Panel")
# st.dataframe(merged_df)


#########################################################################################################################################################
# Categorize Variables and Impute Missing Values
#########################################################################################################################################################


# Create an editable DataFrame in Streamlit
st.markdown("#### Select Variables Category & Impute Missing Values")

# Prepare missing stats DataFrame for editing
missing_stats_df = prepare_missing_stats_df(merged_df)

edited_stats_df = st.data_editor(
    missing_stats_df,
    column_config={
        "Impute Method": st.column_config.SelectboxColumn(
            options=[
                "Drop Column",
                "Fill with Mean",
                "Fill with Median",
                "Fill with 0",
            ],
            required=True,
            default="Fill with 0",
        ),
        "Category": st.column_config.SelectboxColumn(
            options=[
                "Media",
                "Exogenous",
                "Internal",
                "Response_Metric",
            ],
            required=True,
            default="Media",
        ),
    },
    disabled=["Column", "Missing Values", "Missing Percentage"],
    hide_index=True,
    use_container_width=True,
)

# Apply changes based on edited DataFrame
for i, row in edited_stats_df.iterrows():
    column = row["Column"]
    if row["Impute Method"] == "Drop Column":
        merged_df.drop(columns=[column], inplace=True)

    elif row["Impute Method"] == "Fill with Mean":
        merged_df[column].fillna(merged_df[column].mean(), inplace=True)

    elif row["Impute Method"] == "Fill with Median":
        merged_df[column].fillna(merged_df[column].median(), inplace=True)

    elif row["Impute Method"] == "Fill with 0":
        merged_df[column].fillna(0, inplace=True)

# Display the Final DataFrame and exogenous variables
st.markdown("#### Final DataFrame")
final_df = merged_df
st.dataframe(final_df, hide_index=True)

# Initialize an empty dictionary to hold categories and their variables
category_dict = {}

# Iterate over each row in the edited DataFrame to populate the dictionary
for i, row in edited_stats_df.iterrows():
    column = row["Column"]
    category = row["Category"]  # The category chosen by the user for this variable

    # Check if the category already exists in the dictionary
    if category not in category_dict:
        # If not, initialize it with the current column as its first element
        category_dict[category] = [column]
    else:
        # If it exists, append the current column to the list of variables under this category
        category_dict[category].append(column)

# Add Date, DMA and Panel in category dictionary
category_dict.update({"Date": ["date"]})
if "DMA" in final_df.columns:
    category_dict["DMA"] = ["DMA"]

if "Panel" in final_df.columns:
    category_dict["Panel"] = ["Panel"]

# Display the dictionary
st.markdown("#### Variable Category")
for category, variables in category_dict.items():
    # Check if there are multiple variables to handle "and" insertion correctly
    if len(variables) > 1:
        # Join all but the last variable with ", ", then add " and " before the last variable
        variables_str = ", ".join(variables[:-1]) + " and " + variables[-1]
    else:
        # If there's only one variable, no need for "and"
        variables_str = variables[0]

    # Display the category and its variables in the desired format
    st.markdown(
        f"<div style='text-align: justify;'><strong>{category}:</strong> {variables_str}</div>",
        unsafe_allow_html=True,
    )

# Store final dataframe and bin dictionary into session state
st.session_state["final_df"], st.session_state["bin_dict"] = final_df, category_dict

if st.button('Save Changes'):

    with open("Pickle_files/main_df", 'wb') as f:
        pickle.dump(st.session_state["final_df"], f)
    with open("Pickle_files/category_dict", 'wb') as c:
        pickle.dump(st.session_state["bin_dict"], c)
    st.success('Changes Saved!')
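For reference, the heart of the deleted page's granularity handling is the modal day-gap between consecutive dates, which `files_to_dataframes` computes and `determine_data_interval` maps to a label. The following is a minimal, self-contained sketch of that check on toy data; it is an editorial illustration and is not part of either file in this commit.

```python
# Minimal sketch (illustration only): infer a file's granularity from the
# modal gap, in days, between consecutive unique dates.
import pandas as pd

dates = pd.to_datetime(["01-01-2024", "08-01-2024", "15-01-2024"], format="%d-%m-%Y")
toy = pd.DataFrame({"date": dates, "spend": [100, 120, 90]})

# Modal day-gap, as computed inside files_to_dataframes()
common_freq = pd.Series(toy["date"].unique()).diff().dt.days.dropna().mode()[0]

# Same mapping as determine_data_interval()
if common_freq == 1:
    interval = "daily"
elif common_freq == 7:
    interval = "weekly"
elif 28 <= common_freq <= 31:
    interval = "monthly"
else:
    interval = "irregular"

print(common_freq, interval)  # 7.0 weekly
```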
pages/Help.py
ADDED
@@ -0,0 +1,204 @@
# Importing necessary libraries
import streamlit as st

st.set_page_config(
    page_title="Model Build",
    page_icon=":shark:",
    layout="wide",
    initial_sidebar_state="collapsed",
)

import numpy as np
import pandas as pd
from utilities import set_header, load_local_css, load_authenticator
import pickle
import base64

load_local_css("styles.css")
set_header()


st.header('MASTERCARD MMO TOOL')

st.subheader('Overview')

st.markdown('The tool was developed in accordance with the best practices for building Marketing mix models for different clients and businesses. The tool can be used for building various MMM models, optimizing spends, and executing various simulations.')

st.markdown('Last Updated: 3/26/2024')
user = st.radio("Select User", ['Data Scientist', 'Media Planner'], horizontal=True, index=1)



if user == 'Data Scientist':

    with st.expander('**Data Import**'):

        st.markdown("""
        The Data Import page allows users to bring in any additional data apart from what's being fetched using APIs and processed by Data Engineering pipelines, and to standardize both API and non-API data to the desired end granularity. It features options for feature engineering, variable grouping, and data imputation. Additionally, it provides a comprehensive summary of all actions performed on the page, ensuring a transparent and efficient data preparation process.

        **Features:**
        - **Categorization:** Allows for the categorization of similar variables for streamlined analysis.
        - **Feature Engineering:** Enables the creation and grouping of columns to build better models.
        - **Data Imputation:** Provides methods to fill in missing or incomplete data values.
        """)

    with st.expander('**Data Validation**'):

        st.markdown("""
        This page is designed to enhance data quality and insights, focusing on selected targets and panels. It offers Response Metric Analysis and a Univariate and Bivariate Report, alongside Media and Non-Media Variables Analysis. Correlation is explored to ensure a thorough validation process.

        **Features:**
        - **Response Metric Analysis:** Evaluates the performance metrics and trends relevant to the selected target and panel.
        - **Univariate and Bivariate Report:** Offers a detailed univariate and bivariate report.
        - **Variables Analysis:** Evaluates the significance of media and non-media variables for the chosen target/panel and validates the variables to be considered for the next steps.
        - **Correlation Analysis:** Utilizes correlation plots to reveal the relationships between variables.
        """)

    with st.expander('**Transformation**'):

        st.markdown("""
        Transformation capabilities include Media and Exogenous Transformations such as lag, lead, moving average, power, saturation, and adstock adjustments. This page not only applies these transformations but also summarizes the process and the order of operations, providing clarity on how the data was manipulated.

        **Features:**
        - **Transformations:** Applies specific alterations (lag, lead, moving average, power, saturation, and adstock) to media and exogenous variables to enhance model performance.
        - **Summarization of Transformation:** Provides a detailed overview of all transformations applied, including their sequence.
        """)

    with st.expander('**Build Model**'):

        st.markdown("""
        This feature integrates transformation creation with model building for both panel and non-panel levels. It streamlines the process, making it easier for users to construct and refine their models based on the transformed data. After building models, this page assists in selecting the best fit and provides a detailed summary. It includes comparisons of Actual vs. Predicted outcomes, Residual Analysis, and Variance Inflation Factor (VIF) for both test and train datasets.

        **Features:**
        - **Diverse Models:** Constructs models for various metrics using OLS and mixed effect models.
        - **Model Selection:** Chooses the most significant model utilizing metrics such as coefficients, P-value, R Square, Adjusted R Square, and MAPE.
        """)

    with st.expander('**Model Tuning**'):

        st.markdown("""
        Model Tuning offers advanced options like Event Flags, addition of Trends, Cyclical Patterns, and sine and cosine waves. These features help in refining the model by accounting for specific events, trends, and seasonal patterns.

        **Features:**
        - **Event Flags:** Incorporates the effect of specific events on the target.
        - **Trends:** Incorporates long-term trends and seasonality.
        - **Cyclical Pattern:** Utilizes sine and cosine waves to capture and adjust for seasonal variations.
        - **Contributions Analysis:** Calculates contributions from the tuned model for each media channel.
        """)

    with st.expander("**Save Model Results**"):

        st.markdown("""
        This page saves the model's outcomes, including channel contributions, an EDA report, and a thorough Analysis of Model Results. It's designed to document and preserve the work done for future reference.

        **Features:**
        - **Channel Contribution:** Details the impact of each media channel on outcomes.
        - **EDA Report:** Provides an exploratory data analysis summary.
        - **Analysis of Model Results:** Offers a comprehensive review of the model's performance.
        """)

    with st.expander('**Model Results Overview**'):

        st.markdown("""
        This section provides a comprehensive overview of historical spending, including channel-wise spends, revenue, ROI, and weekly contributions. It also details channel spends and revenue on a week-by-week basis, offering a granular look at financial performance.

        **Features:**
        - **Spends Analysis:** Breaks down channel-wise spend and revenue.
        - **ROI and Contributions:** Evaluates return on investment and weekly/aggregated channel performance.
        """)

    with st.expander('**Build Response Curves**'):

        st.markdown("""
        This page updates response curves and allows for testing and saving these fits. It's essential for understanding how different levels of spending affect outcomes and for refining marketing strategies.

        **Features:**
        - **Response Curve Update:** Allows for the modification and refinement of response curves.
        - **Curve Testing and Saving:** Facilitates the evaluation of curve fits and preserves/downloads curve parameters.
        """)

    with st.expander('**Scenario Planner**'):

        st.markdown("""
        The Scenario Planner page enables forward and reverse optimization, allowing users to either maximize targets given certain spends or minimize spends given a target revenue. It includes responsive S curves for multiple channels and optimizes them accordingly.

        **Features:**
        - **Optimization:** Supports both forward and reverse financial planning, with adjustments based on percentage or actual spend values.
        - **Channel Focused:** Enables optimization based on specific media channels for focused strategies.
        - **Responsive S Curves:** Showcases real-time, responsive S curves that highlight regions indicating underinvestment, optimal spending, and areas of potential overinvestment.
        - **Dynamic Visualization:** Provides ROI/MROI for in-depth analysis and immediate scenario feedback.
        """)

    with st.expander("**Saved Scenarios**"):

        st.markdown("""
        Users can save, load, download, and delete scenarios involving spends, ROI, and MROI for both actual and simulated cases. This feature offers flexibility in scenario management and comparison.

        **Features:**
        - **Optimized Results Visualization:** Displays the outcomes of optimization, highlighting effectiveness and efficiency in scenario planning.
        - **Effectiveness and Efficiency Analysis:** Provides detailed visual insights into how optimization impacts campaign performance and resource utilization.
        """)

    with st.expander("**Optimized Result Analysis**"):

        st.markdown("""
        This analysis page gives an overview of optimized spends (actual, planned, and delta), budget allocation (% split and planned spends), and forecasts on response and return by media channel. It's designed to provide insights into the efficiency and effectiveness of optimized media spending strategies.

        **Features:**
        - **Optimized Spends Overview:** Compares actual, planned, and delta spends.
        - **Budget Allocation Analysis:** Breaks down the percentage split and planned expenditures.
        """)

if user == 'Media Planner':

    with st.expander('**Scenario Planner**'):

        st.markdown("""
        The Scenario Planner page enables forward and reverse optimization, allowing users to either maximize targets given certain spends or minimize spends given a target revenue. It includes responsive S curves for multiple channels and optimizes them accordingly.

        **Features:**
        - **Optimization:** Supports both forward and reverse financial planning, with adjustments based on percentage or actual spend values.
        - **Channel Focused:** Enables optimization based on specific media channels for focused strategies.
        - **Responsive S Curves:** Showcases real-time, responsive S curves that highlight regions indicating underinvestment, optimal spending, and areas of potential overinvestment.
        - **Dynamic Visualization:** Provides ROI/MROI for in-depth analysis and immediate scenario feedback.
        """)

    with st.expander("**Saved Scenario**"):

        st.markdown("""
        Users can save, load, download, and delete scenarios involving spends, ROI, and MROI for both actual and simulated cases. This feature offers flexibility in scenario management and comparison.

        **Features:**
        - **Optimized Results Visualization:** Displays the outcomes of optimization, highlighting effectiveness and efficiency in scenario planning.
        - **Effectiveness and Efficiency Analysis:** Provides detailed visual insights into how optimization impacts campaign performance and resource utilization.
        """)

    with st.expander("**Optimized Result Analysis**"):

        st.markdown("""
        This analysis page gives an overview of optimized spends (actual, planned, and delta), budget allocation (% split and planned spends), and forecasts on response and return by media channel. It's designed to provide insights into the efficiency and effectiveness of optimized media spending strategies.

        **Features:**
        - **Optimized Spends Overview:** Compares actual, planned, and delta spends.
        - **Budget Allocation Analysis:** Breaks down the percentage split and planned expenditures.
        """)


with open("MMM Tool Description.docx", "rb") as file:
    word_content = file.read()
b64 = base64.b64encode(word_content).decode()
href = f'<a href="data:application/octet-stream;base64,{b64}" download="document.docx">Download Document</a>'
st.markdown(href, unsafe_allow_html=True)
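The Transformation and Build Response Curves sections in Help.py only name the media transformations (lag, lead, moving average, power, saturation, adstock) and S-shaped response curves. As a hedged illustration of two of those terms, the sketch below uses common textbook formulations with hypothetical helper names; it is not code from this repository and does not reflect how the tool itself implements them.

```python
# Illustrative sketch only, not code from this repository.
import numpy as np

def geometric_adstock(spend: np.ndarray, decay: float = 0.5) -> np.ndarray:
    """Carry a fraction `decay` of each period's effect forward into the next period."""
    out = np.zeros_like(spend, dtype=float)
    carry = 0.0
    for t, x in enumerate(spend):
        carry = x + decay * carry
        out[t] = carry
    return out

def hill_saturation(x: np.ndarray, half_sat: float = 100.0, shape: float = 2.0) -> np.ndarray:
    """S-shaped diminishing-returns curve: response rises steeply, then flattens."""
    return x**shape / (x**shape + half_sat**shape)

# Example: adstock a short spend series, then pass it through the saturation curve.
spend = np.array([0.0, 50.0, 100.0, 200.0, 0.0])
print(hill_saturation(geometric_adstock(spend)))
```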