# Importing necessary libraries
import streamlit as st

st.set_page_config(
    page_title="Data Import",
    page_icon=":shark:",
    layout="wide",
    initial_sidebar_state="collapsed",
)

import pickle
import pandas as pd
from utilities import set_header, load_local_css
import streamlit_authenticator as stauth
import yaml
from yaml import SafeLoader

load_local_css("styles.css")
set_header()
# Re-assign each persisted key so widget state survives page switches
for k, v in st.session_state.items():
    if k not in ["logout", "login", "config"] and not k.startswith(
        "FormSubmitter"
    ):
        st.session_state[k] = v

# Load the authentication configuration
with open("config.yaml") as file:
    config = yaml.load(file, Loader=SafeLoader)
st.session_state["config"] = config

authenticator = stauth.Authenticate(
    config["credentials"],
    config["cookie"]["name"],
    config["cookie"]["key"],
    config["cookie"]["expiry_days"],
    config["preauthorized"],
)
st.session_state["authenticator"] = authenticator

name, authentication_status, username = authenticator.login("Login", "main")
auth_status = st.session_state.get("authentication_status")

if auth_status:
    authenticator.logout("Logout", "main")

    is_state_initialized = st.session_state.get("initialized", False)
    if not is_state_initialized:
        if "session_name" not in st.session_state:
            st.session_state["session_name"] = None
    # Function to validate the date column in a DataFrame
    def validate_date_column(df):
        try:
            # Attempt to convert the 'date' column to datetime
            df["date"] = pd.to_datetime(df["date"], format="%d-%m-%Y")
            return True
        except Exception:
            return False
    # Function to determine data interval
    def determine_data_interval(common_freq):
        if common_freq == 1:
            return "daily"
        elif common_freq == 7:
            return "weekly"
        elif 28 <= common_freq <= 31:
            return "monthly"
        else:
            return "irregular"
    # Function to read each uploaded Excel file into a pandas DataFrame and store them in a dictionary
    @st.cache_resource(show_spinner=False)
    def files_to_dataframes(uploaded_files):
        df_dict = {}
        for uploaded_file in uploaded_files:
            # Extract the file name without its extension
            file_name = uploaded_file.name.rsplit(".", 1)[0]

            # Check for duplicate file names
            if file_name in df_dict:
                st.warning(
                    f"Duplicate File: {file_name}. This file will be skipped.",
                    icon="⚠️",
                )
                continue

            # Read the file into a DataFrame
            df = pd.read_excel(uploaded_file)

            # Convert all column names to lowercase
            df.columns = df.columns.str.lower().str.strip()

            # Separate numeric and non-numeric columns
            numeric_cols = list(df.select_dtypes(include=["number"]).columns)
            non_numeric_cols = [
                col
                for col in df.select_dtypes(exclude=["number"]).columns
                if col.lower() != "date"
            ]

            # Check for a valid 'date' column and at least one numeric column
            if not (validate_date_column(df) and len(numeric_cols) > 0):
                st.warning(
                    f"File Name: {file_name} ➜ Please upload data with a date column in 'DD-MM-YYYY' format and at least one media/exogenous column. This file will be skipped.",
                    icon="⚠️",
                )
                continue

            # Determine the most common gap (in days) between consecutive dates
            common_freq = (
                pd.Series(df["date"].unique()).diff().dt.days.dropna().mode()[0]
            )

            # Calculate the data interval (daily, weekly, monthly or irregular)
            interval = determine_data_interval(common_freq)
            if interval == "irregular":
                st.warning(
                    f"File Name: {file_name} ➜ Please upload data at a daily, weekly or monthly interval. This file will be skipped.",
                    icon="⚠️",
                )
                continue

            # Store the DataFrame and its metadata in the dictionary under the file name
            df_dict[file_name] = {
                "numeric": numeric_cols,
                "non_numeric": non_numeric_cols,
                "interval": interval,
                "df": df,
            }

        return df_dict
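    # The returned dictionary maps each file name to its metadata and data.
    # Illustrative entry (column names here are hypothetical):
    #   df_dict["spend_data"] = {
    #       "numeric": ["tv_spend", "radio_spend"],
    #       "non_numeric": ["region"],
    #       "interval": "weekly",
    #       "df": <DataFrame>,
    #   }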
    # Function to adjust dataframe granularity
    def adjust_dataframe_granularity(df, current_granularity, target_granularity):
        # Set index
        df.set_index("date", inplace=True)

        # Define aggregation rules for resampling
        aggregation_rules = {
            col: "sum" if pd.api.types.is_numeric_dtype(df[col]) else "first"
            for col in df.columns
        }

        # Initialize resampled_df
        resampled_df = df
        if current_granularity == "daily" and target_granularity == "weekly":
            resampled_df = df.resample("W-MON", closed="left", label="left").agg(
                aggregation_rules
            )
        elif current_granularity == "daily" and target_granularity == "monthly":
            resampled_df = df.resample("MS", closed="left", label="left").agg(
                aggregation_rules
            )
        elif current_granularity == "daily" and target_granularity == "daily":
            resampled_df = df.resample("D").agg(aggregation_rules)
        elif current_granularity in ["weekly", "monthly"] and target_granularity == "daily":
            # For higher-to-lower granularity, distribute numeric values equally
            # and replicate non-numeric values across the new period
            expanded_data = []
            for _, row in df.iterrows():
                if current_granularity == "weekly":
                    period_range = pd.date_range(start=row.name, periods=7)
                elif current_granularity == "monthly":
                    period_range = pd.date_range(
                        start=row.name, periods=row.name.days_in_month
                    )
                for date in period_range:
                    new_row = {}
                    for col in df.columns:
                        if pd.api.types.is_numeric_dtype(df[col]):
                            if current_granularity == "weekly":
                                new_row[col] = row[col] / 7
                            elif current_granularity == "monthly":
                                new_row[col] = row[col] / row.name.days_in_month
                        else:
                            new_row[col] = row[col]
                    expanded_data.append((date, new_row))

            resampled_df = pd.DataFrame(
                [data for _, data in expanded_data],
                index=[date for date, _ in expanded_data],
            )

        # Reset index
        resampled_df = resampled_df.reset_index().rename(columns={"index": "date"})

        return resampled_df
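    # Sketch of the weekly -> daily expansion above (hypothetical values): a week
    # labelled 01-01-2024 with tv_spend=70 becomes seven daily rows dated
    # 01-01-2024 through 07-01-2024, each carrying tv_spend=10.0, while
    # non-numeric values are replicated unchanged across the seven days.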
    # Function to clean and extract the unique values of Panel_1 and Panel_2
    @st.cache_resource(show_spinner=False)
    def clean_and_extract_unique_values(files_dict, selections):
        all_panel1_values = set()
        all_panel2_values = set()
        for file_name, file_data in files_dict.items():
            df = file_data["df"]

            # 'Panel_1' and 'Panel_2' selections
            selected_panel1 = selections[file_name].get("Panel_1")
            selected_panel2 = selections[file_name].get("Panel_2")

            # Clean and standardize the Panel_1 column if it exists and is selected
            if (
                selected_panel1
                and selected_panel1 != "N/A"
                and selected_panel1 in df.columns
            ):
                df[selected_panel1] = (
                    df[selected_panel1].str.lower().str.strip().str.replace("_", " ")
                )
                all_panel1_values.update(df[selected_panel1].dropna().unique())

            # Clean and standardize the Panel_2 column if it exists and is selected
            if (
                selected_panel2
                and selected_panel2 != "N/A"
                and selected_panel2 in df.columns
            ):
                df[selected_panel2] = (
                    df[selected_panel2].str.lower().str.strip().str.replace("_", " ")
                )
                all_panel2_values.update(df[selected_panel2].dropna().unique())

            # Update the processed DataFrame back in the dictionary
            files_dict[file_name]["df"] = df

        return all_panel1_values, all_panel2_values
    # Function to format values for display
    @st.cache_resource(show_spinner=False)
    def format_values_for_display(values_list):
        # Capitalize the first letter of each word and replace underscores with spaces
        formatted_list = [value.replace("_", " ").title() for value in values_list]
        # Join values with commas and 'and' before the last value
        if len(formatted_list) > 1:
            return ", ".join(formatted_list[:-1]) + ", and " + formatted_list[-1]
        elif formatted_list:
            return formatted_list[0]
        return "No values available"
    # Function to normalize all data within files_dict to a daily granularity
    @st.cache_resource(show_spinner=False)
    def standardize_data_to_daily(files_dict, selections):
        # Normalize all data to a daily granularity using the shared helper
        files_dict = apply_granularity_to_all(files_dict, "daily", selections)

        # Update the "interval" attribute of each dataset to reflect the new granularity
        for files_name, files_data in files_dict.items():
            files_data["interval"] = "daily"

        return files_dict
    # Function to apply the granularity transformation to all DataFrames in files_dict
    @st.cache_resource(show_spinner=False)
    def apply_granularity_to_all(files_dict, granularity_selection, selections):
        for file_name, file_data in files_dict.items():
            df = file_data["df"].copy()

            # Handle cases where Panel_1 or Panel_2 might be 'N/A'
            selected_panel1 = selections[file_name].get("Panel_1")
            selected_panel2 = selections[file_name].get("Panel_2")

            # Select the segmentation columns, handling 'N/A'
            if selected_panel1 != "N/A" and selected_panel2 != "N/A":
                unique_combinations = df[
                    [selected_panel1, selected_panel2]
                ].drop_duplicates()
            elif selected_panel1 != "N/A":
                unique_combinations = df[[selected_panel1]].drop_duplicates()
                selected_panel2 = None  # Ensure Panel_2 is ignored if N/A
            elif selected_panel2 != "N/A":
                unique_combinations = df[[selected_panel2]].drop_duplicates()
                selected_panel1 = None  # Ensure Panel_1 is ignored if N/A
            else:
                # If both are 'N/A', process the entire DataFrame as is
                df = adjust_dataframe_granularity(
                    df, file_data["interval"], granularity_selection
                )
                files_dict[file_name]["df"] = df
                continue  # Skip to the next file

            transformed_segments = []
            for _, combo in unique_combinations.iterrows():
                if selected_panel1 and selected_panel2:
                    segment = df[
                        (df[selected_panel1] == combo[selected_panel1])
                        & (df[selected_panel2] == combo[selected_panel2])
                    ]
                elif selected_panel1:
                    segment = df[df[selected_panel1] == combo[selected_panel1]]
                elif selected_panel2:
                    segment = df[df[selected_panel2] == combo[selected_panel2]]

                # Adjust the granularity of the segment
                transformed_segment = adjust_dataframe_granularity(
                    segment, file_data["interval"], granularity_selection
                )
                transformed_segments.append(transformed_segment)

            # Combine all transformed segments into a single DataFrame for this file
            transformed_df = pd.concat(transformed_segments, ignore_index=True)
            files_dict[file_name]["df"] = transformed_df

        return files_dict
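    # Illustrative behaviour: with Panel_1 = "region" holding values {"east", "west"}
    # (hypothetical), each region's rows are resampled independently and then
    # re-concatenated, so weekly or monthly sums are computed per panel rather
    # than across the whole file.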
    # Function to create the main DataFrame structure
    @st.cache_resource(show_spinner=False)
    def create_main_dataframe(
        files_dict, all_panel1_values, all_panel2_values, granularity_selection
    ):
        # Determine the global start and end dates across all DataFrames
        global_start = min(df["df"]["date"].min() for df in files_dict.values())
        global_end = max(df["df"]["date"].max() for df in files_dict.values())

        # Adjust the date_range generation based on the granularity_selection
        if granularity_selection == "weekly":
            # Generate a weekly range, with weeks starting on Monday
            date_range = pd.date_range(start=global_start, end=global_end, freq="W-MON")
        elif granularity_selection == "monthly":
            # Generate a monthly range, starting from the first day of each month
            date_range = pd.date_range(start=global_start, end=global_end, freq="MS")
        else:  # Default to daily if not weekly or monthly
            date_range = pd.date_range(start=global_start, end=global_end, freq="D")

        # Collect all unique Panel_1 and Panel_2 values, excluding 'N/A'
        all_panel1s = all_panel1_values
        all_panel2s = all_panel2_values

        # Dynamically build the list of dimensions (Panel_1, Panel_2) to include
        # in the main DataFrame based on availability
        dimensions, merge_keys = [], []
        if all_panel1s:
            dimensions.append(all_panel1s)
            merge_keys.append("Panel_1")
        if all_panel2s:
            dimensions.append(all_panel2s)
            merge_keys.append("Panel_2")

        dimensions.append(date_range)  # The date range is always included
        merge_keys.append("date")

        # Create a main DataFrame template with the dimensions
        main_df = pd.MultiIndex.from_product(
            dimensions,
            names=[name for name, _ in zip(merge_keys, dimensions)],
        ).to_frame(index=False)

        return main_df.reset_index(drop=True)
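    # Illustrative shape of the template: with Panel_1 values {"a", "b"} and a
    # two-date range (hypothetical), the cross product yields one row per
    # (Panel_1, date) pair, i.e. 2 x 2 = 4 rows, onto which the uploaded data
    # is later left-merged.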
    # Function to prepare and merge DataFrames
    @st.cache_resource(show_spinner=False)
    def merge_into_main_df(main_df, files_dict, selections):
        for file_name, file_data in files_dict.items():
            df = file_data["df"].copy()

            # Rename the selected Panel_1 and Panel_2 columns if not 'N/A'
            selected_panel1 = selections[file_name].get("Panel_1", "N/A")
            selected_panel2 = selections[file_name].get("Panel_2", "N/A")
            if selected_panel1 != "N/A":
                df.rename(columns={selected_panel1: "Panel_1"}, inplace=True)
            if selected_panel2 != "N/A":
                df.rename(columns={selected_panel2: "Panel_2"}, inplace=True)

            # Merge the current DataFrame into main_df on 'date', and where
            # applicable, 'Panel_1' and 'Panel_2'
            merge_keys = ["date"]
            if "Panel_1" in df.columns:
                merge_keys.append("Panel_1")
            if "Panel_2" in df.columns:
                merge_keys.append("Panel_2")
            main_df = pd.merge(main_df, df, on=merge_keys, how="left")

        # After all merges, sort by 'date' and reset the index for cleanliness
        sort_by = ["date"]
        if "Panel_1" in main_df.columns:
            sort_by.append("Panel_1")
        if "Panel_2" in main_df.columns:
            sort_by.append("Panel_2")
        main_df.sort_values(by=sort_by, inplace=True)
        main_df.reset_index(drop=True, inplace=True)

        return main_df
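    # Illustrative merge keys: a file whose Panel_1 column was selected merges on
    # ["date", "Panel_1"]; a file with no panel selections merges on ["date"]
    # alone, so its metrics repeat across every panel row for that date.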
    # Function to categorize a column
    def categorize_column(column_name):
        # Define keywords for each category
        internal_keywords = [
            "Price",
            "Discount",
            "product_price",
            "cost",
            "margin",
            "inventory",
            "sales",
            "revenue",
            "turnover",
            "expense",
        ]
        exogenous_keywords = [
            "GDP",
            "Tax",
            "Inflation",
            "interest_rate",
            "employment_rate",
            "exchange_rate",
            "consumer_spending",
            "retail_sales",
            "oil_prices",
            "weather",
        ]

        # Check if the column name matches any keyword for the Internal or Exogenous categories
        for keyword in internal_keywords:
            if keyword.lower() in column_name.lower():
                return "Internal"
        for keyword in exogenous_keywords:
            if keyword.lower() in column_name.lower():
                return "Exogenous"

        # Default to Media if no match is found
        return "Media"
    # Function to calculate missing-value stats and prepare the editable DataFrame
    @st.cache_resource(show_spinner=False)
    def prepare_missing_stats_df(df):
        missing_stats = []
        for column in df.columns:
            if column in ("date", "Panel_1", "Panel_2"):
                # Skip the date, Panel_1 and Panel_2 columns
                continue

            missing = df[column].isnull().sum()
            pct_missing = round((missing / len(df)) * 100, 2)

            # Dynamically assign a category based on the column name
            category = categorize_column(column)

            missing_stats.append(
                {
                    "Column": column,
                    "Missing Values": missing,
                    "Missing Percentage": pct_missing,
                    "Impute Method": "Fill with 0",  # Default value
                    "Category": category,
                }
            )
        stats_df = pd.DataFrame(missing_stats)

        return stats_df
    # Function to add the API DataFrame's details to the files dictionary
    @st.cache_resource(show_spinner=False)
    def add_api_dataframe_to_dict(main_df, files_dict):
        files_dict["API"] = {
            "numeric": list(main_df.select_dtypes(include=["number"]).columns),
            "non_numeric": [
                col
                for col in main_df.select_dtypes(exclude=["number"]).columns
                if col.lower() != "date"
            ],
            "interval": determine_data_interval(
                pd.Series(main_df["date"].unique()).diff().dt.days.dropna().mode()[0]
            ),
            "df": main_df,
        }

        return files_dict
    # Function to read the API data into a DataFrame, parsing the 'Date' column as datetime
    def read_API_data():
        return pd.read_excel(
            "upf_data_converted_randomized_resp_metrics.xlsx", parse_dates=["Date"]
        )

    # Function to set the 'Panel_1_Panel_2_Selected' session state variable to False
    def set_Panel_1_Panel_2_Selected_false():
        st.session_state["Panel_1_Panel_2_Selected"] = False

    # Function to serialize and save the objects into a pickle file
    def save_to_pickle(file_path, final_df, bin_dict):
        # Open the file in write-binary mode and dump the objects
        with open(file_path, "wb") as f:
            pickle.dump({"final_df": final_df, "bin_dict": bin_dict}, f)
        # Data is now saved to file
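    # Counterpart load (illustrative; assumes downstream pages read the same file):
    #   with open("data_import.pkl", "rb") as f:
    #       saved = pickle.load(f)
    #   final_df, bin_dict = saved["final_df"], saved["bin_dict"]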
    # Function to process the merged_df DataFrame based on the operations defined in edited_df
    def process_dataframes(merged_df, edited_df, edited_stats_df):
        # Ensure there are operations defined by the user
        if edited_df.empty:
            return merged_df, edited_stats_df  # No operations to apply

        # Perform the operations defined by the user
        for index, row in edited_df.iterrows():
            result_column_name = f"{row['Column 1']}{row['Operator']}{row['Column 2']}"
            col1 = row["Column 1"]
            col2 = row["Column 2"]
            op = row["Operator"]

            # Apply the specified operation
            if op == "+":
                merged_df[result_column_name] = merged_df[col1] + merged_df[col2]
            elif op == "-":
                merged_df[result_column_name] = merged_df[col1] - merged_df[col2]
            elif op == "*":
                merged_df[result_column_name] = merged_df[col1] * merged_df[col2]
            elif op == "/":
                # Replace zeros in the divisor to avoid division-by-zero errors
                merged_df[result_column_name] = merged_df[col1] / merged_df[col2].replace(
                    0, 1e-9
                )

            # Add a summary of the operation to edited_stats_df
            new_row = {
                "Column": result_column_name,
                "Missing Values": None,
                "Missing Percentage": None,
                "Impute Method": None,
                "Category": row["Category"],
            }
            new_row_df = pd.DataFrame([new_row])

            # Use pd.concat to append new_row_df to edited_stats_df
            edited_stats_df = pd.concat(
                [edited_stats_df, new_row_df], ignore_index=True, axis=0
            )

        # Combine the column names used in edited_df for cleanup
        combined_columns = set(edited_df["Column 1"]).union(set(edited_df["Column 2"]))

        # Filter those rows out of edited_stats_df and drop the columns from merged_df
        edited_stats_df = edited_stats_df[~edited_stats_df["Column"].isin(combined_columns)]
        merged_df.drop(columns=list(combined_columns), errors="ignore", inplace=True)

        return merged_df, edited_stats_df
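    # Illustrative run (hypothetical columns): an edited_df row
    # ("tv_spend", "+", "radio_spend", "Media") creates a "tv_spend+radio_spend"
    # column in merged_df, appends a stats row for it, and both source columns
    # are then dropped from merged_df and edited_stats_df.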
    # Function to prepare the list of numeric column names and initialize an
    # empty DataFrame with the predefined structure
    @st.cache_resource(show_spinner=False)
    def prepare_numeric_columns_and_default_df(merged_df, edited_stats_df):
        # Get the columns categorized as 'Response Metrics'
        columns_response_metrics = edited_stats_df[
            edited_stats_df["Category"] == "Response Metrics"
        ]["Column"].tolist()

        # Filter the numeric columns, excluding those categorized as 'Response Metrics'
        numeric_columns = [
            col
            for col in merged_df.select_dtypes(include=["number"]).columns
            if col not in columns_response_metrics
        ]

        # Define the structure of the empty DataFrame
        data = {
            "Column 1": pd.Series([], dtype="str"),
            "Operator": pd.Series([], dtype="str"),
            "Column 2": pd.Series([], dtype="str"),
            "Category": pd.Series([], dtype="str"),
        }
        default_df = pd.DataFrame(data)

        return numeric_columns, default_df
    # Initialize 'final_df' in session state
    if "final_df" not in st.session_state:
        st.session_state["final_df"] = pd.DataFrame()

    # Initialize 'bin_dict' in session state
    if "bin_dict" not in st.session_state:
        st.session_state["bin_dict"] = {}

    # Initialize 'Panel_1_Panel_2_Selected' in session state
    if "Panel_1_Panel_2_Selected" not in st.session_state:
        st.session_state["Panel_1_Panel_2_Selected"] = False

    # Page Title
    st.write("")  # Top padding
    st.title("Data Import")

    #########################################################################################################################################################
    # Create a dictionary to hold all DataFrames and collect user input to specify "Panel_1" and "Panel_2" columns for each file
    #########################################################################################################################################################

    # Read the API Excel file, parsing the 'Date' column as datetime
    main_df = read_API_data()

    # Convert all column names to lowercase
    main_df.columns = main_df.columns.str.lower().str.strip()

    # File uploader
    uploaded_files = st.file_uploader(
        "Upload additional data",
        type=["xlsx"],
        accept_multiple_files=True,
        on_change=set_Panel_1_Panel_2_Selected_false,
    )
    # Custom HTML for upload instructions
    recommendation_html = """
    <div style="text-align: justify;">
        <strong>Recommendation:</strong> For optimal processing, please ensure that all uploaded datasets (panel, media, internal, and exogenous data) adhere to the following guidelines: each dataset must include a <code>Date</code> column formatted as <code>DD-MM-YYYY</code> and be free of missing values.
    </div>
    """
    st.markdown(recommendation_html, unsafe_allow_html=True)
    # Choose Desired Granularity
    st.markdown("#### Choose Desired Granularity")

    # Granularity Selection
    granularity_selection = st.selectbox(
        "Choose Date Granularity",
        ["Daily", "Weekly", "Monthly"],
        label_visibility="collapsed",
        on_change=set_Panel_1_Panel_2_Selected_false,
    )
    granularity_selection = str(granularity_selection).lower()

    # Convert the uploaded files to DataFrames
    files_dict = files_to_dataframes(uploaded_files)

    # Add the API DataFrame
    if main_df is not None:
        files_dict = add_api_dataframe_to_dict(main_df, files_dict)

    # Display a warning message if no files have been uploaded and halt further execution
    if not files_dict:
        st.warning(
            "Please upload at least one file to proceed.",
            icon="⚠️",
        )
        st.stop()  # Halt further execution until a file is uploaded
    # Select Panel_1 and Panel_2 columns
    st.markdown("#### Select Panel columns")
    selections = {}
    with st.expander("Select Panel columns", expanded=False):
        count = 0  # Counter to manage the visibility of labels and the uniqueness of keys
        for file_name, file_data in files_dict.items():
            # Determine the visibility of the label based on the count
            if count == 0:
                label_visibility = "visible"
            else:
                label_visibility = "collapsed"

            # Extract non-numeric columns
            non_numeric_cols = file_data["non_numeric"]

            # Prepare Panel_1 and Panel_2 values for the dropdowns, adding "N/A" as an option
            panel1_values = non_numeric_cols + ["N/A"]
            panel2_values = non_numeric_cols + ["N/A"]

            # Skip if only one option is available
            if len(panel1_values) == 1 and len(panel2_values) == 1:
                selected_panel1, selected_panel2 = "N/A", "N/A"
                # Update the selections for Panel_1 and Panel_2 for the current file
                selections[file_name] = {
                    "Panel_1": selected_panel1,
                    "Panel_2": selected_panel2,
                }
                continue

            # Create layout columns for the File Name, Panel_1, and Panel_2 selections
            file_name_col, Panel_1_col, Panel_2_col = st.columns([2, 4, 4])

            with file_name_col:
                # Display the "File Name" label only for the first file
                if count == 0:
                    st.write("File Name")
                else:
                    st.write("")
                st.write(file_name)  # Display the file name

            with Panel_1_col:
                # Display a selectbox for Panel_1 values
                selected_panel1 = st.selectbox(
                    "Select Panel Level 1",
                    panel1_values,
                    on_change=set_Panel_1_Panel_2_Selected_false,
                    label_visibility=label_visibility,  # Control visibility of the label
                    key=f"Panel_1_selectbox{count}",  # Ensure a unique key for each selectbox
                )

            with Panel_2_col:
                # Display a selectbox for Panel_2 values
                selected_panel2 = st.selectbox(
                    "Select Panel Level 2",
                    panel2_values,
                    on_change=set_Panel_1_Panel_2_Selected_false,
                    label_visibility=label_visibility,  # Control visibility of the label
                    key=f"Panel_2_selectbox{count}",  # Ensure a unique key for each selectbox
                )

            # Stop processing if the same column is selected for both Panel_1 and
            # Panel_2, due to potential data integrity issues
            if selected_panel2 == selected_panel1 and not (
                selected_panel2 == "N/A" and selected_panel1 == "N/A"
            ):
                st.warning(
                    f"File: {file_name} ➜ The same column cannot serve as both Panel_1 and Panel_2. Please adjust your selections.",
                )
                selected_panel1, selected_panel2 = "N/A", "N/A"
                st.stop()

            # Update the selections for Panel_1 and Panel_2 for the current file
            selections[file_name] = {
                "Panel_1": selected_panel1,
                "Panel_2": selected_panel2,
            }

            count += 1  # Increment the counter after processing each file
    # Accept the Panel_1 and Panel_2 selections
    if st.button("Accept and Process", use_container_width=True):
        # Normalize all data to a daily granularity first; this initial
        # standardization simplifies subsequent conversions to other levels of granularity
        with st.spinner("Processing..."):
            files_dict = standardize_data_to_daily(files_dict, selections)

            # Convert all data to the selected level of granularity
            files_dict = apply_granularity_to_all(
                files_dict, granularity_selection, selections
            )

            # Update 'files_dict' in the session state
            st.session_state["files_dict"] = files_dict

            # Set a flag in the session state to indicate that a selection has been made
            st.session_state["Panel_1_Panel_2_Selected"] = True
    #########################################################################################################################################################
    # Display unique Panel_1 and Panel_2 values
    #########################################################################################################################################################

    # Halt further execution until the Panel_1 and Panel_2 columns are selected
    if "files_dict" in st.session_state and st.session_state["Panel_1_Panel_2_Selected"]:
        files_dict = st.session_state["files_dict"]
    else:
        st.stop()

    # Sets to store the unique values of Panel_1 and Panel_2
    with st.spinner("Fetching Panel values..."):
        all_panel1_values, all_panel2_values = clean_and_extract_unique_values(
            files_dict, selections
        )

    # Lists of the unique Panel_1 and Panel_2 values
    list_of_all_panel1_values = list(all_panel1_values)
    list_of_all_panel2_values = list(all_panel2_values)

    # Format the Panel_1 and Panel_2 values for display
    formatted_panel1_values = format_values_for_display(list_of_all_panel1_values)
    formatted_panel2_values = format_values_for_display(list_of_all_panel2_values)

    # Unique Panel_1 and Panel_2 values
    st.markdown("#### Unique Panel values")

    # Display the Panel_1 and Panel_2 values
    with st.expander("Unique Panel values"):
        st.write("")
        st.markdown(
            f"""
            <style>
            .justify-text {{
                text-align: justify;
            }}
            </style>
            <div class="justify-text">
                <strong>Panel Level 1 Values:</strong> {formatted_panel1_values}<br>
                <strong>Panel Level 2 Values:</strong> {formatted_panel2_values}
            </div>
            """,
            unsafe_allow_html=True,
        )

        # Display the total number of Panel_1 and Panel_2 values
        st.write("")
        st.markdown(
            f"""
            <div style="text-align: justify;">
                <strong>Number of Level 1 Panels detected:</strong> {len(list_of_all_panel1_values)}<br>
                <strong>Number of Level 2 Panels detected:</strong> {len(list_of_all_panel2_values)}
            </div>
            """,
            unsafe_allow_html=True,
        )
        st.write("")
    #########################################################################################################################################################
    # Merge all DataFrames
    #########################################################################################################################################################

    # Merge all selected DataFrames
    main_df = create_main_dataframe(
        files_dict, all_panel1_values, all_panel2_values, granularity_selection
    )
    merged_df = merge_into_main_df(main_df, files_dict, selections)

    #########################################################################################################################################################
    # Categorize Variables and Impute Missing Values
    #########################################################################################################################################################

    # Create an editable DataFrame in Streamlit
    st.markdown("#### Select Variables Category & Impute Missing Values")

    # Prepare the missing stats DataFrame for editing
    missing_stats_df = prepare_missing_stats_df(merged_df)

    edited_stats_df = st.data_editor(
        missing_stats_df,
        column_config={
            "Impute Method": st.column_config.SelectboxColumn(
                options=[
                    "Drop Column",
                    "Fill with Mean",
                    "Fill with Median",
                    "Fill with 0",
                ],
                required=True,
                default="Fill with 0",
            ),
            "Category": st.column_config.SelectboxColumn(
                options=[
                    "Media",
                    "Exogenous",
                    "Internal",
                    "Response Metrics",
                ],
                required=True,
                default="Media",
            ),
        },
        disabled=["Column", "Missing Values", "Missing Percentage"],
        hide_index=True,
        use_container_width=True,
    )

    # Apply changes based on the edited DataFrame
    for i, row in edited_stats_df.iterrows():
        column = row["Column"]
        if row["Impute Method"] == "Drop Column":
            merged_df.drop(columns=[column], inplace=True)
        elif row["Impute Method"] == "Fill with Mean":
            merged_df[column].fillna(merged_df[column].mean(), inplace=True)
        elif row["Impute Method"] == "Fill with Median":
            merged_df[column].fillna(merged_df[column].median(), inplace=True)
        elif row["Impute Method"] == "Fill with 0":
            merged_df[column].fillna(0, inplace=True)
    #########################################################################################################################################################
    # Group columns
    #########################################################################################################################################################

    # Display the Feature engineering header
    st.markdown("#### Feature engineering")

    # Prepare the numeric columns and an empty DataFrame for user input
    numeric_columns, default_df = prepare_numeric_columns_and_default_df(
        merged_df, edited_stats_df
    )

    # Display the editable DataFrame
    edited_df = st.data_editor(
        default_df,
        column_config={
            "Column 1": st.column_config.SelectboxColumn(
                options=numeric_columns,
                required=True,
                default=numeric_columns[0],
                width=400,
            ),
            "Operator": st.column_config.SelectboxColumn(
                options=["+", "-", "*", "/"],
                required=True,
                default="+",
                width=100,
            ),
            "Column 2": st.column_config.SelectboxColumn(
                options=numeric_columns,
                required=True,
                default=numeric_columns[0],
                width=400,
            ),
            "Category": st.column_config.SelectboxColumn(
                options=[
                    "Media",
                    "Exogenous",
                    "Internal",
                    "Response Metrics",
                ],
                required=True,
                default="Media",
                width=200,
            ),
        },
        num_rows="dynamic",
    )

    # Process the DataFrame based on the user inputs and operations specified in edited_df
    final_df, edited_stats_df = process_dataframes(merged_df, edited_df, edited_stats_df)
    #########################################################################################################################################################
    # Display the final DataFrame and variables
    #########################################################################################################################################################

    # Display the final DataFrame
    st.markdown("#### Final DataFrame")
    st.dataframe(final_df, hide_index=True)

    # Initialize an empty dictionary to hold the categories and their variables
    category_dict = {}

    # Iterate over each row in the edited DataFrame to populate the dictionary
    for i, row in edited_stats_df.iterrows():
        column = row["Column"]
        category = row["Category"]  # The category chosen by the user for this variable

        # Check if the category already exists in the dictionary
        if category not in category_dict:
            # If not, initialize it with the current column as its first element
            category_dict[category] = [column]
        else:
            # If it exists, append the current column to the list of variables under this category
            category_dict[category].append(column)

    # Add Date, Panel_1 and Panel_2 to the category dictionary
    category_dict.update({"Date": ["date"]})
    if "Panel_1" in final_df.columns:
        category_dict["Panel Level 1"] = ["Panel_1"]
    if "Panel_2" in final_df.columns:
        category_dict["Panel Level 2"] = ["Panel_2"]

    # Display the dictionary
    st.markdown("#### Variable Category")
    for category, variables in category_dict.items():
        # Check if there are multiple variables, to insert "and" correctly
        if len(variables) > 1:
            # Join all but the last variable with ", ", then add " and " before the last one
            variables_str = ", ".join(variables[:-1]) + " and " + variables[-1]
        else:
            # If there is only one variable, no "and" is needed
            variables_str = variables[0]

        # Display the category and its variables in the desired format
        st.markdown(
            f"<div style='text-align: justify;'><strong>{category}:</strong> {variables_str}</div>",
            unsafe_allow_html=True,
        )

    # Check that a Response Metrics column has been selected
    st.write("")
    response_metrics_col = category_dict.get("Response Metrics", [])
    if len(response_metrics_col) == 0:
        st.warning("Please select a Response Metrics column", icon="⚠️")
        st.stop()
    # elif len(response_metrics_col) > 1:
    #     st.warning("Please select only one Response Metrics column", icon="⚠️")
    #     st.stop()

    # Store the final DataFrame and the bin dictionary in the session state
    st.session_state["final_df"], st.session_state["bin_dict"] = final_df, category_dict

    # Save the DataFrame and dictionary from the session state to the pickle file
    if st.button("Accept and Save", use_container_width=True):
        save_to_pickle(
            "data_import.pkl", st.session_state["final_df"], st.session_state["bin_dict"]
        )
        st.toast("💾 Saved Successfully!")