# Importing necessary libraries
import streamlit as st

st.set_page_config(
    page_title="Data Import",
    page_icon=":shark:",
    layout="wide",
    initial_sidebar_state="collapsed",
)

import pickle
import pandas as pd
from utilities import set_header, load_local_css
import streamlit_authenticator as stauth
import yaml
from yaml import SafeLoader

load_local_css("styles.css")
set_header()
# Re-assign each persisted key so widget state survives page switches
for k, v in st.session_state.items():
    if k not in ["logout", "login", "config"] and not k.startswith(
        "FormSubmitter"
    ):
        st.session_state[k] = v

# Load the authentication configuration
with open("config.yaml") as file:
    config = yaml.load(file, Loader=SafeLoader)
st.session_state["config"] = config

authenticator = stauth.Authenticate(
    config["credentials"],
    config["cookie"]["name"],
    config["cookie"]["key"],
    config["cookie"]["expiry_days"],
    config["preauthorized"],
)
st.session_state["authenticator"] = authenticator

name, authentication_status, username = authenticator.login("Login", "main")
auth_status = st.session_state.get("authentication_status")

if auth_status:
    authenticator.logout("Logout", "main")

    is_state_initialized = st.session_state.get("initialized", False)
    if not is_state_initialized:
        if "session_name" not in st.session_state:
            st.session_state["session_name"] = None
    # Function to validate the date column in a DataFrame
    def validate_date_column(df):
        try:
            # Attempt to convert the 'date' column to datetime
            df["date"] = pd.to_datetime(df["date"], format="%d-%m-%Y")
            return True
        except Exception:
            return False
    # Function to determine data interval
    def determine_data_interval(common_freq):
        if common_freq == 1:
            return "daily"
        elif common_freq == 7:
            return "weekly"
        elif 28 <= common_freq <= 31:
            return "monthly"
        else:
            return "irregular"
    # Function to read each uploaded Excel file into a pandas DataFrame and store them in a dictionary
    @st.cache_resource(show_spinner=False)
    def files_to_dataframes(uploaded_files):
        df_dict = {}
        for uploaded_file in uploaded_files:
            # Extract the file name without its extension
            file_name = uploaded_file.name.rsplit(".", 1)[0]

            # Check for duplicate file names
            if file_name in df_dict:
                st.warning(
                    f"Duplicate File: {file_name}. This file will be skipped.",
                    icon="⚠️",
                )
                continue

            # Read the file into a DataFrame
            df = pd.read_excel(uploaded_file)

            # Convert all column names to lowercase
            df.columns = df.columns.str.lower().str.strip()

            # Separate numeric and non-numeric columns
            numeric_cols = list(df.select_dtypes(include=["number"]).columns)
            non_numeric_cols = [
                col
                for col in df.select_dtypes(exclude=["number"]).columns
                if col.lower() != "date"
            ]

            # Check for a valid 'date' column and at least one numeric column
            if not (validate_date_column(df) and len(numeric_cols) > 0):
                st.warning(
                    f"File Name: {file_name} ➜ Please upload data with a date column in 'DD-MM-YYYY' format and at least one media/exogenous column. This file will be skipped.",
                    icon="⚠️",
                )
                continue

            # Determine the most common gap (in days) between consecutive dates
            common_freq = (
                pd.Series(df["date"].unique()).diff().dt.days.dropna().mode()[0]
            )

            # Calculate the data interval (daily, weekly, monthly or irregular)
            interval = determine_data_interval(common_freq)
            if interval == "irregular":
                st.warning(
                    f"File Name: {file_name} ➜ Please upload data at a daily, weekly or monthly interval. This file will be skipped.",
                    icon="⚠️",
                )
                continue

            # Store the DataFrame and its metadata in the dictionary under the file name
            df_dict[file_name] = {
                "numeric": numeric_cols,
                "non_numeric": non_numeric_cols,
                "interval": interval,
                "df": df,
            }

        return df_dict
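    # The returned dictionary maps each file name to its metadata and data.
    # Illustrative entry (column names here are hypothetical):
    #   df_dict["spend_data"] = {
    #       "numeric": ["tv_spend", "radio_spend"],
    #       "non_numeric": ["region"],
    #       "interval": "weekly",
    #       "df": <DataFrame>,
    #   }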
    # Function to adjust dataframe granularity
    def adjust_dataframe_granularity(df, current_granularity, target_granularity):
        # Set index
        df.set_index("date", inplace=True)

        # Define aggregation rules for resampling
        aggregation_rules = {
            col: "sum" if pd.api.types.is_numeric_dtype(df[col]) else "first"
            for col in df.columns
        }

        # Initialize resampled_df
        resampled_df = df
        if current_granularity == "daily" and target_granularity == "weekly":
            resampled_df = df.resample("W-MON", closed="left", label="left").agg(
                aggregation_rules
            )
        elif current_granularity == "daily" and target_granularity == "monthly":
            resampled_df = df.resample("MS", closed="left", label="left").agg(
                aggregation_rules
            )
        elif current_granularity == "daily" and target_granularity == "daily":
            resampled_df = df.resample("D").agg(aggregation_rules)
        elif current_granularity in ["weekly", "monthly"] and target_granularity == "daily":
            # For higher-to-lower granularity, distribute numeric values equally
            # and replicate non-numeric values across the new period
            expanded_data = []
            for _, row in df.iterrows():
                if current_granularity == "weekly":
                    period_range = pd.date_range(start=row.name, periods=7)
                elif current_granularity == "monthly":
                    period_range = pd.date_range(
                        start=row.name, periods=row.name.days_in_month
                    )
                for date in period_range:
                    new_row = {}
                    for col in df.columns:
                        if pd.api.types.is_numeric_dtype(df[col]):
                            if current_granularity == "weekly":
                                new_row[col] = row[col] / 7
                            elif current_granularity == "monthly":
                                new_row[col] = row[col] / row.name.days_in_month
                        else:
                            new_row[col] = row[col]
                    expanded_data.append((date, new_row))

            resampled_df = pd.DataFrame(
                [data for _, data in expanded_data],
                index=[date for date, _ in expanded_data],
            )

        # Reset index
        resampled_df = resampled_df.reset_index().rename(columns={"index": "date"})

        return resampled_df
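    # Sketch of the weekly -> daily expansion above (hypothetical values): a week
    # labelled 01-01-2024 with tv_spend=70 becomes seven daily rows dated
    # 01-01-2024 through 07-01-2024, each carrying tv_spend=10.0, while
    # non-numeric values are replicated unchanged across the seven days.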
    # Function to clean and extract the unique values of Panel_1 and Panel_2
    @st.cache_resource(show_spinner=False)
    def clean_and_extract_unique_values(files_dict, selections):
        all_panel1_values = set()
        all_panel2_values = set()
        for file_name, file_data in files_dict.items():
            df = file_data["df"]

            # 'Panel_1' and 'Panel_2' selections
            selected_panel1 = selections[file_name].get("Panel_1")
            selected_panel2 = selections[file_name].get("Panel_2")

            # Clean and standardize the Panel_1 column if it exists and is selected
            if (
                selected_panel1
                and selected_panel1 != "N/A"
                and selected_panel1 in df.columns
            ):
                df[selected_panel1] = (
                    df[selected_panel1].str.lower().str.strip().str.replace("_", " ")
                )
                all_panel1_values.update(df[selected_panel1].dropna().unique())

            # Clean and standardize the Panel_2 column if it exists and is selected
            if (
                selected_panel2
                and selected_panel2 != "N/A"
                and selected_panel2 in df.columns
            ):
                df[selected_panel2] = (
                    df[selected_panel2].str.lower().str.strip().str.replace("_", " ")
                )
                all_panel2_values.update(df[selected_panel2].dropna().unique())

            # Update the processed DataFrame back in the dictionary
            files_dict[file_name]["df"] = df

        return all_panel1_values, all_panel2_values
    # Function to format values for display
    @st.cache_resource(show_spinner=False)
    def format_values_for_display(values_list):
        # Capitalize the first letter of each word and replace underscores with spaces
        formatted_list = [value.replace("_", " ").title() for value in values_list]
        # Join values with commas and 'and' before the last value
        if len(formatted_list) > 1:
            return ", ".join(formatted_list[:-1]) + ", and " + formatted_list[-1]
        elif formatted_list:
            return formatted_list[0]
        return "No values available"
    # Function to normalize all data within files_dict to a daily granularity
    @st.cache_resource(show_spinner=False)
    def standardize_data_to_daily(files_dict, selections):
        # Normalize all data to a daily granularity using the shared helper
        files_dict = apply_granularity_to_all(files_dict, "daily", selections)

        # Update the "interval" attribute of each dataset to reflect the new granularity
        for files_name, files_data in files_dict.items():
            files_data["interval"] = "daily"

        return files_dict
    # Function to apply the granularity transformation to all DataFrames in files_dict
    @st.cache_resource(show_spinner=False)
    def apply_granularity_to_all(files_dict, granularity_selection, selections):
        for file_name, file_data in files_dict.items():
            df = file_data["df"].copy()

            # Handle cases where Panel_1 or Panel_2 might be 'N/A'
            selected_panel1 = selections[file_name].get("Panel_1")
            selected_panel2 = selections[file_name].get("Panel_2")

            # Select the segmentation columns, handling 'N/A'
            if selected_panel1 != "N/A" and selected_panel2 != "N/A":
                unique_combinations = df[
                    [selected_panel1, selected_panel2]
                ].drop_duplicates()
            elif selected_panel1 != "N/A":
                unique_combinations = df[[selected_panel1]].drop_duplicates()
                selected_panel2 = None  # Ensure Panel_2 is ignored if N/A
            elif selected_panel2 != "N/A":
                unique_combinations = df[[selected_panel2]].drop_duplicates()
                selected_panel1 = None  # Ensure Panel_1 is ignored if N/A
            else:
                # If both are 'N/A', process the entire DataFrame as is
                df = adjust_dataframe_granularity(
                    df, file_data["interval"], granularity_selection
                )
                files_dict[file_name]["df"] = df
                continue  # Skip to the next file

            transformed_segments = []
            for _, combo in unique_combinations.iterrows():
                if selected_panel1 and selected_panel2:
                    segment = df[
                        (df[selected_panel1] == combo[selected_panel1])
                        & (df[selected_panel2] == combo[selected_panel2])
                    ]
                elif selected_panel1:
                    segment = df[df[selected_panel1] == combo[selected_panel1]]
                elif selected_panel2:
                    segment = df[df[selected_panel2] == combo[selected_panel2]]

                # Adjust the granularity of the segment
                transformed_segment = adjust_dataframe_granularity(
                    segment, file_data["interval"], granularity_selection
                )
                transformed_segments.append(transformed_segment)

            # Combine all transformed segments into a single DataFrame for this file
            transformed_df = pd.concat(transformed_segments, ignore_index=True)
            files_dict[file_name]["df"] = transformed_df

        return files_dict
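    # Illustrative behaviour: with Panel_1 = "region" holding values {"east", "west"}
    # (hypothetical), each region's rows are resampled independently and then
    # re-concatenated, so weekly or monthly sums are computed per panel rather
    # than across the whole file.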
    # Function to create the main DataFrame structure
    @st.cache_resource(show_spinner=False)
    def create_main_dataframe(
        files_dict, all_panel1_values, all_panel2_values, granularity_selection
    ):
        # Determine the global start and end dates across all DataFrames
        global_start = min(df["df"]["date"].min() for df in files_dict.values())
        global_end = max(df["df"]["date"].max() for df in files_dict.values())

        # Adjust the date_range generation based on the granularity_selection
        if granularity_selection == "weekly":
            # Generate a weekly range, with weeks starting on Monday
            date_range = pd.date_range(start=global_start, end=global_end, freq="W-MON")
        elif granularity_selection == "monthly":
            # Generate a monthly range, starting from the first day of each month
            date_range = pd.date_range(start=global_start, end=global_end, freq="MS")
        else:  # Default to daily if not weekly or monthly
            date_range = pd.date_range(start=global_start, end=global_end, freq="D")

        # Collect all unique Panel_1 and Panel_2 values, excluding 'N/A'
        all_panel1s = all_panel1_values
        all_panel2s = all_panel2_values

        # Dynamically build the list of dimensions (Panel_1, Panel_2) to include
        # in the main DataFrame based on availability
        dimensions, merge_keys = [], []
        if all_panel1s:
            dimensions.append(all_panel1s)
            merge_keys.append("Panel_1")
        if all_panel2s:
            dimensions.append(all_panel2s)
            merge_keys.append("Panel_2")

        dimensions.append(date_range)  # The date range is always included
        merge_keys.append("date")

        # Create a main DataFrame template with the dimensions
        main_df = pd.MultiIndex.from_product(
            dimensions,
            names=[name for name, _ in zip(merge_keys, dimensions)],
        ).to_frame(index=False)

        return main_df.reset_index(drop=True)
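    # Illustrative shape of the template: with Panel_1 values {"a", "b"} and a
    # two-date range (hypothetical), the cross product yields one row per
    # (Panel_1, date) pair, i.e. 2 x 2 = 4 rows, onto which the uploaded data
    # is later left-merged.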
    # Function to prepare and merge DataFrames
    @st.cache_resource(show_spinner=False)
    def merge_into_main_df(main_df, files_dict, selections):
        for file_name, file_data in files_dict.items():
            df = file_data["df"].copy()

            # Rename the selected Panel_1 and Panel_2 columns if not 'N/A'
            selected_panel1 = selections[file_name].get("Panel_1", "N/A")
            selected_panel2 = selections[file_name].get("Panel_2", "N/A")
            if selected_panel1 != "N/A":
                df.rename(columns={selected_panel1: "Panel_1"}, inplace=True)
            if selected_panel2 != "N/A":
                df.rename(columns={selected_panel2: "Panel_2"}, inplace=True)

            # Merge the current DataFrame into main_df on 'date', and where
            # applicable, 'Panel_1' and 'Panel_2'
            merge_keys = ["date"]
            if "Panel_1" in df.columns:
                merge_keys.append("Panel_1")
            if "Panel_2" in df.columns:
                merge_keys.append("Panel_2")
            main_df = pd.merge(main_df, df, on=merge_keys, how="left")

        # After all merges, sort by 'date' and reset the index for cleanliness
        sort_by = ["date"]
        if "Panel_1" in main_df.columns:
            sort_by.append("Panel_1")
        if "Panel_2" in main_df.columns:
            sort_by.append("Panel_2")
        main_df.sort_values(by=sort_by, inplace=True)
        main_df.reset_index(drop=True, inplace=True)

        return main_df
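    # Illustrative merge keys: a file whose Panel_1 column was selected merges on
    # ["date", "Panel_1"]; a file with no panel selections merges on ["date"]
    # alone, so its metrics repeat across every panel row for that date.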
    # Function to categorize a column
    def categorize_column(column_name):
        # Define keywords for each category
        internal_keywords = [
            "Price",
            "Discount",
            "product_price",
            "cost",
            "margin",
            "inventory",
            "sales",
            "revenue",
            "turnover",
            "expense",
        ]
        exogenous_keywords = [
            "GDP",
            "Tax",
            "Inflation",
            "interest_rate",
            "employment_rate",
            "exchange_rate",
            "consumer_spending",
            "retail_sales",
            "oil_prices",
            "weather",
        ]

        # Check if the column name matches any keyword for the Internal or Exogenous categories
        for keyword in internal_keywords:
            if keyword.lower() in column_name.lower():
                return "Internal"
        for keyword in exogenous_keywords:
            if keyword.lower() in column_name.lower():
                return "Exogenous"

        # Default to Media if no match is found
        return "Media"
    # Function to calculate missing-value stats and prepare the editable DataFrame
    @st.cache_resource(show_spinner=False)
    def prepare_missing_stats_df(df):
        missing_stats = []
        for column in df.columns:
            if column in ("date", "Panel_1", "Panel_2"):
                # Skip the date, Panel_1 and Panel_2 columns
                continue

            missing = df[column].isnull().sum()
            pct_missing = round((missing / len(df)) * 100, 2)

            # Dynamically assign a category based on the column name
            category = categorize_column(column)

            missing_stats.append(
                {
                    "Column": column,
                    "Missing Values": missing,
                    "Missing Percentage": pct_missing,
                    "Impute Method": "Fill with 0",  # Default value
                    "Category": category,
                }
            )
        stats_df = pd.DataFrame(missing_stats)

        return stats_df
    # Function to add the API DataFrame's details to the files dictionary
    @st.cache_resource(show_spinner=False)
    def add_api_dataframe_to_dict(main_df, files_dict):
        files_dict["API"] = {
            "numeric": list(main_df.select_dtypes(include=["number"]).columns),
            "non_numeric": [
                col
                for col in main_df.select_dtypes(exclude=["number"]).columns
                if col.lower() != "date"
            ],
            "interval": determine_data_interval(
                pd.Series(main_df["date"].unique()).diff().dt.days.dropna().mode()[0]
            ),
            "df": main_df,
        }

        return files_dict
    # Function to read the API data into a DataFrame, parsing the 'Date' column as datetime
    def read_API_data():
        return pd.read_excel(
            "upf_data_converted_randomized_resp_metrics.xlsx", parse_dates=["Date"]
        )

    # Function to set the 'Panel_1_Panel_2_Selected' session state variable to False
    def set_Panel_1_Panel_2_Selected_false():
        st.session_state["Panel_1_Panel_2_Selected"] = False

    # Function to serialize and save the objects into a pickle file
    def save_to_pickle(file_path, final_df, bin_dict):
        # Open the file in write-binary mode and dump the objects
        with open(file_path, "wb") as f:
            pickle.dump({"final_df": final_df, "bin_dict": bin_dict}, f)
        # Data is now saved to file
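    # Counterpart load (illustrative; assumes downstream pages read the same file):
    #   with open("data_import.pkl", "rb") as f:
    #       saved = pickle.load(f)
    #   final_df, bin_dict = saved["final_df"], saved["bin_dict"]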
    # Function to process the merged_df DataFrame based on the operations defined in edited_df
    def process_dataframes(merged_df, edited_df, edited_stats_df):
        # Ensure there are operations defined by the user
        if edited_df.empty:
            return merged_df, edited_stats_df  # No operations to apply

        # Perform the operations defined by the user
        for index, row in edited_df.iterrows():
            result_column_name = f"{row['Column 1']}{row['Operator']}{row['Column 2']}"
            col1 = row["Column 1"]
            col2 = row["Column 2"]
            op = row["Operator"]

            # Apply the specified operation
            if op == "+":
                merged_df[result_column_name] = merged_df[col1] + merged_df[col2]
            elif op == "-":
                merged_df[result_column_name] = merged_df[col1] - merged_df[col2]
            elif op == "*":
                merged_df[result_column_name] = merged_df[col1] * merged_df[col2]
            elif op == "/":
                # Replace zeros in the divisor to avoid division-by-zero errors
                merged_df[result_column_name] = merged_df[col1] / merged_df[col2].replace(
                    0, 1e-9
                )

            # Add a summary of the operation to edited_stats_df
            new_row = {
                "Column": result_column_name,
                "Missing Values": None,
                "Missing Percentage": None,
                "Impute Method": None,
                "Category": row["Category"],
            }
            new_row_df = pd.DataFrame([new_row])

            # Use pd.concat to append new_row_df to edited_stats_df
            edited_stats_df = pd.concat(
                [edited_stats_df, new_row_df], ignore_index=True, axis=0
            )

        # Combine the column names used in edited_df for cleanup
        combined_columns = set(edited_df["Column 1"]).union(set(edited_df["Column 2"]))

        # Filter those rows out of edited_stats_df and drop the columns from merged_df
        edited_stats_df = edited_stats_df[~edited_stats_df["Column"].isin(combined_columns)]
        merged_df.drop(columns=list(combined_columns), errors="ignore", inplace=True)

        return merged_df, edited_stats_df
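    # Illustrative run (hypothetical columns): an edited_df row
    # ("tv_spend", "+", "radio_spend", "Media") creates a "tv_spend+radio_spend"
    # column in merged_df, appends a stats row for it, and both source columns
    # are then dropped from merged_df and edited_stats_df.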
    # Function to prepare the list of numeric column names and initialize an
    # empty DataFrame with the predefined structure
    @st.cache_resource(show_spinner=False)
    def prepare_numeric_columns_and_default_df(merged_df, edited_stats_df):
        # Get the columns categorized as 'Response Metrics'
        columns_response_metrics = edited_stats_df[
            edited_stats_df["Category"] == "Response Metrics"
        ]["Column"].tolist()

        # Filter the numeric columns, excluding those categorized as 'Response Metrics'
        numeric_columns = [
            col
            for col in merged_df.select_dtypes(include=["number"]).columns
            if col not in columns_response_metrics
        ]

        # Define the structure of the empty DataFrame
        data = {
            "Column 1": pd.Series([], dtype="str"),
            "Operator": pd.Series([], dtype="str"),
            "Column 2": pd.Series([], dtype="str"),
            "Category": pd.Series([], dtype="str"),
        }
        default_df = pd.DataFrame(data)

        return numeric_columns, default_df
    # Initialize 'final_df' in session state
    if "final_df" not in st.session_state:
        st.session_state["final_df"] = pd.DataFrame()

    # Initialize 'bin_dict' in session state
    if "bin_dict" not in st.session_state:
        st.session_state["bin_dict"] = {}

    # Initialize 'Panel_1_Panel_2_Selected' in session state
    if "Panel_1_Panel_2_Selected" not in st.session_state:
        st.session_state["Panel_1_Panel_2_Selected"] = False

    # Page Title
    st.write("")  # Top padding
    st.title("Data Import")

    #########################################################################################################################################################
    # Create a dictionary to hold all DataFrames and collect user input to specify "Panel_1" and "Panel_2" columns for each file
    #########################################################################################################################################################

    # Read the API Excel file, parsing the 'Date' column as datetime
    main_df = read_API_data()

    # Convert all column names to lowercase
    main_df.columns = main_df.columns.str.lower().str.strip()

    # File uploader
    uploaded_files = st.file_uploader(
        "Upload additional data",
        type=["xlsx"],
        accept_multiple_files=True,
        on_change=set_Panel_1_Panel_2_Selected_false,
    )
    # Custom HTML for upload instructions
    recommendation_html = """
    <div style="text-align: justify;">
        <strong>Recommendation:</strong> For optimal processing, please ensure that all uploaded datasets (panel, media, internal, and exogenous data) adhere to the following guidelines: each dataset must include a <code>Date</code> column formatted as <code>DD-MM-YYYY</code> and be free of missing values.
    </div>
    """
    st.markdown(recommendation_html, unsafe_allow_html=True)
    # Choose Desired Granularity
    st.markdown("#### Choose Desired Granularity")

    # Granularity Selection
    granularity_selection = st.selectbox(
        "Choose Date Granularity",
        ["Daily", "Weekly", "Monthly"],
        label_visibility="collapsed",
        on_change=set_Panel_1_Panel_2_Selected_false,
    )
    granularity_selection = str(granularity_selection).lower()

    # Convert the uploaded files to DataFrames
    files_dict = files_to_dataframes(uploaded_files)

    # Add the API DataFrame
    if main_df is not None:
        files_dict = add_api_dataframe_to_dict(main_df, files_dict)

    # Display a warning message if no files have been uploaded and halt further execution
    if not files_dict:
        st.warning(
            "Please upload at least one file to proceed.",
            icon="⚠️",
        )
        st.stop()  # Halt further execution until a file is uploaded
    # Select Panel_1 and Panel_2 columns
    st.markdown("#### Select Panel columns")
    selections = {}
    with st.expander("Select Panel columns", expanded=False):
        count = 0  # Counter to manage the visibility of labels and the uniqueness of keys
        for file_name, file_data in files_dict.items():
            # Determine the visibility of the label based on the count
            if count == 0:
                label_visibility = "visible"
            else:
                label_visibility = "collapsed"

            # Extract non-numeric columns
            non_numeric_cols = file_data["non_numeric"]

            # Prepare Panel_1 and Panel_2 values for the dropdowns, adding "N/A" as an option
            panel1_values = non_numeric_cols + ["N/A"]
            panel2_values = non_numeric_cols + ["N/A"]

            # Skip if only one option is available
            if len(panel1_values) == 1 and len(panel2_values) == 1:
                selected_panel1, selected_panel2 = "N/A", "N/A"
                # Update the selections for Panel_1 and Panel_2 for the current file
                selections[file_name] = {
                    "Panel_1": selected_panel1,
                    "Panel_2": selected_panel2,
                }
                continue

            # Create layout columns for the File Name, Panel_1, and Panel_2 selections
            file_name_col, Panel_1_col, Panel_2_col = st.columns([2, 4, 4])

            with file_name_col:
                # Display the "File Name" label only for the first file
                if count == 0:
                    st.write("File Name")
                else:
                    st.write("")
                st.write(file_name)  # Display the file name

            with Panel_1_col:
                # Display a selectbox for Panel_1 values
                selected_panel1 = st.selectbox(
                    "Select Panel Level 1",
                    panel1_values,
                    on_change=set_Panel_1_Panel_2_Selected_false,
                    label_visibility=label_visibility,  # Control visibility of the label
                    key=f"Panel_1_selectbox{count}",  # Ensure a unique key for each selectbox
                )

            with Panel_2_col:
                # Display a selectbox for Panel_2 values
                selected_panel2 = st.selectbox(
                    "Select Panel Level 2",
                    panel2_values,
                    on_change=set_Panel_1_Panel_2_Selected_false,
                    label_visibility=label_visibility,  # Control visibility of the label
                    key=f"Panel_2_selectbox{count}",  # Ensure a unique key for each selectbox
                )

            # Stop processing if the same column is selected for both Panel_1 and
            # Panel_2, due to potential data integrity issues
            if selected_panel2 == selected_panel1 and not (
                selected_panel2 == "N/A" and selected_panel1 == "N/A"
            ):
                st.warning(
                    f"File: {file_name} ➜ The same column cannot serve as both Panel_1 and Panel_2. Please adjust your selections.",
                )
                selected_panel1, selected_panel2 = "N/A", "N/A"
                st.stop()

            # Update the selections for Panel_1 and Panel_2 for the current file
            selections[file_name] = {
                "Panel_1": selected_panel1,
                "Panel_2": selected_panel2,
            }

            count += 1  # Increment the counter after processing each file
    # Accept the Panel_1 and Panel_2 selections
    if st.button("Accept and Process", use_container_width=True):
        # Normalize all data to a daily granularity first; this initial
        # standardization simplifies subsequent conversions to other levels of granularity
        with st.spinner("Processing..."):
            files_dict = standardize_data_to_daily(files_dict, selections)

            # Convert all data to the selected level of granularity
            files_dict = apply_granularity_to_all(
                files_dict, granularity_selection, selections
            )

            # Update 'files_dict' in the session state
            st.session_state["files_dict"] = files_dict

            # Set a flag in the session state to indicate that a selection has been made
            st.session_state["Panel_1_Panel_2_Selected"] = True
    #########################################################################################################################################################
    # Display unique Panel_1 and Panel_2 values
    #########################################################################################################################################################

    # Halt further execution until the Panel_1 and Panel_2 columns are selected
    if "files_dict" in st.session_state and st.session_state["Panel_1_Panel_2_Selected"]:
        files_dict = st.session_state["files_dict"]
    else:
        st.stop()

    # Sets to store the unique values of Panel_1 and Panel_2
    with st.spinner("Fetching Panel values..."):
        all_panel1_values, all_panel2_values = clean_and_extract_unique_values(
            files_dict, selections
        )

    # Lists of the unique Panel_1 and Panel_2 values
    list_of_all_panel1_values = list(all_panel1_values)
    list_of_all_panel2_values = list(all_panel2_values)

    # Format the Panel_1 and Panel_2 values for display
    formatted_panel1_values = format_values_for_display(list_of_all_panel1_values)
    formatted_panel2_values = format_values_for_display(list_of_all_panel2_values)

    # Unique Panel_1 and Panel_2 values
    st.markdown("#### Unique Panel values")

    # Display the Panel_1 and Panel_2 values
    with st.expander("Unique Panel values"):
        st.write("")
        st.markdown(
            f"""
            <style>
            .justify-text {{
                text-align: justify;
            }}
            </style>
            <div class="justify-text">
                <strong>Panel Level 1 Values:</strong> {formatted_panel1_values}<br>
                <strong>Panel Level 2 Values:</strong> {formatted_panel2_values}
            </div>
            """,
            unsafe_allow_html=True,
        )

        # Display the total number of Panel_1 and Panel_2 values
        st.write("")
        st.markdown(
            f"""
            <div style="text-align: justify;">
                <strong>Number of Level 1 Panels detected:</strong> {len(list_of_all_panel1_values)}<br>
                <strong>Number of Level 2 Panels detected:</strong> {len(list_of_all_panel2_values)}
            </div>
            """,
            unsafe_allow_html=True,
        )
        st.write("")
    #########################################################################################################################################################
    # Merge all DataFrames
    #########################################################################################################################################################

    # Merge all selected DataFrames
    main_df = create_main_dataframe(
        files_dict, all_panel1_values, all_panel2_values, granularity_selection
    )
    merged_df = merge_into_main_df(main_df, files_dict, selections)

    #########################################################################################################################################################
    # Categorize Variables and Impute Missing Values
    #########################################################################################################################################################

    # Create an editable DataFrame in Streamlit
    st.markdown("#### Select Variables Category & Impute Missing Values")

    # Prepare the missing stats DataFrame for editing
    missing_stats_df = prepare_missing_stats_df(merged_df)

    edited_stats_df = st.data_editor(
        missing_stats_df,
        column_config={
            "Impute Method": st.column_config.SelectboxColumn(
                options=[
                    "Drop Column",
                    "Fill with Mean",
                    "Fill with Median",
                    "Fill with 0",
                ],
                required=True,
                default="Fill with 0",
            ),
            "Category": st.column_config.SelectboxColumn(
                options=[
                    "Media",
                    "Exogenous",
                    "Internal",
                    "Response Metrics",
                ],
                required=True,
                default="Media",
            ),
        },
        disabled=["Column", "Missing Values", "Missing Percentage"],
        hide_index=True,
        use_container_width=True,
    )

    # Apply changes based on the edited DataFrame
    for i, row in edited_stats_df.iterrows():
        column = row["Column"]
        if row["Impute Method"] == "Drop Column":
            merged_df.drop(columns=[column], inplace=True)
        elif row["Impute Method"] == "Fill with Mean":
            merged_df[column].fillna(merged_df[column].mean(), inplace=True)
        elif row["Impute Method"] == "Fill with Median":
            merged_df[column].fillna(merged_df[column].median(), inplace=True)
        elif row["Impute Method"] == "Fill with 0":
            merged_df[column].fillna(0, inplace=True)
    #########################################################################################################################################################
    # Group columns
    #########################################################################################################################################################

    # Display the Feature engineering header
    st.markdown("#### Feature engineering")

    # Prepare the numeric columns and an empty DataFrame for user input
    numeric_columns, default_df = prepare_numeric_columns_and_default_df(
        merged_df, edited_stats_df
    )

    # Display the editable DataFrame
    edited_df = st.data_editor(
        default_df,
        column_config={
            "Column 1": st.column_config.SelectboxColumn(
                options=numeric_columns,
                required=True,
                default=numeric_columns[0],
                width=400,
            ),
            "Operator": st.column_config.SelectboxColumn(
                options=["+", "-", "*", "/"],
                required=True,
                default="+",
                width=100,
            ),
            "Column 2": st.column_config.SelectboxColumn(
                options=numeric_columns,
                required=True,
                default=numeric_columns[0],
                width=400,
            ),
            "Category": st.column_config.SelectboxColumn(
                options=[
                    "Media",
                    "Exogenous",
                    "Internal",
                    "Response Metrics",
                ],
                required=True,
                default="Media",
                width=200,
            ),
        },
        num_rows="dynamic",
    )

    # Process the DataFrame based on the user inputs and operations specified in edited_df
    final_df, edited_stats_df = process_dataframes(merged_df, edited_df, edited_stats_df)
    #########################################################################################################################################################
    # Display the final DataFrame and variables
    #########################################################################################################################################################

    # Display the final DataFrame
    st.markdown("#### Final DataFrame")
    st.dataframe(final_df, hide_index=True)

    # Initialize an empty dictionary to hold the categories and their variables
    category_dict = {}

    # Iterate over each row in the edited DataFrame to populate the dictionary
    for i, row in edited_stats_df.iterrows():
        column = row["Column"]
        category = row["Category"]  # The category chosen by the user for this variable

        # Check if the category already exists in the dictionary
        if category not in category_dict:
            # If not, initialize it with the current column as its first element
            category_dict[category] = [column]
        else:
            # If it exists, append the current column to the list of variables under this category
            category_dict[category].append(column)

    # Add Date, Panel_1 and Panel_2 to the category dictionary
    category_dict.update({"Date": ["date"]})
    if "Panel_1" in final_df.columns:
        category_dict["Panel Level 1"] = ["Panel_1"]
    if "Panel_2" in final_df.columns:
        category_dict["Panel Level 2"] = ["Panel_2"]

    # Display the dictionary
    st.markdown("#### Variable Category")
    for category, variables in category_dict.items():
        # Check if there are multiple variables, to insert "and" correctly
        if len(variables) > 1:
            # Join all but the last variable with ", ", then add " and " before the last one
            variables_str = ", ".join(variables[:-1]) + " and " + variables[-1]
        else:
            # If there is only one variable, no "and" is needed
            variables_str = variables[0]

        # Display the category and its variables in the desired format
        st.markdown(
            f"<div style='text-align: justify;'><strong>{category}:</strong> {variables_str}</div>",
            unsafe_allow_html=True,
        )

    # Check that a Response Metrics column has been selected
    st.write("")
    response_metrics_col = category_dict.get("Response Metrics", [])
    if len(response_metrics_col) == 0:
        st.warning("Please select a Response Metrics column", icon="⚠️")
        st.stop()
    # elif len(response_metrics_col) > 1:
    #     st.warning("Please select only one Response Metrics column", icon="⚠️")
    #     st.stop()

    # Store the final DataFrame and the bin dictionary in the session state
    st.session_state["final_df"], st.session_state["bin_dict"] = final_df, category_dict

    # Save the DataFrame and dictionary from the session state to the pickle file
    if st.button("Accept and Save", use_container_width=True):
        save_to_pickle(
            "data_import.pkl", st.session_state["final_df"], st.session_state["bin_dict"]
        )
        st.toast("💾 Saved Successfully!")