BlendMMM committed on
Commit 8f35613
1 Parent(s): 2c56ebc

Upload 78 files

Files changed (46)
  1. .gitattributes +4 -5
  2. Data_Import (1).py +995 -0
  3. Data_Import .py +1019 -0
  4. Data_prep_functions.py +72 -59
  5. Model/model_0.pkl +3 -0
  6. Model/model_1.pkl +3 -0
  7. Model/model_2.pkl +3 -0
  8. Model/model_3.pkl +3 -0
  9. Model/model_4.pkl +3 -0
  10. Overview_data_test_panel@#app_installs.xlsx +0 -0
  11. Overview_data_test_panel@#revenue.xlsx +0 -0
  12. Overview_data_test_panelreplace_meapp_installs.xlsx +0 -0
  13. README.md +1 -1
  14. Test/merged_df_contri.csv +0 -0
  15. Test/output_df.csv +16 -0
  16. Test/scenario_test_df.csv +16 -0
  17. Test/x_test_contribution.csv +0 -0
  18. Test/x_test_to_save.csv +0 -0
  19. Test/x_train_contribution.csv +0 -0
  20. Test/x_train_to_save.csv +0 -0
  21. best_models.pkl +2 -2
  22. classes.py +130 -106
  23. data_import.pkl +3 -0
  24. data_test_overview_panel_#total_approved_accounts_revenue.xlsx +3 -0
  25. final_df_transformed.pkl +3 -0
  26. metrics_level_data/Overview_data_test_panel@#app_installs.xlsx +0 -0
  27. metrics_level_data/Overview_data_test_panel@#revenue.xlsx +0 -0
  28. model_output.csv +6 -11
  29. pages/10_Optimized_Result_Analysis.py +23 -77
  30. pages/1_Data_Validation.py +158 -188
  31. pages/2_Transformations.py +522 -0
  32. pages/4_Model_Build.py +826 -0
  33. pages/4_Saved_Model_Results.py +461 -267
  34. pages/5_Model_Tuning_with_panel.py +527 -0
  35. pages/6_Model_Result_Overview.py +348 -0
  36. pages/7_Build_Response_Curves.py +185 -0
  37. pages/8_Scenario_Planner.py +458 -167
  38. requirements.txt +94 -102
  39. summary_df.pkl +1 -1
  40. tuned_model.pkl +3 -0
  41. upf_data_converted_old.csv +0 -0
  42. upf_data_converted_old.xlsx +3 -0
  43. upf_data_converted_randomized_resp_metrics.csv +0 -0
  44. upf_data_converted_randomized_resp_metrics.xlsx +3 -0
  45. utilities.py +534 -263
  46. utilities_with_panel.py +1018 -0
.gitattributes CHANGED
@@ -33,9 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
- E0DAF720 filter=lfs diff=lfs merge=lfs -text
- Profile_Report.html filter=lfs diff=lfs merge=lfs -text
- raw_data_nov7_combined.xlsx filter=lfs diff=lfs merge=lfs -text
- raw_data_nov7_combined1.xlsx filter=lfs diff=lfs merge=lfs -text
- upf_data_converted.xlsx filter=lfs diff=lfs merge=lfs -text
+ data_test_overview_panel_\#total_approved_accounts_revenue.xlsx filter=lfs diff=lfs merge=lfs -text
  Pickle_files/main_df filter=lfs diff=lfs merge=lfs -text
+ upf_data_converted_old.xlsx filter=lfs diff=lfs merge=lfs -text
+ upf_data_converted_randomized_resp_metrics.xlsx filter=lfs diff=lfs merge=lfs -text
+ upf_data_converted.xlsx filter=lfs diff=lfs merge=lfs -text
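The added lines follow the standard Git LFS attribute pattern: each tracked path or glob gets a line of the form "PATH filter=lfs diff=lfs merge=lfs -text". For illustration only, a hypothetical new workbook would be tracked by appending a line such as "another_workbook.xlsx filter=lfs diff=lfs merge=lfs -text" to .gitattributes; no such line is part of this commit.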
Data_Import (1).py ADDED
@@ -0,0 +1,995 @@
1
+ # Importing necessary libraries
2
+ import streamlit as st
3
+
4
+ st.set_page_config(
5
+ page_title="Data Import",
6
+ page_icon=":shark:",
7
+ layout="wide",
8
+ initial_sidebar_state="collapsed",
9
+ )
10
+
11
+ import pickle
12
+ import pandas as pd
13
+ from utilities import set_header, load_local_css, authentication
14
+
15
+ load_local_css("styles.css")
16
+ set_header()
17
+
18
+
19
+ # Check for authentication status
20
+ authenticator, name, authentication_status, username = authentication()
21
+ if authentication_status != True:
22
+ st.stop()
23
+ else:
24
+ authenticator.logout("Logout", "main")
25
+
26
+
27
+ # Function to validate date column in dataframe
28
+ def validate_date_column(df):
29
+ try:
30
+ # Attempt to convert the 'Date' column to datetime
31
+ df["date"] = pd.to_datetime(df["date"], format="%d-%m-%Y")
32
+ return True
33
+ except:
34
+ return False
35
+
36
+
37
+ # Function to determine data interval
38
+ def determine_data_interval(common_freq):
39
+ if common_freq == 1:
40
+ return "daily"
41
+ elif common_freq == 7:
42
+ return "weekly"
43
+ elif 28 <= common_freq <= 31:
44
+ return "monthly"
45
+ else:
46
+ return "irregular"
47
+
48
+
49
+ # Function to read each uploaded Excel file into a pandas DataFrame and store them in a dictionary
+ @st.cache_resource(show_spinner=False)
+ def files_to_dataframes(uploaded_files):
54
+ df_dict = {}
55
+ for uploaded_file in uploaded_files:
56
+ # Extract file name without extension
57
+ file_name = uploaded_file.name.rsplit(".", 1)[0]
58
+
59
+ # Check for duplicate file names
60
+ if file_name in df_dict:
61
+ st.warning(
62
+ f"Duplicate File: {file_name}. This file will be skipped.",
63
+ icon="⚠️",
64
+ )
65
+ continue
66
+
67
+ # Read the file into a DataFrame
68
+ df = pd.read_excel(uploaded_file)
69
+
70
+ # Convert all column names to lowercase
71
+ df.columns = df.columns.str.lower().str.strip()
72
+
73
+ # Separate numeric and non-numeric columns
74
+ numeric_cols = list(df.select_dtypes(include=["number"]).columns)
75
+ non_numeric_cols = [
76
+ col
77
+ for col in df.select_dtypes(exclude=["number"]).columns
78
+ if col.lower() != "date"
79
+ ]
80
+
81
+ # Check for 'Date' column
82
+ if not (validate_date_column(df) and len(numeric_cols) > 0):
83
+ st.warning(
84
+ f"File Name: {file_name} ➜ Please upload data with Date column in 'DD-MM-YYYY' format and at least one media/exogenous column. This file will be skipped.",
85
+ icon="⚠️",
86
+ )
87
+ continue
88
+
89
+ # Check for interval
90
+ common_freq = (
91
+ pd.Series(df["date"].unique()).diff().dt.days.dropna().mode()[0]
92
+ )
93
+ # Calculate the data interval (daily, weekly, monthly or irregular)
94
+ interval = determine_data_interval(common_freq)
95
+ if interval == "irregular":
96
+ st.warning(
97
+ f"File Name: {file_name} ➜ Please upload data in daily, weekly or monthly interval. This file will be skipped.",
98
+ icon="⚠️",
99
+ )
100
+ continue
101
+
102
+ # Store both DataFrames in the dictionary under their respective keys
103
+ df_dict[file_name] = {
104
+ "numeric": numeric_cols,
105
+ "non_numeric": non_numeric_cols,
106
+ "interval": interval,
107
+ "df": df,
108
+ }
109
+
110
+ return df_dict
111
+
112
+
113
+ # Function to adjust dataframe granularity
114
+ def adjust_dataframe_granularity(df, current_granularity, target_granularity):
115
+ # Set index
116
+ df.set_index("date", inplace=True)
117
+
118
+ # Define aggregation rules for resampling
119
+ aggregation_rules = {
120
+ col: "sum" if pd.api.types.is_numeric_dtype(df[col]) else "first"
121
+ for col in df.columns
122
+ }
123
+
124
+ # Initialize resampled_df
125
+ resampled_df = df
126
+ if current_granularity == "daily" and target_granularity == "weekly":
127
+ resampled_df = df.resample("W-MON", closed="left", label="left").agg(
128
+ aggregation_rules
129
+ )
130
+
131
+ elif current_granularity == "daily" and target_granularity == "monthly":
132
+ resampled_df = df.resample("MS", closed="left", label="left").agg(
133
+ aggregation_rules
134
+ )
135
+
136
+ elif current_granularity == "daily" and target_granularity == "daily":
137
+ resampled_df = df.resample("D").agg(aggregation_rules)
138
+
139
+ elif current_granularity in ["weekly", "monthly"] and target_granularity == "daily":
140
+ # For higher to lower granularity, distribute numeric and replicate non-numeric values equally across the new period
141
+ expanded_data = []
142
+ for _, row in df.iterrows():
143
+ if current_granularity == "weekly":
144
+ period_range = pd.date_range(start=row.name, periods=7)
145
+ elif current_granularity == "monthly":
146
+ period_range = pd.date_range(
147
+ start=row.name, periods=row.name.days_in_month
148
+ )
149
+
150
+ for date in period_range:
151
+ new_row = {}
152
+ for col in df.columns:
153
+ if pd.api.types.is_numeric_dtype(df[col]):
154
+ if current_granularity == "weekly":
155
+ new_row[col] = row[col] / 7
156
+ elif current_granularity == "monthly":
157
+ new_row[col] = row[col] / row.name.days_in_month
158
+ else:
159
+ new_row[col] = row[col]
160
+ expanded_data.append((date, new_row))
161
+
162
+ resampled_df = pd.DataFrame(
163
+ [data for _, data in expanded_data],
164
+ index=[date for date, _ in expanded_data],
165
+ )
166
+
167
+ # Reset index
168
+ resampled_df = resampled_df.reset_index().rename(columns={"index": "date"})
169
+
170
+ return resampled_df
171
+
172
+
173
+ # Function to clean and extract unique values of Panel_1 and Panel_2
174
+ @st.cache_resource(show_spinner=False)
+ def clean_and_extract_unique_values(files_dict, selections):
178
+ all_panel1_values = set()
179
+ all_panel2_values = set()
180
+
181
+ for file_name, file_data in files_dict.items():
182
+ df = file_data["df"]
183
+
184
+ # 'Panel_1' and 'Panel_2' selections
185
+ selected_panel1 = selections[file_name].get("Panel_1")
186
+ selected_panel2 = selections[file_name].get("Panel_2")
187
+
188
+ # Clean and standardize Panel_1 column if it exists and is selected
189
+ if (
190
+ selected_panel1
191
+ and selected_panel1 != "N/A"
192
+ and selected_panel1 in df.columns
193
+ ):
194
+ df[selected_panel1] = (
195
+ df[selected_panel1].str.lower().str.strip().str.replace("_", " ")
196
+ )
197
+ all_panel1_values.update(df[selected_panel1].dropna().unique())
198
+
199
+ # Clean and standardize Panel_2 column if it exists and is selected
200
+ if (
201
+ selected_panel2
202
+ and selected_panel2 != "N/A"
203
+ and selected_panel2 in df.columns
204
+ ):
205
+ df[selected_panel2] = (
206
+ df[selected_panel2].str.lower().str.strip().str.replace("_", " ")
207
+ )
208
+ all_panel2_values.update(df[selected_panel2].dropna().unique())
209
+
210
+ # Update the processed DataFrame back in the dictionary
211
+ files_dict[file_name]["df"] = df
212
+
213
+ return all_panel1_values, all_panel2_values
214
+
215
+
216
+ # Function to format values for display
217
+ @st.cache_resource(show_spinner=False)
+ def format_values_for_display(values_list):
221
+ # Capitalize the first letter of each word and replace underscores with spaces
222
+ formatted_list = [value.replace("_", " ").title() for value in values_list]
223
+ # Join values with commas and 'and' before the last value
224
+ if len(formatted_list) > 1:
225
+ return ", ".join(formatted_list[:-1]) + ", and " + formatted_list[-1]
226
+ elif formatted_list:
227
+ return formatted_list[0]
228
+ return "No values available"
229
+
230
+
231
+ # Function to normalize all data within files_dict to a daily granularity
+ @st.cache(show_spinner=False, allow_output_mutation=True)
+ def standardize_data_to_daily(files_dict, selections):
236
+ # Normalize all data to a daily granularity using a provided function
237
+ files_dict = apply_granularity_to_all(files_dict, "daily", selections)
238
+
239
+ # Update the "interval" attribute for each dataset to indicate the new granularity
240
+ for files_name, files_data in files_dict.items():
241
+ files_data["interval"] = "daily"
242
+
243
+ return files_dict
244
+
245
+
246
+ # Function to apply granularity transformation to all DataFrames in files_dict
247
+ @st.cache_resource(show_spinner=False)
+ def apply_granularity_to_all(files_dict, granularity_selection, selections):
251
+ for file_name, file_data in files_dict.items():
252
+ df = file_data["df"].copy()
253
+
254
+ # Handling when Panel_1 or Panel_2 might be 'N/A'
255
+ selected_panel1 = selections[file_name].get("Panel_1")
256
+ selected_panel2 = selections[file_name].get("Panel_2")
257
+
258
+ # Correcting the segment selection logic & handling 'N/A'
259
+ if selected_panel1 != "N/A" and selected_panel2 != "N/A":
260
+ unique_combinations = df[
261
+ [selected_panel1, selected_panel2]
262
+ ].drop_duplicates()
263
+ elif selected_panel1 != "N/A":
264
+ unique_combinations = df[[selected_panel1]].drop_duplicates()
265
+ selected_panel2 = None # Ensure Panel_2 is ignored if N/A
266
+ elif selected_panel2 != "N/A":
267
+ unique_combinations = df[[selected_panel2]].drop_duplicates()
268
+ selected_panel1 = None # Ensure Panel_1 is ignored if N/A
269
+ else:
270
+ # If both are 'N/A', process the entire dataframe as is
271
+ df = adjust_dataframe_granularity(
272
+ df, file_data["interval"], granularity_selection
273
+ )
274
+ files_dict[file_name]["df"] = df
275
+ continue # Skip to the next file
276
+
277
+ transformed_segments = []
278
+ for _, combo in unique_combinations.iterrows():
279
+ if selected_panel1 and selected_panel2:
280
+ segment = df[
281
+ (df[selected_panel1] == combo[selected_panel1])
282
+ & (df[selected_panel2] == combo[selected_panel2])
283
+ ]
284
+ elif selected_panel1:
285
+ segment = df[df[selected_panel1] == combo[selected_panel1]]
286
+ elif selected_panel2:
287
+ segment = df[df[selected_panel2] == combo[selected_panel2]]
288
+
289
+ # Adjust granularity of the segment
290
+ transformed_segment = adjust_dataframe_granularity(
291
+ segment, file_data["interval"], granularity_selection
292
+ )
293
+ transformed_segments.append(transformed_segment)
294
+
295
+ # Combine all transformed segments into a single DataFrame for this file
296
+ transformed_df = pd.concat(transformed_segments, ignore_index=True)
297
+ files_dict[file_name]["df"] = transformed_df
298
+
299
+ return files_dict
300
+
301
+
302
+ # Function to create main dataframe structure
303
+ @st.cache_resource(show_spinner=False)
+ def create_main_dataframe(
307
+ files_dict, all_panel1_values, all_panel2_values, granularity_selection
308
+ ):
309
+ # Determine the global start and end dates across all DataFrames
310
+ global_start = min(df["df"]["date"].min() for df in files_dict.values())
311
+ global_end = max(df["df"]["date"].max() for df in files_dict.values())
312
+
313
+ # Adjust the date_range generation based on the granularity_selection
314
+ if granularity_selection == "weekly":
315
+ # Generate a weekly range, with weeks starting on Monday
316
+ date_range = pd.date_range(start=global_start, end=global_end, freq="W-MON")
317
+ elif granularity_selection == "monthly":
318
+ # Generate a monthly range, starting from the first day of each month
319
+ date_range = pd.date_range(start=global_start, end=global_end, freq="MS")
320
+ else: # Default to daily if not weekly or monthly
321
+ date_range = pd.date_range(start=global_start, end=global_end, freq="D")
322
+
323
+ # Collect all unique Panel_1 and Panel_2 values, excluding 'N/A'
324
+ all_panel1s = all_panel1_values
325
+ all_panel2s = all_panel2_values
326
+
327
+ # Dynamically build the list of dimensions (Panel_1, Panel_2) to include in the main DataFrame based on availability
328
+ dimensions, merge_keys = [], []
329
+ if all_panel1s:
330
+ dimensions.append(all_panel1s)
331
+ merge_keys.append("Panel_1")
332
+ if all_panel2s:
333
+ dimensions.append(all_panel2s)
334
+ merge_keys.append("Panel_2")
335
+
336
+ dimensions.append(date_range) # Date range is always included
337
+ merge_keys.append("date") # Date range is always included
338
+
339
+ # Create a main DataFrame template with the dimensions
340
+ main_df = pd.MultiIndex.from_product(
341
+ dimensions,
342
+ names=[name for name, _ in zip(merge_keys, dimensions)],
343
+ ).to_frame(index=False)
344
+
345
+ return main_df.reset_index(drop=True)
346
+
347
+
348
+ # Function to prepare and merge DataFrames
+ @st.cache_resource(show_spinner=False)
+ def merge_into_main_df(main_df, files_dict, selections):
353
+ for file_name, file_data in files_dict.items():
354
+ df = file_data["df"].copy()
355
+
356
+ # Rename selected Panel_1 and Panel_2 columns if not 'N/A'
357
+ selected_panel1 = selections[file_name].get("Panel_1", "N/A")
358
+ selected_panel2 = selections[file_name].get("Panel_2", "N/A")
359
+ if selected_panel1 != "N/A":
360
+ df.rename(columns={selected_panel1: "Panel_1"}, inplace=True)
361
+ if selected_panel2 != "N/A":
362
+ df.rename(columns={selected_panel2: "Panel_2"}, inplace=True)
363
+
364
+ # Merge current DataFrame into main_df based on 'date', and where applicable, 'Panel_1' and 'Panel_2'
365
+ merge_keys = ["date"]
366
+ if "Panel_1" in df.columns:
367
+ merge_keys.append("Panel_1")
368
+ if "Panel_2" in df.columns:
369
+ merge_keys.append("Panel_2")
370
+ main_df = pd.merge(main_df, df, on=merge_keys, how="left")
371
+
372
+ # After all merges, sort by 'date' and reset index for cleanliness
373
+ sort_by = ["date"]
374
+ if "Panel_1" in main_df.columns:
375
+ sort_by.append("Panel_1")
376
+ if "Panel_2" in main_df.columns:
377
+ sort_by.append("Panel_2")
378
+ main_df.sort_values(by=sort_by, inplace=True)
379
+ main_df.reset_index(drop=True, inplace=True)
380
+
381
+ return main_df
382
+
383
+
384
+ # Function to categorize column
385
+ def categorize_column(column_name):
386
+ # Define keywords for each category
387
+ internal_keywords = [
388
+ "Price",
389
+ "Discount",
390
+ "product_price",
391
+ "cost",
392
+ "margin",
393
+ "inventory",
394
+ "sales",
395
+ "revenue",
396
+ "turnover",
397
+ "expense",
398
+ ]
399
+ exogenous_keywords = [
400
+ "GDP",
401
+ "Tax",
402
+ "Inflation",
403
+ "interest_rate",
404
+ "employment_rate",
405
+ "exchange_rate",
406
+ "consumer_spending",
407
+ "retail_sales",
408
+ "oil_prices",
409
+ "weather",
410
+ ]
411
+
412
+ # Check if the column name matches any of the keywords for Internal or Exogenous categories
413
+ for keyword in internal_keywords:
414
+ if keyword.lower() in column_name.lower():
415
+ return "Internal"
416
+ for keyword in exogenous_keywords:
417
+ if keyword.lower() in column_name.lower():
418
+ return "Exogenous"
419
+
420
+ # Default to Media if no match found
421
+ return "Media"
422
+
423
+
424
+ # Function to calculate missing stats and prepare for editable DataFrame
425
+ @st.cache_resource(show_spinner=False)
+ def prepare_missing_stats_df(df):
429
+ missing_stats = []
430
+ for column in df.columns:
431
+ if (
432
+ column == "date" or column == "Panel_2" or column == "Panel_1"
433
+ ): # Skip Date, Panel_1 and Panel_2 column
434
+ continue
435
+
436
+ missing = df[column].isnull().sum()
437
+ pct_missing = round((missing / len(df)) * 100, 2)
438
+
439
+ # Dynamically assign category based on column name
440
+ category = categorize_column(column)
441
+ # category = "Media" # Keep default bin as Media
442
+
443
+ missing_stats.append(
444
+ {
445
+ "Column": column,
446
+ "Missing Values": missing,
447
+ "Missing Percentage": pct_missing,
448
+ "Impute Method": "Fill with 0", # Default value
449
+ "Category": category,
450
+ }
451
+ )
452
+ stats_df = pd.DataFrame(missing_stats)
453
+
454
+ return stats_df
455
+
456
+
457
+ # Function to add API DataFrame details to the files dictionary
458
+ @st.cache_resource(show_spinner=False)
+ def add_api_dataframe_to_dict(main_df, files_dict):
462
+ files_dict["API"] = {
463
+ "numeric": list(main_df.select_dtypes(include=["number"]).columns),
464
+ "non_numeric": [
465
+ col
466
+ for col in main_df.select_dtypes(exclude=["number"]).columns
467
+ if col.lower() != "date"
468
+ ],
469
+ "interval": determine_data_interval(
470
+ pd.Series(main_df["date"].unique()).diff().dt.days.dropna().mode()[0]
471
+ ),
472
+ "df": main_df,
473
+ }
474
+
475
+ return files_dict
476
+
477
+
478
+ # Function to read API data into a DataFrame, parsing specified columns as datetime
479
+ @st.cache_resource(show_spinner=False)
480
+ def read_API_data():
481
+ return pd.read_excel(r".\upf_data_converted.xlsx", parse_dates=["Date"])
482
+
483
+
484
+ # Function to set the 'Panel_1_Panel_2_Selected' session state variable to False
485
+ def set_Panel_1_Panel_2_Selected_false():
486
+ st.session_state["Panel_1_Panel_2_Selected"] = False
487
+
488
+
489
+ # Function to serialize and save the objects into a pickle file
490
+ @st.cache_resource(show_spinner=False)
491
+ def save_to_pickle(file_path, final_df, bin_dict):
492
+ # Open the file in write-binary mode and dump the objects
493
+ with open(file_path, "wb") as f:
494
+ pickle.dump({"final_df": final_df, "bin_dict": bin_dict}, f)
495
+ # Data is now saved to file
496
+
497
+
498
+ # Function to process the merged_df DataFrame based on operations defined in edited_df
499
+ @st.cache_resource(show_spinner=False)
500
+ def process_dataframes(merged_df, edited_df, edited_stats_df):
501
+ # Ensure there are operations defined by the user
502
+ if edited_df.empty:
503
+ return merged_df, edited_stats_df # No operations to apply
504
+
505
+ # Perform operations as defined by the user
506
+ for index, row in edited_df.iterrows():
507
+ result_column_name = f"{row['Column 1']}{row['Operator']}{row['Column 2']}"
508
+ col1 = row["Column 1"]
509
+ col2 = row["Column 2"]
510
+ op = row["Operator"]
511
+
512
+ # Apply the specified operation
513
+ if op == "+":
514
+ merged_df[result_column_name] = merged_df[col1] + merged_df[col2]
515
+ elif op == "-":
516
+ merged_df[result_column_name] = merged_df[col1] - merged_df[col2]
517
+ elif op == "*":
518
+ merged_df[result_column_name] = merged_df[col1] * merged_df[col2]
519
+ elif op == "/":
520
+ merged_df[result_column_name] = merged_df[col1] / merged_df[col2].replace(
521
+ 0, 1e-9
522
+ )
523
+
524
+ # Add summary of operation to edited_stats_df
525
+ new_row = {
526
+ "Column": result_column_name,
527
+ "Missing Values": None,
528
+ "Missing Percentage": None,
529
+ "Impute Method": None,
530
+ "Category": row["Category"],
531
+ }
532
+ new_row_df = pd.DataFrame([new_row])
533
+
534
+ # Use pd.concat to add the new_row_df to edited_stats_df
535
+ edited_stats_df = pd.concat(
536
+ [edited_stats_df, new_row_df], ignore_index=True, axis=0
537
+ )
538
+
539
+ # Combine column names from edited_df for cleanup
540
+ combined_columns = set(edited_df["Column 1"]).union(set(edited_df["Column 2"]))
541
+
542
+ # Filter out rows in edited_stats_df and drop columns from merged_df
543
+ edited_stats_df = edited_stats_df[~edited_stats_df["Column"].isin(combined_columns)]
544
+ merged_df.drop(columns=list(combined_columns), errors="ignore", inplace=True)
545
+
546
+ return merged_df, edited_stats_df
547
+
548
+
549
+ # Function to prepare a list of numeric column names and initialize an empty DataFrame with predefined structure
550
+ @st.cache_resource(show_spinner=False)
+ def prepare_numeric_columns_and_default_df(merged_df, edited_stats_df):
554
+ # Get columns categorized as 'Response Metrics'
555
+ columns_response_metrics = edited_stats_df[
556
+ edited_stats_df["Category"] == "Response Metrics"
557
+ ]["Column"].tolist()
558
+
559
+ # Filter numeric columns, excluding those categorized as 'Response Metrics'
560
+ numeric_columns = [
561
+ col
562
+ for col in merged_df.select_dtypes(include=["number"]).columns
563
+ if col not in columns_response_metrics
564
+ ]
565
+
566
+ # Define the structure of the empty DataFrame
567
+ data = {
568
+ "Column 1": pd.Series([], dtype="str"),
569
+ "Operator": pd.Series([], dtype="str"),
570
+ "Column 2": pd.Series([], dtype="str"),
571
+ "Category": pd.Series([], dtype="str"),
572
+ }
573
+ default_df = pd.DataFrame(data)
574
+
575
+ return numeric_columns, default_df
576
+
577
+
578
+ # Initialize 'final_df' in session state
579
+ if "final_df" not in st.session_state:
580
+ st.session_state["final_df"] = pd.DataFrame()
581
+
582
+ # Initialize 'bin_dict' in session state
583
+ if "bin_dict" not in st.session_state:
584
+ st.session_state["bin_dict"] = {}
585
+
586
+ # Initialize 'Panel_1_Panel_2_Selected' in session state
587
+ if "Panel_1_Panel_2_Selected" not in st.session_state:
588
+ st.session_state["Panel_1_Panel_2_Selected"] = False
589
+
590
+
591
+ # Page Title
592
+ st.write("") # Top padding
593
+ st.title("Data Import")
594
+
595
+
596
+ #########################################################################################################################################################
597
+ # Create a dictionary to hold all DataFrames and collect user input to specify "Panel_2" and "Panel_1" columns for each file
598
+ #########################################################################################################################################################
599
+
600
+
601
+ # Read the Excel file, parsing 'Date' column as datetime
602
+ main_df = read_API_data()
603
+
604
+ # Convert all column names to lowercase
605
+ main_df.columns = main_df.columns.str.lower().str.strip()
606
+
607
+ # File uploader
608
+ uploaded_files = st.file_uploader(
609
+ "Upload additional data",
610
+ type=["xlsx"],
611
+ accept_multiple_files=True,
612
+ on_change=set_Panel_1_Panel_2_Selected_false,
613
+ )
614
+
615
+ # Custom HTML for upload instructions
616
+ recommendation_html = f"""
617
+ <div style="text-align: justify;">
618
+ <strong>Recommendation:</strong> For optimal processing, please ensure that all uploaded datasets (panel, media, internal, and exogenous data) adhere to the following guidelines: each dataset must include a <code>Date</code> column formatted as <code>DD-MM-YYYY</code> and be free of missing values.
619
+ </div>
620
+ """
621
+ st.markdown(recommendation_html, unsafe_allow_html=True)
622
+
623
+ # Choose Desired Granularity
624
+ st.markdown("#### Choose Desired Granularity")
625
+ # Granularity Selection
626
+ granularity_selection = st.selectbox(
627
+ "Choose Date Granularity",
628
+ ["Daily", "Weekly", "Monthly"],
629
+ label_visibility="collapsed",
630
+ on_change=set_Panel_1_Panel_2_Selected_false,
631
+ )
632
+ granularity_selection = str(granularity_selection).lower()
633
+
634
+ # Convert files to dataframes
635
+ files_dict = files_to_dataframes(uploaded_files)
636
+
637
+ # Add API Dataframe
638
+ if main_df is not None:
639
+ files_dict = add_api_dataframe_to_dict(main_df, files_dict)
640
+
641
+ # Display a warning message if no files have been uploaded and halt further execution
642
+ if not files_dict:
643
+ st.warning(
644
+ "Please upload at least one file to proceed.",
645
+ icon="⚠️",
646
+ )
647
+ st.stop() # Halts further execution until file is uploaded
648
+
649
+
650
+ # Select Panel_1 and Panel_2 columns
651
+ st.markdown("#### Select Panel columns")
652
+ selections = {}
653
+ with st.expander("Select Panel columns", expanded=False):
654
+ count = 0 # Initialize counter to manage the visibility of labels and keys
655
+ for file_name, file_data in files_dict.items():
656
+ # Determine visibility of the label based on the count
657
+ if count == 0:
658
+ label_visibility = "visible"
659
+ else:
660
+ label_visibility = "collapsed"
661
+
662
+ # Extract non-numeric columns
663
+ non_numeric_cols = file_data["non_numeric"]
664
+
665
+ # Prepare Panel_1 and Panel_2 values for dropdown, adding "N/A" as an option
666
+ panel1_values = non_numeric_cols + ["N/A"]
667
+ panel2_values = non_numeric_cols + ["N/A"]
668
+
669
+ # Skip if only one option is available
670
+ if len(panel1_values) == 1 and len(panel2_values) == 1:
671
+ selected_panel1, selected_panel2 = "N/A", "N/A"
672
+ # Update the selections for Panel_1 and Panel_2 for the current file
673
+ selections[file_name] = {
674
+ "Panel_1": selected_panel1,
675
+ "Panel_2": selected_panel2,
676
+ }
677
+ continue
678
+
679
+ # Create layout columns for File Name, Panel_2, and Panel_1 selections
680
+ file_name_col, Panel_1_col, Panel_2_col = st.columns([2, 4, 4])
681
+
682
+ with file_name_col:
683
+ # Display "File Name" label only for the first file
684
+ if count == 0:
685
+ st.write("File Name")
686
+ else:
687
+ st.write("")
688
+ st.write(file_name) # Display the file name
689
+
690
+ with Panel_1_col:
691
+ # Display a selectbox for Panel_1 values
692
+ selected_panel1 = st.selectbox(
693
+ "Select Panel Level 1",
694
+ panel1_values,
695
+ on_change=set_Panel_1_Panel_2_Selected_false,
696
+ label_visibility=label_visibility, # Control visibility of the label
697
+ key=f"Panel_1_selectbox{count}", # Ensure unique key for each selectbox
698
+ )
699
+
700
+ with Panel_2_col:
701
+ # Display a selectbox for Panel_2 values
702
+ selected_panel2 = st.selectbox(
703
+ "Select Panel Level 2",
704
+ panel2_values,
705
+ on_change=set_Panel_1_Panel_2_Selected_false,
706
+ label_visibility=label_visibility, # Control visibility of the label
707
+ key=f"Panel_2_selectbox{count}", # Ensure unique key for each selectbox
708
+ )
709
+
710
+ # Skip processing if the same column is selected for both Panel_1 and Panel_2 due to potential data integrity issues
711
+ if selected_panel2 == selected_panel1 and not (
712
+ selected_panel2 == "N/A" and selected_panel1 == "N/A"
713
+ ):
714
+ st.warning(
715
+ f"File: {file_name} → The same column cannot serve as both Panel_1 and Panel_2. Please adjust your selections.",
716
+ )
717
+ selected_panel1, selected_panel2 = "N/A", "N/A"
718
+ st.stop()
719
+
720
+ # Update the selections for Panel_1 and Panel_2 for the current file
721
+ selections[file_name] = {
722
+ "Panel_1": selected_panel1,
723
+ "Panel_2": selected_panel2,
724
+ }
725
+
726
+ count += 1 # Increment the counter after processing each file
727
+
728
+ # Accept Panel_1 and Panel_2 selection
729
+ if st.button("Accept and Process", use_container_width=True):
730
+
731
+ # Normalize all data to a daily granularity. This initial standardization simplifies subsequent conversions to other levels of granularity
732
+ with st.spinner("Processing..."):
733
+ files_dict = standardize_data_to_daily(files_dict, selections)
734
+
735
+ # Convert all data to daily level granularity
736
+ files_dict = apply_granularity_to_all(
737
+ files_dict, granularity_selection, selections
738
+ )
739
+
740
+ # Update the 'files_dict' in the session state
741
+ st.session_state["files_dict"] = files_dict
742
+
743
+ # Set a flag in the session state to indicate that selection has been made
744
+ st.session_state["Panel_1_Panel_2_Selected"] = True
745
+
746
+
747
+ #########################################################################################################################################################
748
+ # Display unique Panel_1 and Panel_2 values
749
+ #########################################################################################################################################################
750
+
751
+
752
+ # Halts further execution until Panel_1 and Panel_2 columns are selected
753
+ if "files_dict" in st.session_state and st.session_state["Panel_1_Panel_2_Selected"]:
754
+ files_dict = st.session_state["files_dict"]
755
+ else:
756
+ st.stop()
757
+
758
+ # Set to store unique values of Panel_1 and Panel_2
759
+ with st.spinner("Fetching Panel values..."):
760
+ all_panel1_values, all_panel2_values = clean_and_extract_unique_values(
761
+ files_dict, selections
762
+ )
763
+
764
+ # List of Panel_1 and Panel_2 columns unique values
765
+ list_of_all_panel1_values = list(all_panel1_values)
766
+ list_of_all_panel2_values = list(all_panel2_values)
767
+
768
+ # Format Panel_1 and Panel_2 values for display
769
+ formatted_panel1_values = format_values_for_display(list_of_all_panel1_values)
770
+ formatted_panel2_values = format_values_for_display(list_of_all_panel2_values)
771
+
772
+ # Unique Panel_1 and Panel_2 values
773
+ st.markdown("#### Unique Panel values")
774
+ # Display Panel_1 and Panel_2 values
775
+ with st.expander("Unique Panel values"):
776
+ st.write("")
777
+ st.markdown(
778
+ f"""
779
+ <style>
780
+ .justify-text {{
781
+ text-align: justify;
782
+ }}
783
+ </style>
784
+ <div class="justify-text">
785
+ <strong>Panel Level 1 Values:</strong> {formatted_panel1_values}<br>
786
+ <strong>Panel Level 2 Values:</strong> {formatted_panel2_values}
787
+ </div>
788
+ """,
789
+ unsafe_allow_html=True,
790
+ )
791
+
792
+ # Display total Panel_1 and Panel_2
793
+ st.write("")
794
+ st.markdown(
795
+ f"""
796
+ <div style="text-align: justify;">
797
+ <strong>Number of Level 1 Panels detected:</strong> {len(list_of_all_panel1_values)}<br>
798
+ <strong>Number of Level 2 Panels detected:</strong> {len(list_of_all_panel2_values)}
799
+ </div>
800
+ """,
801
+ unsafe_allow_html=True,
802
+ )
803
+ st.write("")
804
+
805
+
806
+ #########################################################################################################################################################
807
+ # Merge all DataFrames
808
+ #########################################################################################################################################################
809
+
810
+
811
+ # Merge all DataFrames selected
812
+ main_df = create_main_dataframe(
813
+ files_dict, all_panel1_values, all_panel2_values, granularity_selection
814
+ )
815
+ merged_df = merge_into_main_df(main_df, files_dict, selections)
816
+
817
+
818
+ #########################################################################################################################################################
819
+ # Categorize Variables and Impute Missing Values
820
+ #########################################################################################################################################################
821
+
822
+
823
+ # Create an editable DataFrame in Streamlit
824
+ st.markdown("#### Select Variables Category & Impute Missing Values")
825
+
826
+ # Prepare missing stats DataFrame for editing
827
+ missing_stats_df = prepare_missing_stats_df(merged_df)
828
+
829
+ edited_stats_df = st.data_editor(
830
+ missing_stats_df,
831
+ column_config={
832
+ "Impute Method": st.column_config.SelectboxColumn(
833
+ options=[
834
+ "Drop Column",
835
+ "Fill with Mean",
836
+ "Fill with Median",
837
+ "Fill with 0",
838
+ ],
839
+ required=True,
840
+ default="Fill with 0",
841
+ ),
842
+ "Category": st.column_config.SelectboxColumn(
843
+ options=[
844
+ "Media",
845
+ "Exogenous",
846
+ "Internal",
847
+ "Response Metrics",
848
+ ],
849
+ required=True,
850
+ default="Media",
851
+ ),
852
+ },
853
+ disabled=["Column", "Missing Values", "Missing Percentage"],
854
+ hide_index=True,
855
+ use_container_width=True,
856
+ )
857
+
858
+ # Apply changes based on edited DataFrame
859
+ for i, row in edited_stats_df.iterrows():
860
+ column = row["Column"]
861
+ if row["Impute Method"] == "Drop Column":
862
+ merged_df.drop(columns=[column], inplace=True)
863
+
864
+ elif row["Impute Method"] == "Fill with Mean":
865
+ merged_df[column].fillna(merged_df[column].mean(), inplace=True)
866
+
867
+ elif row["Impute Method"] == "Fill with Median":
868
+ merged_df[column].fillna(merged_df[column].median(), inplace=True)
869
+
870
+ elif row["Impute Method"] == "Fill with 0":
871
+ merged_df[column].fillna(0, inplace=True)
872
+
873
+
874
+ #########################################################################################################################################################
875
+ # Group columns
876
+ #########################################################################################################################################################
877
+
878
+
879
+ # Display Group columns header
880
+ st.markdown("#### Feature engineering")
881
+
882
+ # Prepare the numeric columns and an empty DataFrame for user input
883
+ numeric_columns, default_df = prepare_numeric_columns_and_default_df(
884
+ merged_df, edited_stats_df
885
+ )
886
+
887
+ # Display editable Dataframe
888
+ edited_df = st.data_editor(
889
+ default_df,
890
+ column_config={
891
+ "Column 1": st.column_config.SelectboxColumn(
892
+ options=numeric_columns,
893
+ required=True,
894
+ default=numeric_columns[0],
895
+ width=400,
896
+ ),
897
+ "Operator": st.column_config.SelectboxColumn(
898
+ options=["+", "-", "*", "/"],
899
+ required=True,
900
+ default="+",
901
+ width=100,
902
+ ),
903
+ "Column 2": st.column_config.SelectboxColumn(
904
+ options=numeric_columns,
905
+ required=True,
906
+ default=numeric_columns[0],
907
+ width=400,
908
+ ),
909
+ "Category": st.column_config.SelectboxColumn(
910
+ options=[
911
+ "Media",
912
+ "Exogenous",
913
+ "Internal",
914
+ "Response Metrics",
915
+ ],
916
+ required=True,
917
+ default="Media",
918
+ width=200,
919
+ ),
920
+ },
921
+ num_rows="dynamic",
922
+ )
923
+
924
+ # Process the DataFrame based on user inputs and operations specified in edited_df
925
+ final_df, edited_stats_df = process_dataframes(merged_df, edited_df, edited_stats_df)
926
+
927
+
928
+ #########################################################################################################################################################
929
+ # Display the Final DataFrame and variables
930
+ #########################################################################################################################################################
931
+
932
+
933
+ # Display the Final DataFrame and variables
934
+ st.markdown("#### Final DataFrame")
935
+ st.dataframe(final_df, hide_index=True)
936
+
937
+ # Initialize an empty dictionary to hold categories and their variables
938
+ category_dict = {}
939
+
940
+ # Iterate over each row in the edited DataFrame to populate the dictionary
941
+ for i, row in edited_stats_df.iterrows():
942
+ column = row["Column"]
943
+ category = row["Category"] # The category chosen by the user for this variable
944
+
945
+ # Check if the category already exists in the dictionary
946
+ if category not in category_dict:
947
+ # If not, initialize it with the current column as its first element
948
+ category_dict[category] = [column]
949
+ else:
950
+ # If it exists, append the current column to the list of variables under this category
951
+ category_dict[category].append(column)
952
+
953
+ # Add Date, Panel_1 and Panel_2 to the category dictionary
954
+ category_dict.update({"Date": ["date"]})
955
+ if "Panel_1" in final_df.columns:
956
+ category_dict["Panel Level 1"] = ["Panel_1"]
957
+ if "Panel_2" in final_df.columns:
958
+ category_dict["Panel Level 2"] = ["Panel_2"]
959
+
960
+ # Display the dictionary
961
+ st.markdown("#### Variable Category")
962
+ for category, variables in category_dict.items():
963
+ # Check if there are multiple variables to handle "and" insertion correctly
964
+ if len(variables) > 1:
965
+ # Join all but the last variable with ", ", then add " and " before the last variable
966
+ variables_str = ", ".join(variables[:-1]) + " and " + variables[-1]
967
+ else:
968
+ # If there's only one variable, no need for "and"
969
+ variables_str = variables[0]
970
+
971
+ # Display the category and its variables in the desired format
972
+ st.markdown(
973
+ f"<div style='text-align: justify;'><strong>{category}:</strong> {variables_str}</div>",
974
+ unsafe_allow_html=True,
975
+ )
976
+
977
+ # Function to check if Response Metrics is selected
978
+ st.write("")
979
+ response_metrics_col = category_dict.get("Response Metrics", [])
980
+ if len(response_metrics_col) == 0:
981
+ st.warning("Please select Response Metrics column", icon="⚠️")
982
+ st.stop()
983
+ # elif len(response_metrics_col) > 1:
984
+ # st.warning("Please select only one Response Metrics column", icon="⚠️")
985
+ # st.stop()
986
+
987
+ # Store final dataframe and bin dictionary into session state
988
+ st.session_state["final_df"], st.session_state["bin_dict"] = final_df, category_dict
989
+
990
+ # Save the DataFrame and dictionary from the session state to the pickle file
991
+ if st.button("Accept and Save", use_container_width=True):
992
+ save_to_pickle(
993
+ "data_import.pkl", st.session_state["final_df"], st.session_state["bin_dict"]
994
+ )
995
+ st.toast("💾 Saved Successfully!")
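On "Accept and Save" the script above serializes the final DataFrame and the category dictionary to data_import.pkl via save_to_pickle. For context, a minimal sketch of how a downstream page could read that file back, assuming the same path and keys used in this script (the snippet below is illustrative, not part of the commit):

import pickle

# Minimal sketch: load the objects written by save_to_pickle ("data_import.pkl")
with open("data_import.pkl", "rb") as f:
    saved = pickle.load(f)

final_df = saved["final_df"]  # merged, imputed dataset
bin_dict = saved["bin_dict"]  # category name -> list of column names

# Example: list the columns tagged as Response Metrics
print(bin_dict.get("Response Metrics", []))
print(final_df.shape)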
Data_Import .py ADDED
@@ -0,0 +1,1019 @@
1
+ # Importing necessary libraries
2
+ import streamlit as st
3
+
4
+ st.set_page_config(
5
+ page_title="Data Import",
6
+ page_icon=":shark:",
7
+ layout="wide",
8
+ initial_sidebar_state="collapsed",
9
+ )
10
+
11
+ import pickle
12
+ import pandas as pd
13
+ from utilities import set_header, load_local_css
14
+ import streamlit_authenticator as stauth
15
+ import yaml
16
+ from yaml import SafeLoader
17
+
18
+ load_local_css("styles.css")
19
+ set_header()
20
+
21
+
22
+ for k, v in st.session_state.items():
23
+ if k not in ["logout", "login", "config"] and not k.startswith(
24
+ "FormSubmitter"
25
+ ):
26
+ st.session_state[k] = v
27
+ with open("config.yaml") as file:
28
+ config = yaml.load(file, Loader=SafeLoader)
29
+ st.session_state["config"] = config
30
+ authenticator = stauth.Authenticate(
31
+ config["credentials"],
32
+ config["cookie"]["name"],
33
+ config["cookie"]["key"],
34
+ config["cookie"]["expiry_days"],
35
+ config["preauthorized"],
36
+ )
37
+ st.session_state["authenticator"] = authenticator
38
+ name, authentication_status, username = authenticator.login("Login", "main")
39
+ auth_status = st.session_state.get("authentication_status")
40
+
41
+ if auth_status == True:
42
+ authenticator.logout("Logout", "main")
43
+ is_state_initialized = st.session_state.get("initialized", False)
44
+
45
+ if not is_state_initialized:
46
+
47
+ if 'session_name' not in st.session_state:
48
+ st.session_state['session_name']=None
49
+
50
+
51
+ # Function to validate date column in dataframe
52
+ def validate_date_column(df):
53
+ try:
54
+ # Attempt to convert the 'Date' column to datetime
55
+ df["date"] = pd.to_datetime(df["date"], format="%d-%m-%Y")
56
+ return True
57
+ except:
58
+ return False
59
+
60
+
61
+ # Function to determine data interval
62
+ def determine_data_interval(common_freq):
63
+ if common_freq == 1:
64
+ return "daily"
65
+ elif common_freq == 7:
66
+ return "weekly"
67
+ elif 28 <= common_freq <= 31:
68
+ return "monthly"
69
+ else:
70
+ return "irregular"
71
+
72
+
73
+ # Function to read each uploaded Excel file into a pandas DataFrame and store them in a dictionary
+ @st.cache_resource(show_spinner=False)
+ def files_to_dataframes(uploaded_files):
78
+ df_dict = {}
79
+ for uploaded_file in uploaded_files:
80
+ # Extract file name without extension
81
+ file_name = uploaded_file.name.rsplit(".", 1)[0]
82
+
83
+ # Check for duplicate file names
84
+ if file_name in df_dict:
85
+ st.warning(
86
+ f"Duplicate File: {file_name}. This file will be skipped.",
87
+ icon="⚠️",
88
+ )
89
+ continue
90
+
91
+ # Read the file into a DataFrame
92
+ df = pd.read_excel(uploaded_file)
93
+
94
+ # Convert all column names to lowercase
95
+ df.columns = df.columns.str.lower().str.strip()
96
+
97
+ # Separate numeric and non-numeric columns
98
+ numeric_cols = list(df.select_dtypes(include=["number"]).columns)
99
+ non_numeric_cols = [
100
+ col
101
+ for col in df.select_dtypes(exclude=["number"]).columns
102
+ if col.lower() != "date"
103
+ ]
104
+
105
+ # Check for 'Date' column
106
+ if not (validate_date_column(df) and len(numeric_cols) > 0):
107
+ st.warning(
108
+ f"File Name: {file_name} ➜ Please upload data with Date column in 'DD-MM-YYYY' format and at least one media/exogenous column. This file will be skipped.",
109
+ icon="⚠️",
110
+ )
111
+ continue
112
+
113
+ # Check for interval
114
+ common_freq = (
115
+ pd.Series(df["date"].unique()).diff().dt.days.dropna().mode()[0]
116
+ )
117
+ # Calculate the data interval (daily, weekly, monthly or irregular)
118
+ interval = determine_data_interval(common_freq)
119
+ if interval == "irregular":
120
+ st.warning(
121
+ f"File Name: {file_name} ➜ Please upload data in daily, weekly or monthly interval. This file will be skipped.",
122
+ icon="⚠️",
123
+ )
124
+ continue
125
+
126
+ # Store both DataFrames in the dictionary under their respective keys
127
+ df_dict[file_name] = {
128
+ "numeric": numeric_cols,
129
+ "non_numeric": non_numeric_cols,
130
+ "interval": interval,
131
+ "df": df,
132
+ }
133
+
134
+ return df_dict
135
+
136
+
137
+ # Function to adjust dataframe granularity
138
+ def adjust_dataframe_granularity(df, current_granularity, target_granularity):
139
+ # Set index
140
+ df.set_index("date", inplace=True)
141
+
142
+ # Define aggregation rules for resampling
143
+ aggregation_rules = {
144
+ col: "sum" if pd.api.types.is_numeric_dtype(df[col]) else "first"
145
+ for col in df.columns
146
+ }
147
+
148
+ # Initialize resampled_df
149
+ resampled_df = df
150
+ if current_granularity == "daily" and target_granularity == "weekly":
151
+ resampled_df = df.resample("W-MON", closed="left", label="left").agg(
152
+ aggregation_rules
153
+ )
154
+
155
+ elif current_granularity == "daily" and target_granularity == "monthly":
156
+ resampled_df = df.resample("MS", closed="left", label="left").agg(
157
+ aggregation_rules
158
+ )
159
+
160
+ elif current_granularity == "daily" and target_granularity == "daily":
161
+ resampled_df = df.resample("D").agg(aggregation_rules)
162
+
163
+ elif current_granularity in ["weekly", "monthly"] and target_granularity == "daily":
164
+ # For higher to lower granularity, distribute numeric and replicate non-numeric values equally across the new period
165
+ expanded_data = []
166
+ for _, row in df.iterrows():
167
+ if current_granularity == "weekly":
168
+ period_range = pd.date_range(start=row.name, periods=7)
169
+ elif current_granularity == "monthly":
170
+ period_range = pd.date_range(
171
+ start=row.name, periods=row.name.days_in_month
172
+ )
173
+
174
+ for date in period_range:
175
+ new_row = {}
176
+ for col in df.columns:
177
+ if pd.api.types.is_numeric_dtype(df[col]):
178
+ if current_granularity == "weekly":
179
+ new_row[col] = row[col] / 7
180
+ elif current_granularity == "monthly":
181
+ new_row[col] = row[col] / row.name.days_in_month
182
+ else:
183
+ new_row[col] = row[col]
184
+ expanded_data.append((date, new_row))
185
+
186
+ resampled_df = pd.DataFrame(
187
+ [data for _, data in expanded_data],
188
+ index=[date for date, _ in expanded_data],
189
+ )
190
+
191
+ # Reset index
192
+ resampled_df = resampled_df.reset_index().rename(columns={"index": "date"})
193
+
194
+ return resampled_df
195
+
196
+
197
+ # Function to clean and extract unique values of Panel_1 and Panel_2
198
+ @st.cache_resource(show_spinner=False)
+ def clean_and_extract_unique_values(files_dict, selections):
202
+ all_panel1_values = set()
203
+ all_panel2_values = set()
204
+
205
+ for file_name, file_data in files_dict.items():
206
+ df = file_data["df"]
207
+
208
+ # 'Panel_1' and 'Panel_2' selections
209
+ selected_panel1 = selections[file_name].get("Panel_1")
210
+ selected_panel2 = selections[file_name].get("Panel_2")
211
+
212
+ # Clean and standardize Panel_1 column if it exists and is selected
213
+ if (
214
+ selected_panel1
215
+ and selected_panel1 != "N/A"
216
+ and selected_panel1 in df.columns
217
+ ):
218
+ df[selected_panel1] = (
219
+ df[selected_panel1].str.lower().str.strip().str.replace("_", " ")
220
+ )
221
+ all_panel1_values.update(df[selected_panel1].dropna().unique())
222
+
223
+ # Clean and standardize Panel_2 column if it exists and is selected
224
+ if (
225
+ selected_panel2
226
+ and selected_panel2 != "N/A"
227
+ and selected_panel2 in df.columns
228
+ ):
229
+ df[selected_panel2] = (
230
+ df[selected_panel2].str.lower().str.strip().str.replace("_", " ")
231
+ )
232
+ all_panel2_values.update(df[selected_panel2].dropna().unique())
233
+
234
+ # Update the processed DataFrame back in the dictionary
235
+ files_dict[file_name]["df"] = df
236
+
237
+ return all_panel1_values, all_panel2_values
238
+
239
+
240
+ # Function to format values for display
241
+ @st.cache_resource(show_spinner=False)
+ def format_values_for_display(values_list):
245
+ # Capitalize the first letter of each word and replace underscores with spaces
246
+ formatted_list = [value.replace("_", " ").title() for value in values_list]
247
+ # Join values with commas and 'and' before the last value
248
+ if len(formatted_list) > 1:
249
+ return ", ".join(formatted_list[:-1]) + ", and " + formatted_list[-1]
250
+ elif formatted_list:
251
+ return formatted_list[0]
252
+ return "No values available"
253
+
254
+
255
+ # Function to normalize all data within files_dict to a daily granularity
+ @st.cache(show_spinner=False, allow_output_mutation=True)
+ def standardize_data_to_daily(files_dict, selections):
260
+ # Normalize all data to a daily granularity using a provided function
261
+ files_dict = apply_granularity_to_all(files_dict, "daily", selections)
262
+
263
+ # Update the "interval" attribute for each dataset to indicate the new granularity
264
+ for files_name, files_data in files_dict.items():
265
+ files_data["interval"] = "daily"
266
+
267
+ return files_dict
268
+
269
+
270
+ # Function to apply granularity transformation to all DataFrames in files_dict
271
+ @st.cache_resource(show_spinner=False)
+ def apply_granularity_to_all(files_dict, granularity_selection, selections):
275
+ for file_name, file_data in files_dict.items():
276
+ df = file_data["df"].copy()
277
+
278
+ # Handling when Panel_1 or Panel_2 might be 'N/A'
279
+ selected_panel1 = selections[file_name].get("Panel_1")
280
+ selected_panel2 = selections[file_name].get("Panel_2")
281
+
282
+ # Correcting the segment selection logic & handling 'N/A'
283
+ if selected_panel1 != "N/A" and selected_panel2 != "N/A":
284
+ unique_combinations = df[
285
+ [selected_panel1, selected_panel2]
286
+ ].drop_duplicates()
287
+ elif selected_panel1 != "N/A":
288
+ unique_combinations = df[[selected_panel1]].drop_duplicates()
289
+ selected_panel2 = None # Ensure Panel_2 is ignored if N/A
290
+ elif selected_panel2 != "N/A":
291
+ unique_combinations = df[[selected_panel2]].drop_duplicates()
292
+ selected_panel1 = None # Ensure Panel_1 is ignored if N/A
293
+ else:
294
+ # If both are 'N/A', process the entire dataframe as is
295
+ df = adjust_dataframe_granularity(
296
+ df, file_data["interval"], granularity_selection
297
+ )
298
+ files_dict[file_name]["df"] = df
299
+ continue # Skip to the next file
300
+
301
+ transformed_segments = []
302
+ for _, combo in unique_combinations.iterrows():
303
+ if selected_panel1 and selected_panel2:
304
+ segment = df[
305
+ (df[selected_panel1] == combo[selected_panel1])
306
+ & (df[selected_panel2] == combo[selected_panel2])
307
+ ]
308
+ elif selected_panel1:
309
+ segment = df[df[selected_panel1] == combo[selected_panel1]]
310
+ elif selected_panel2:
311
+ segment = df[df[selected_panel2] == combo[selected_panel2]]
312
+
313
+ # Adjust granularity of the segment
314
+ transformed_segment = adjust_dataframe_granularity(
315
+ segment, file_data["interval"], granularity_selection
316
+ )
317
+ transformed_segments.append(transformed_segment)
318
+
319
+ # Combine all transformed segments into a single DataFrame for this file
320
+ transformed_df = pd.concat(transformed_segments, ignore_index=True)
321
+ files_dict[file_name]["df"] = transformed_df
322
+
323
+ return files_dict
324
+
325
+
326
+ # Function to create main dataframe structure
327
+ @st.cache_resource(show_spinner=False)
+ def create_main_dataframe(
331
+ files_dict, all_panel1_values, all_panel2_values, granularity_selection
332
+ ):
333
+ # Determine the global start and end dates across all DataFrames
334
+ global_start = min(df["df"]["date"].min() for df in files_dict.values())
335
+ global_end = max(df["df"]["date"].max() for df in files_dict.values())
336
+
337
+ # Adjust the date_range generation based on the granularity_selection
338
+ if granularity_selection == "weekly":
339
+ # Generate a weekly range, with weeks starting on Monday
340
+ date_range = pd.date_range(start=global_start, end=global_end, freq="W-MON")
341
+ elif granularity_selection == "monthly":
342
+ # Generate a monthly range, starting from the first day of each month
343
+ date_range = pd.date_range(start=global_start, end=global_end, freq="MS")
344
+ else: # Default to daily if not weekly or monthly
345
+ date_range = pd.date_range(start=global_start, end=global_end, freq="D")
346
+
347
+ # Collect all unique Panel_1 and Panel_2 values, excluding 'N/A'
348
+ all_panel1s = all_panel1_values
349
+ all_panel2s = all_panel2_values
350
+
351
+ # Dynamically build the list of dimensions (Panel_1, Panel_2) to include in the main DataFrame based on availability
352
+ dimensions, merge_keys = [], []
353
+ if all_panel1s:
354
+ dimensions.append(all_panel1s)
355
+ merge_keys.append("Panel_1")
356
+ if all_panel2s:
357
+ dimensions.append(all_panel2s)
358
+ merge_keys.append("Panel_2")
359
+
360
+ dimensions.append(date_range) # Date range is always included
361
+ merge_keys.append("date") # Date range is always included
362
+
363
+ # Create a main DataFrame template with the dimensions
364
+ main_df = pd.MultiIndex.from_product(
365
+ dimensions,
366
+ names=[name for name, _ in zip(merge_keys, dimensions)],
367
+ ).to_frame(index=False)
368
+
369
+ return main_df.reset_index(drop=True)
370
+
371
+
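For intuition, the MultiIndex.from_product call above builds a cartesian scaffold of every panel value against every date in the global range, which the later merges then fill in. A small illustration with hypothetical panel values:

import pandas as pd

panel1_values = ["us", "uk"]  # hypothetical Panel_1 values
date_range = pd.date_range("2023-01-02", periods=3, freq="W-MON")
scaffold = pd.MultiIndex.from_product(
    [panel1_values, date_range], names=["Panel_1", "date"]
).to_frame(index=False)
# scaffold has 2 x 3 = 6 rows: every Panel_1 value paired with every week start.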
372
+ # Function to prepare and merge dataFrames
373
+ @st.cache_resource(show_spinner=False)
374
+
375
+
376
+ def merge_into_main_df(main_df, files_dict, selections):
377
+ for file_name, file_data in files_dict.items():
378
+ df = file_data["df"].copy()
379
+
380
+ # Rename selected Panel_1 and Panel_2 columns if not 'N/A'
381
+ selected_panel1 = selections[file_name].get("Panel_1", "N/A")
382
+ selected_panel2 = selections[file_name].get("Panel_2", "N/A")
383
+ if selected_panel1 != "N/A":
384
+ df.rename(columns={selected_panel1: "Panel_1"}, inplace=True)
385
+ if selected_panel2 != "N/A":
386
+ df.rename(columns={selected_panel2: "Panel_2"}, inplace=True)
387
+
388
+ # Merge current DataFrame into main_df based on 'date', and where applicable, 'Panel_1' and 'Panel_2'
389
+ merge_keys = ["date"]
390
+ if "Panel_1" in df.columns:
391
+ merge_keys.append("Panel_1")
392
+ if "Panel_2" in df.columns:
393
+ merge_keys.append("Panel_2")
394
+ main_df = pd.merge(main_df, df, on=merge_keys, how="left")
395
+
396
+ # After all merges, sort by 'date' and reset index for cleanliness
397
+ sort_by = ["date"]
398
+ if "Panel_1" in main_df.columns:
399
+ sort_by.append("Panel_1")
400
+ if "Panel_2" in main_df.columns:
401
+ sort_by.append("Panel_2")
402
+ main_df.sort_values(by=sort_by, inplace=True)
403
+ main_df.reset_index(drop=True, inplace=True)
404
+
405
+ return main_df
406
+
407
+
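Each uploaded file is left-joined onto that scaffold, so panel/date combinations a file does not cover surface as NaN and are handled by the imputation step further down. A toy example of the join behaviour, with hypothetical column names:

import pandas as pd

scaffold = pd.DataFrame({
    "date": pd.to_datetime(["2023-01-02", "2023-01-02", "2023-01-09", "2023-01-09"]),
    "Panel_1": ["us", "uk", "us", "uk"],
})
spend = pd.DataFrame({
    "date": pd.to_datetime(["2023-01-02"]),
    "Panel_1": ["us"],
    "tv_spend": [100.0],
})
merged = pd.merge(scaffold, spend, on=["date", "Panel_1"], how="left")
# Only the (2023-01-02, us) row gets tv_spend = 100.0; the other rows stay NaN.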
408
+ # Function to categorize column
409
+ def categorize_column(column_name):
410
+ # Define keywords for each category
411
+ internal_keywords = [
412
+ "Price",
413
+ "Discount",
414
+ "product_price",
415
+ "cost",
416
+ "margin",
417
+ "inventory",
418
+ "sales",
419
+ "revenue",
420
+ "turnover",
421
+ "expense",
422
+ ]
423
+ exogenous_keywords = [
424
+ "GDP",
425
+ "Tax",
426
+ "Inflation",
427
+ "interest_rate",
428
+ "employment_rate",
429
+ "exchange_rate",
430
+ "consumer_spending",
431
+ "retail_sales",
432
+ "oil_prices",
433
+ "weather",
434
+ ]
435
+
436
+ # Check if the column name matches any of the keywords for Internal or Exogenous categories
437
+ for keyword in internal_keywords:
438
+ if keyword.lower() in column_name.lower():
439
+ return "Internal"
440
+ for keyword in exogenous_keywords:
441
+ if keyword.lower() in column_name.lower():
442
+ return "Exogenous"
443
+
444
+ # Default to Media if no match found
445
+ return "Media"
446
+
447
+
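The bucketing above is a case-insensitive substring match against the keyword lists, so for example:

categorize_column("avg_product_price")     # -> "Internal"  ("price" matches)
categorize_column("gdp_index")             # -> "Exogenous" ("gdp" matches)
categorize_column("facebook_impressions")  # -> "Media"     (no keyword hit, falls through to the default)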
448
+ # Function to calculate missing stats and prepare for editable DataFrame
449
+ @st.cache_resource(show_spinner=False)
450
+
451
+
452
+ def prepare_missing_stats_df(df):
453
+ missing_stats = []
454
+ for column in df.columns:
455
+ if (
456
+ column == "date" or column == "Panel_2" or column == "Panel_1"
457
+ ): # Skip Date, Panel_1 and Panel_2 column
458
+ continue
459
+
460
+ missing = df[column].isnull().sum()
461
+ pct_missing = round((missing / len(df)) * 100, 2)
462
+
463
+ # Dynamically assign category based on column name
464
+ category = categorize_column(column)
465
+ # category = "Media" # Keep default bin as Media
466
+
467
+ missing_stats.append(
468
+ {
469
+ "Column": column,
470
+ "Missing Values": missing,
471
+ "Missing Percentage": pct_missing,
472
+ "Impute Method": "Fill with 0", # Default value
473
+ "Category": category,
474
+ }
475
+ )
476
+ stats_df = pd.DataFrame(missing_stats)
477
+
478
+ return stats_df
479
+
480
+
481
+ # Function to add API DataFrame details to the files dictionary
482
+ @st.cache_resource(show_spinner=False)
483
+
484
+
485
+ def add_api_dataframe_to_dict(main_df, files_dict):
486
+ files_dict["API"] = {
487
+ "numeric": list(main_df.select_dtypes(include=["number"]).columns),
488
+ "non_numeric": [
489
+ col
490
+ for col in main_df.select_dtypes(exclude=["number"]).columns
491
+ if col.lower() != "date"
492
+ ],
493
+ "interval": determine_data_interval(
494
+ pd.Series(main_df["date"].unique()).diff().dt.days.dropna().mode()[0]
495
+ ),
496
+ "df": main_df,
497
+ }
498
+
499
+ return files_dict
500
+
501
+
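The interval recorded here is inferred from the most common gap between consecutive unique dates, via determine_data_interval (defined earlier in the file; assumed to map a 1-day gap to daily, 7 days to weekly, and so on). A small illustration of the inference input:

import pandas as pd

dates = pd.Series(pd.to_datetime(["2023-01-02", "2023-01-09", "2023-01-16", "2023-01-23"]))
most_common_gap = dates.diff().dt.days.dropna().mode()[0]  # 7 -> treated as weekly data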
502
+ # Function to read the API data into a DataFrame, parsing specified columns as datetime
503
+ @st.cache_resource(show_spinner=False)
504
+ def read_API_data():
505
+ return pd.read_excel(r".\upf_data_converted_randomized_resp_metrics.xlsx", parse_dates=["Date"])
506
+
507
+
508
+ # Function to set the 'Panel_1_Panel_2_Selected' session state variable to False
509
+ def set_Panel_1_Panel_2_Selected_false():
510
+ st.session_state["Panel_1_Panel_2_Selected"] = False
511
+
512
+
513
+ # Function to serialize and save the objects into a pickle file
514
+ @st.cache_resource(show_spinner=False)
515
+ def save_to_pickle(file_path, final_df, bin_dict):
516
+ # Open the file in write-binary mode and dump the objects
517
+ with open(file_path, "wb") as f:
518
+ pickle.dump({"final_df": final_df, "bin_dict": bin_dict}, f)
519
+ # Data is now saved to file
520
+
521
+
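Downstream pages can read this bundle back with the mirror image of the call above; a minimal sketch, assuming the same file path and keys:

import pickle

with open("data_import.pkl", "rb") as f:
    bundle = pickle.load(f)
final_df, bin_dict = bundle["final_df"], bundle["bin_dict"]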
522
+ # Function to process the merged_df DataFrame based on operations defined in edited_df
523
+ @st.cache_resource(show_spinner=False)
524
+ def process_dataframes(merged_df, edited_df, edited_stats_df):
525
+ # Ensure there are operations defined by the user
526
+ if edited_df.empty:
527
+ return merged_df, edited_stats_df # No operations to apply
528
+
529
+ # Perform operations as defined by the user
530
+ for index, row in edited_df.iterrows():
531
+ result_column_name = f"{row['Column 1']}{row['Operator']}{row['Column 2']}"
532
+ col1 = row["Column 1"]
533
+ col2 = row["Column 2"]
534
+ op = row["Operator"]
535
+
536
+ # Apply the specified operation
537
+ if op == "+":
538
+ merged_df[result_column_name] = merged_df[col1] + merged_df[col2]
539
+ elif op == "-":
540
+ merged_df[result_column_name] = merged_df[col1] - merged_df[col2]
541
+ elif op == "*":
542
+ merged_df[result_column_name] = merged_df[col1] * merged_df[col2]
543
+ elif op == "/":
544
+ merged_df[result_column_name] = merged_df[col1] / merged_df[col2].replace(
545
+ 0, 1e-9
546
+ )
547
+
548
+ # Add summary of operation to edited_stats_df
549
+ new_row = {
550
+ "Column": result_column_name,
551
+ "Missing Values": None,
552
+ "Missing Percentage": None,
553
+ "Impute Method": None,
554
+ "Category": row["Category"],
555
+ }
556
+ new_row_df = pd.DataFrame([new_row])
557
+
558
+ # Use pd.concat to add the new_row_df to edited_stats_df
559
+ edited_stats_df = pd.concat(
560
+ [edited_stats_df, new_row_df], ignore_index=True, axis=0
561
+ )
562
+
563
+ # Combine column names from edited_df for cleanup
564
+ combined_columns = set(edited_df["Column 1"]).union(set(edited_df["Column 2"]))
565
+
566
+ # Filter out rows in edited_stats_df and drop columns from merged_df
567
+ edited_stats_df = edited_stats_df[~edited_stats_df["Column"].isin(combined_columns)]
568
+ merged_df.drop(columns=list(combined_columns), errors="ignore", inplace=True)
569
+
570
+ return merged_df, edited_stats_df
571
+
572
+
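In other words, each row of edited_df turns two source columns into one derived column named after the expression, and the source columns are then dropped. For a hypothetical row ("spend_a", "/", "spend_b"):

import pandas as pd

df = pd.DataFrame({"spend_a": [10.0, 5.0], "spend_b": [2.0, 0.0]})
# Division guards against zero denominators the same way as above.
df["spend_a/spend_b"] = df["spend_a"] / df["spend_b"].replace(0, 1e-9)
df = df.drop(columns=["spend_a", "spend_b"])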
573
+ # Function to prepare a list of numeric column names and initialize an empty DataFrame with predefined structure
574
+ @st.cache_resource(show_spinner=False)
575
+
576
+
577
+ def prepare_numeric_columns_and_default_df(merged_df, edited_stats_df):
578
+ # Get columns categorized as 'Response Metrics'
579
+ columns_response_metrics = edited_stats_df[
580
+ edited_stats_df["Category"] == "Response Metrics"
581
+ ]["Column"].tolist()
582
+
583
+ # Filter numeric columns, excluding those categorized as 'Response Metrics'
584
+ numeric_columns = [
585
+ col
586
+ for col in merged_df.select_dtypes(include=["number"]).columns
587
+ if col not in columns_response_metrics
588
+ ]
589
+
590
+ # Define the structure of the empty DataFrame
591
+ data = {
592
+ "Column 1": pd.Series([], dtype="str"),
593
+ "Operator": pd.Series([], dtype="str"),
594
+ "Column 2": pd.Series([], dtype="str"),
595
+ "Category": pd.Series([], dtype="str"),
596
+ }
597
+ default_df = pd.DataFrame(data)
598
+
599
+ return numeric_columns, default_df
600
+
601
+
602
+ # Initialize 'final_df' in session state
603
+ if "final_df" not in st.session_state:
604
+ st.session_state["final_df"] = pd.DataFrame()
605
+
606
+ # Initialize 'bin_dict' in session state
607
+ if "bin_dict" not in st.session_state:
608
+ st.session_state["bin_dict"] = {}
609
+
610
+ # Initialize 'Panel_1_Panel_2_Selected' in session state
611
+ if "Panel_1_Panel_2_Selected" not in st.session_state:
612
+ st.session_state["Panel_1_Panel_2_Selected"] = False
613
+
614
+
615
+ # Page Title
616
+ st.write("") # Top padding
617
+ st.title("Data Import")
618
+
619
+
620
+ #########################################################################################################################################################
621
+ # Create a dictionary to hold all DataFrames and collect user input to specify "Panel_2" and "Panel_1" columns for each file
622
+ #########################################################################################################################################################
623
+
624
+
625
+ # Read the Excel file, parsing 'Date' column as datetime
626
+ main_df = read_API_data()
627
+
628
+ # Convert all column names to lowercase
629
+ main_df.columns = main_df.columns.str.lower().str.strip()
630
+
631
+ # File uploader
632
+ uploaded_files = st.file_uploader(
633
+ "Upload additional data",
634
+ type=["xlsx"],
635
+ accept_multiple_files=True,
636
+ on_change=set_Panel_1_Panel_2_Selected_false,
637
+ )
638
+
639
+ # Custom HTML for upload instructions
640
+ recommendation_html = f"""
641
+ <div style="text-align: justify;">
642
+ <strong>Recommendation:</strong> For optimal processing, please ensure that all uploaded datasets (panel, media, internal, and exogenous data) adhere to the following guidelines: each dataset must include a <code>Date</code> column formatted as <code>DD-MM-YYYY</code> and be free of missing values.
643
+ </div>
644
+ """
645
+ st.markdown(recommendation_html, unsafe_allow_html=True)
646
+
647
+ # Choose Desired Granularity
648
+ st.markdown("#### Choose Desired Granularity")
649
+ # Granularity Selection
650
+ granularity_selection = st.selectbox(
651
+ "Choose Date Granularity",
652
+ ["Daily", "Weekly", "Monthly"],
653
+ label_visibility="collapsed",
654
+ on_change=set_Panel_1_Panel_2_Selected_false,
655
+ )
656
+ granularity_selection = str(granularity_selection).lower()
657
+
658
+ # Convert files to dataframes
659
+ files_dict = files_to_dataframes(uploaded_files)
660
+
661
+ # Add API Dataframe
662
+ if main_df is not None:
663
+ files_dict = add_api_dataframe_to_dict(main_df, files_dict)
664
+
665
+ # Display a warning message if no files have been uploaded and halt further execution
666
+ if not files_dict:
667
+ st.warning(
668
+ "Please upload at least one file to proceed.",
669
+ icon="⚠️",
670
+ )
671
+ st.stop() # Halts further execution until file is uploaded
672
+
673
+
674
+ # Select Panel_1 and Panel_2 columns
675
+ st.markdown("#### Select Panel columns")
676
+ selections = {}
677
+ with st.expander("Select Panel columns", expanded=False):
678
+ count = 0 # Initialize counter to manage the visibility of labels and keys
679
+ for file_name, file_data in files_dict.items():
680
+ # Determine visibility of the label based on the count
681
+ if count == 0:
682
+ label_visibility = "visible"
683
+ else:
684
+ label_visibility = "collapsed"
685
+
686
+ # Extract non-numeric columns
687
+ non_numeric_cols = file_data["non_numeric"]
688
+
689
+ # Prepare Panel_1 and Panel_2 values for dropdown, adding "N/A" as an option
690
+ panel1_values = non_numeric_cols + ["N/A"]
691
+ panel2_values = non_numeric_cols + ["N/A"]
692
+
693
+ # Skip if only one option is available
694
+ if len(panel1_values) == 1 and len(panel2_values) == 1:
695
+ selected_panel1, selected_panel2 = "N/A", "N/A"
696
+ # Update the selections for Panel_1 and Panel_2 for the current file
697
+ selections[file_name] = {
698
+ "Panel_1": selected_panel1,
699
+ "Panel_2": selected_panel2,
700
+ }
701
+ continue
702
+
703
+ # Create layout columns for File Name, Panel_2, and Panel_1 selections
704
+ file_name_col, Panel_1_col, Panel_2_col = st.columns([2, 4, 4])
705
+
706
+ with file_name_col:
707
+ # Display "File Name" label only for the first file
708
+ if count == 0:
709
+ st.write("File Name")
710
+ else:
711
+ st.write("")
712
+ st.write(file_name) # Display the file name
713
+
714
+ with Panel_1_col:
715
+ # Display a selectbox for Panel_1 values
716
+ selected_panel1 = st.selectbox(
717
+ "Select Panel Level 1",
718
+ panel1_values,
719
+ on_change=set_Panel_1_Panel_2_Selected_false,
720
+ label_visibility=label_visibility, # Control visibility of the label
721
+ key=f"Panel_1_selectbox{count}", # Ensure unique key for each selectbox
722
+ )
723
+
724
+ with Panel_2_col:
725
+ # Display a selectbox for Panel_2 values
726
+ selected_panel2 = st.selectbox(
727
+ "Select Panel Level 2",
728
+ panel2_values,
729
+ on_change=set_Panel_1_Panel_2_Selected_false,
730
+ label_visibility=label_visibility, # Control visibility of the label
731
+ key=f"Panel_2_selectbox{count}", # Ensure unique key for each selectbox
732
+ )
733
+
734
+ # Skip processing if the same column is selected for both Panel_1 and Panel_2 due to potential data integrity issues
735
+ if selected_panel2 == selected_panel1 and not (
736
+ selected_panel2 == "N/A" and selected_panel1 == "N/A"
737
+ ):
738
+ st.warning(
739
+ f"File: {file_name} → The same column cannot serve as both Panel_1 and Panel_2. Please adjust your selections.",
740
+ )
741
+ selected_panel1, selected_panel2 = "N/A", "N/A"
742
+ st.stop()
743
+
744
+ # Update the selections for Panel_1 and Panel_2 for the current file
745
+ selections[file_name] = {
746
+ "Panel_1": selected_panel1,
747
+ "Panel_2": selected_panel2,
748
+ }
749
+
750
+ count += 1 # Increment the counter after processing each file
751
+
752
+ # Accept Panel_1 and Panel_2 selection
753
+ if st.button("Accept and Process", use_container_width=True):
754
+
755
+ # Normalize all data to a daily granularity. This initial standardization simplifies subsequent conversions to other levels of granularity
756
+ with st.spinner("Processing..."):
757
+ files_dict = standardize_data_to_daily(files_dict, selections)
758
+
759
+ # Convert all data to the selected granularity
760
+ files_dict = apply_granularity_to_all(
761
+ files_dict, granularity_selection, selections
762
+ )
763
+
764
+ # Update the 'files_dict' in the session state
765
+ st.session_state["files_dict"] = files_dict
766
+
767
+ # Set a flag in the session state to indicate that selection has been made
768
+ st.session_state["Panel_1_Panel_2_Selected"] = True
769
+
770
+
771
+ #########################################################################################################################################################
772
+ # Display unique Panel_1 and Panel_2 values
773
+ #########################################################################################################################################################
774
+
775
+
776
+ # Halts further execution until Panel_1 and Panel_2 columns are selected
777
+ if "files_dict" in st.session_state and st.session_state["Panel_1_Panel_2_Selected"]:
778
+ files_dict = st.session_state["files_dict"]
779
+ else:
780
+ st.stop()
781
+
782
+ # Set to store unique values of Panel_1 and Panel_2
783
+ with st.spinner("Fetching Panel values..."):
784
+ all_panel1_values, all_panel2_values = clean_and_extract_unique_values(
785
+ files_dict, selections
786
+ )
787
+
788
+ # List of Panel_1 and Panel_2 columns unique values
789
+ list_of_all_panel1_values = list(all_panel1_values)
790
+ list_of_all_panel2_values = list(all_panel2_values)
791
+
792
+ # Format Panel_1 and Panel_2 values for display
793
+ formatted_panel1_values = format_values_for_display(list_of_all_panel1_values)
794
+ formatted_panel2_values = format_values_for_display(list_of_all_panel2_values)
795
+
796
+ # Unique Panel_1 and Panel_2 values
797
+ st.markdown("#### Unique Panel values")
798
+ # Display Panel_1 and Panel_2 values
799
+ with st.expander("Unique Panel values"):
800
+ st.write("")
801
+ st.markdown(
802
+ f"""
803
+ <style>
804
+ .justify-text {{
805
+ text-align: justify;
806
+ }}
807
+ </style>
808
+ <div class="justify-text">
809
+ <strong>Panel Level 1 Values:</strong> {formatted_panel1_values}<br>
810
+ <strong>Panel Level 2 Values:</strong> {formatted_panel2_values}
811
+ </div>
812
+ """,
813
+ unsafe_allow_html=True,
814
+ )
815
+
816
+ # Display total Panel_1 and Panel_2
817
+ st.write("")
818
+ st.markdown(
819
+ f"""
820
+ <div style="text-align: justify;">
821
+ <strong>Number of Level 1 Panels detected:</strong> {len(list_of_all_panel1_values)}<br>
822
+ <strong>Number of Level 2 Panels detected:</strong> {len(list_of_all_panel2_values)}
823
+ </div>
824
+ """,
825
+ unsafe_allow_html=True,
826
+ )
827
+ st.write("")
828
+
829
+
830
+ #########################################################################################################################################################
831
+ # Merge all DataFrames
832
+ #########################################################################################################################################################
833
+
834
+
835
+ # Merge all DataFrames selected
836
+ main_df = create_main_dataframe(
837
+ files_dict, all_panel1_values, all_panel2_values, granularity_selection
838
+ )
839
+ merged_df = merge_into_main_df(main_df, files_dict, selections)
840
+
841
+
842
+ #########################################################################################################################################################
843
+ # Categorize Variables and Impute Missing Values
844
+ #########################################################################################################################################################
845
+
846
+
847
+ # Create an editable DataFrame in Streamlit
848
+ st.markdown("#### Select Variables Category & Impute Missing Values")
849
+
850
+ # Prepare missing stats DataFrame for editing
851
+ missing_stats_df = prepare_missing_stats_df(merged_df)
852
+
853
+ edited_stats_df = st.data_editor(
854
+ missing_stats_df,
855
+ column_config={
856
+ "Impute Method": st.column_config.SelectboxColumn(
857
+ options=[
858
+ "Drop Column",
859
+ "Fill with Mean",
860
+ "Fill with Median",
861
+ "Fill with 0",
862
+ ],
863
+ required=True,
864
+ default="Fill with 0",
865
+ ),
866
+ "Category": st.column_config.SelectboxColumn(
867
+ options=[
868
+ "Media",
869
+ "Exogenous",
870
+ "Internal",
871
+ "Response Metrics",
872
+ ],
873
+ required=True,
874
+ default="Media",
875
+ ),
876
+ },
877
+ disabled=["Column", "Missing Values", "Missing Percentage"],
878
+ hide_index=True,
879
+ use_container_width=True,
880
+ )
881
+
882
+ # Apply changes based on edited DataFrame
883
+ for i, row in edited_stats_df.iterrows():
884
+ column = row["Column"]
885
+ if row["Impute Method"] == "Drop Column":
886
+ merged_df.drop(columns=[column], inplace=True)
887
+
888
+ elif row["Impute Method"] == "Fill with Mean":
889
+ merged_df[column].fillna(merged_df[column].mean(), inplace=True)
890
+
891
+ elif row["Impute Method"] == "Fill with Median":
892
+ merged_df[column].fillna(merged_df[column].median(), inplace=True)
893
+
894
+ elif row["Impute Method"] == "Fill with 0":
895
+ merged_df[column].fillna(0, inplace=True)
896
+
897
+
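The three fill strategies above reduce to standard pandas calls; a minimal sketch on a toy series:

import pandas as pd

s = pd.Series([1.0, None, 3.0])
s.fillna(s.mean())     # Fill with Mean   -> 1.0, 2.0, 3.0
s.fillna(s.median())   # Fill with Median -> 1.0, 2.0, 3.0
s.fillna(0)            # Fill with 0      -> 1.0, 0.0, 3.0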
898
+ #########################################################################################################################################################
899
+ # Group columns
900
+ #########################################################################################################################################################
901
+
902
+
903
+ # Display Feature engineering header
904
+ st.markdown("#### Feature engineering")
905
+
906
+ # Prepare the numeric columns and an empty DataFrame for user input
907
+ numeric_columns, default_df = prepare_numeric_columns_and_default_df(
908
+ merged_df, edited_stats_df
909
+ )
910
+
911
+ # Display editable Dataframe
912
+ edited_df = st.data_editor(
913
+ default_df,
914
+ column_config={
915
+ "Column 1": st.column_config.SelectboxColumn(
916
+ options=numeric_columns,
917
+ required=True,
918
+ default=numeric_columns[0],
919
+ width=400,
920
+ ),
921
+ "Operator": st.column_config.SelectboxColumn(
922
+ options=["+", "-", "*", "/"],
923
+ required=True,
924
+ default="+",
925
+ width=100,
926
+ ),
927
+ "Column 2": st.column_config.SelectboxColumn(
928
+ options=numeric_columns,
929
+ required=True,
930
+ default=numeric_columns[0],
931
+ width=400,
932
+ ),
933
+ "Category": st.column_config.SelectboxColumn(
934
+ options=[
935
+ "Media",
936
+ "Exogenous",
937
+ "Internal",
938
+ "Response Metrics",
939
+ ],
940
+ required=True,
941
+ default="Media",
942
+ width=200,
943
+ ),
944
+ },
945
+ num_rows="dynamic",
946
+ )
947
+
948
+ # Process the DataFrame based on user inputs and operations specified in edited_df
949
+ final_df, edited_stats_df = process_dataframes(merged_df, edited_df, edited_stats_df)
950
+
951
+
952
+ #########################################################################################################################################################
953
+ # Display the Final DataFrame and variables
954
+ #########################################################################################################################################################
955
+
956
+
957
+ # Display the Final DataFrame and variables
958
+ st.markdown("#### Final DataFrame")
959
+ st.dataframe(final_df, hide_index=True)
960
+
961
+ # Initialize an empty dictionary to hold categories and their variables
962
+ category_dict = {}
963
+
964
+ # Iterate over each row in the edited DataFrame to populate the dictionary
965
+ for i, row in edited_stats_df.iterrows():
966
+ column = row["Column"]
967
+ category = row["Category"] # The category chosen by the user for this variable
968
+
969
+ # Check if the category already exists in the dictionary
970
+ if category not in category_dict:
971
+ # If not, initialize it with the current column as its first element
972
+ category_dict[category] = [column]
973
+ else:
974
+ # If it exists, append the current column to the list of variables under this category
975
+ category_dict[category].append(column)
976
+
977
+ # Add Date, Panel_1 and Panel_2 to the category dictionary
978
+ category_dict.update({"Date": ["date"]})
979
+ if "Panel_1" in final_df.columns:
980
+ category_dict["Panel Level 1"] = ["Panel_1"]
981
+ if "Panel_2" in final_df.columns:
982
+ category_dict["Panel Level 2"] = ["Panel_2"]
983
+
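The mapping that ends up saved as bin_dict therefore looks roughly like the following; the column names here are hypothetical, only the key structure is fixed by the code above:

example_bin_dict = {
    "Media": ["paid_search_clicks", "kwai_clicks"],
    "Exogenous": ["gdp_index"],
    "Response Metrics": ["app_installs"],
    "Date": ["date"],
    "Panel Level 1": ["Panel_1"],
}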
984
+ # Display the dictionary
985
+ st.markdown("#### Variable Category")
986
+ for category, variables in category_dict.items():
987
+ # Check if there are multiple variables to handle "and" insertion correctly
988
+ if len(variables) > 1:
989
+ # Join all but the last variable with ", ", then add " and " before the last variable
990
+ variables_str = ", ".join(variables[:-1]) + " and " + variables[-1]
991
+ else:
992
+ # If there's only one variable, no need for "and"
993
+ variables_str = variables[0]
994
+
995
+ # Display the category and its variables in the desired format
996
+ st.markdown(
997
+ f"<div style='text-align: justify;'><strong>{category}:</strong> {variables_str}</div>",
998
+ unsafe_allow_html=True,
999
+ )
1000
+
1001
+ # Check that a Response Metrics column has been selected
1002
+ st.write("")
1003
+ response_metrics_col = category_dict.get("Response Metrics", [])
1004
+ if len(response_metrics_col) == 0:
1005
+ st.warning("Please select Response Metrics column", icon="⚠️")
1006
+ st.stop()
1007
+ # elif len(response_metrics_col) > 1:
1008
+ # st.warning("Please select only one Response Metrics column", icon="⚠️")
1009
+ # st.stop()
1010
+
1011
+ # Store final dataframe and bin dictionary into session state
1012
+ st.session_state["final_df"], st.session_state["bin_dict"] = final_df, category_dict
1013
+
1014
+ # Save the DataFrame and dictionary from the session state to the pickle file
1015
+ if st.button("Accept and Save", use_container_width=True):
1016
+ save_to_pickle(
1017
+ "data_import.pkl", st.session_state["final_df"], st.session_state["bin_dict"]
1018
+ )
1019
+ st.toast("💾 Saved Successfully!")
Data_prep_functions.py CHANGED
@@ -86,76 +86,89 @@ def create_dual_axis_line_chart(date_series, promo_price_series, non_promo_price
86
  def to_percentage(value):
87
  return f'{value * 100:.1f}%'
88
 
89
-
90
- def plot_actual_vs_predicted(date, y, predicted_values, model, target_column=None, flag=None, repeat_all_years=False, is_panel=False):
91
- if flag is not None:
92
- fig = make_subplots(specs=[[{"secondary_y": True}]])
93
- else:
94
- fig = go.Figure()
95
-
96
- if is_panel:
97
- df = pd.DataFrame()
98
- df['date'] = date
99
- df['Actual'] = y
100
- df['Predicted'] = predicted_values
101
- df_agg = df.groupby('date').agg({'Actual': 'sum', 'Predicted': 'sum'}).reset_index()
102
- df_agg.columns = ['date', 'Actual', 'Predicted']
103
- assert len(df_agg) == pd.Series(date).nunique()
104
-
105
- fig.add_trace(go.Scatter(x=df_agg['date'], y=df_agg['Actual'], mode='lines', name='Actual', line=dict(color='#08083B')))
106
- fig.add_trace(go.Scatter(x=df_agg['date'], y=df_agg['Predicted'], mode='lines', name='Predicted', line=dict(color='#11B6BD')))
107
-
108
- else:
109
- fig.add_trace(go.Scatter(x=date, y=y, mode='lines', name='Actual', line=dict(color='#08083B')))
110
- fig.add_trace(go.Scatter(x=date, y=predicted_values, mode='lines', name='Predicted', line=dict(color='#11B6BD')))
111
-
112
- line_values = []
 
 
 
113
  if flag:
114
- min_date, max_date = flag[0], flag[1]
115
- min_week = datetime.strptime(str(min_date), "%Y-%m-%d").strftime("%U")
116
- max_week = datetime.strptime(str(max_date), "%Y-%m-%d").strftime("%U")
117
- month = pd.to_datetime(min_date).month
118
- day = pd.to_datetime(min_date).day
119
-
120
- if repeat_all_years:
121
- line_values = list(pd.Series(date).map(lambda x: 1 if (pd.Timestamp(x).week >= int(min_week)) & (pd.Timestamp(x).week <= int(max_week)) else 0))
122
- assert len(line_values) == len(date)
123
- fig.add_trace(go.Scatter(x=date, y=line_values, mode='lines', name='Flag', line=dict(color='#FF5733')), secondary_y=True)
124
- else:
125
- line_values = list(pd.Series(date).map(lambda x: 1 if (pd.Timestamp(x) >= pd.Timestamp(min_date)) and (pd.Timestamp(x) <= pd.Timestamp(max_date)) else 0))
126
- fig.add_trace(go.Scatter(x=date, y=line_values, mode='lines', name='Flag', line=dict(color='#FF5733')), secondary_y=True)
127
-
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  mape = mean_absolute_percentage_error(y, predicted_values)
 
 
129
  r2 = r2_score(y, predicted_values)
130
  adjr2 = 1 - (1 - r2) * (len(y) - 1) / (len(y) - len(model.fe_params) - 1)
131
 
 
132
  metrics_table = pd.DataFrame({
133
- 'Metric': ['MAPE', 'R-squared', 'AdjR-squared'],
134
- 'Value': [mape, r2, adjr2]
135
  })
136
-
137
- # Convert date to datetime
138
- date = pd.to_datetime(date)
139
-
140
- # Calculate the number of days between each tick based on the date range
141
- date_range = (max(date) - min(date)).days
142
- #x_axis_tick_spacing = max(1, date_range // 50) # Divide the date range by 14 to get approximately 15 ticks
143
-
144
  fig.update_layout(
145
- xaxis=dict(title='Date', tickangle=-30),
146
- yaxis=dict(title=target_column),
 
147
  )
148
-
149
  fig.add_annotation(
150
- text=f"MAPE: {mape * 100:0.1f}%, Adjr2: {adjr2 * 100:.1f}%",
151
- xref="paper",
152
- yref="paper",
153
- x=0.95,
154
- y=1.2,
155
- showarrow=False,
156
  )
157
-
158
- return metrics_table, line_values, fig
 
159
 
160
  def plot_residual_predicted(actual, predicted, df):
161
  df_=df.copy()
 
86
  def to_percentage(value):
87
  return f'{value * 100:.1f}%'
88
 
89
+ def plot_actual_vs_predicted(date, y, predicted_values, model,target_column=None, flag=None, repeat_all_years=False, is_panel=False):
90
+ if flag is not None :
91
+ fig = make_subplots(specs=[[{"secondary_y": True}]])
92
+ else :
93
+ fig = go.Figure()
94
+
95
+ if is_panel :
96
+ df=pd.DataFrame()
97
+ df['date'] = date
98
+ df['Actual'] = y
99
+ df['Predicted'] = predicted_values
100
+ df_agg = df.groupby('date').agg({'Actual':'sum', 'Predicted':'sum'}).reset_index()
101
+ df_agg.columns = ['date', 'Actual', 'Predicted']
102
+ assert len(df_agg) == pd.Series(date).nunique()
103
+ # date = df_agg['date']
104
+ # y = df_agg['Actual']
105
+ # predicted_values = df_agg['Predicted']
106
+ # ymax = df_agg['Actual'].max() # Sprint3 - ymax to set y value for flag
107
+
108
+ fig.add_trace(go.Scatter(x=df_agg['date'], y=df_agg['Actual'], mode='lines', name='Actual', line=dict(color='#08083B')))
109
+ fig.add_trace(go.Scatter(x=df_agg['date'], y=df_agg['Predicted'], mode='lines', name='Predicted', line=dict(color='#11B6BD')))
110
+
111
+ else :
112
+ fig.add_trace(go.Scatter(x=date, y=y, mode='lines', name='Actual', line=dict(color='#08083B')))
113
+ fig.add_trace(go.Scatter(x=date, y=predicted_values, mode='lines', name='Predicted', line=dict(color='#11B6BD')))
114
+
115
+ line_values=[]
116
  if flag:
117
+ min_date, max_date = flag[0], flag[1]
118
+ min_week = datetime.strptime(str(min_date), "%Y-%m-%d").strftime("%U")
119
+ max_week = datetime.strptime(str(max_date), "%Y-%m-%d").strftime("%U")
120
+ month=pd.to_datetime(min_date).month
121
+ day=pd.to_datetime(min_date).day
122
+ #st.write(pd.to_datetime(min_date).week)
123
+ #st.write(min_week)
124
+ # Initialize an empty list to store line values
125
+
126
+ # Sprint3 change : put flags to secondary axis, & made their y value to 1 instead of 5M
127
+ if repeat_all_years:
128
+ #line_values=list(pd.to_datetime((pd.Series(date)).dt.week).map(lambda x: 10000 if x==min_week else 0 ))
129
+ #st.write(pd.Series(date).map(lambda x: pd.Timestamp(x).week))
130
+ line_values=list(pd.Series(date).map(lambda x: 1 if (pd.Timestamp(x).week >=int(min_week)) & (pd.Timestamp(x).week <=int(max_week)) else 0))
131
+ assert len(line_values) == len(date)
132
+ #st.write(line_values)
133
+ fig.add_trace(go.Scatter(x=date, y=line_values, mode='lines', name='Flag', line=dict(color='#FF5733')),secondary_y=True)
134
+ else:
135
+ line_values = []
136
+
137
+ line_values = list(pd.Series(date).map(lambda x: 1 if (pd.Timestamp(x) >= pd.Timestamp(min_date)) and (pd.Timestamp(x) <= pd.Timestamp(max_date)) else 0))
138
+
139
+ #st.write(line_values)
140
+ fig.add_trace(go.Scatter(x=date, y=line_values, mode='lines', name='Flag', line=dict(color='#FF5733')),secondary_y=True)
141
+
142
+
143
+ # Calculate MAPE
144
  mape = mean_absolute_percentage_error(y, predicted_values)
145
+
146
+ # Calculate AdjR2 # Assuming X is your feature matrix
147
  r2 = r2_score(y, predicted_values)
148
  adjr2 = 1 - (1 - r2) * (len(y) - 1) / (len(y) - len(model.fe_params) - 1)
149
 
150
+ # Create a table to display the metrics
151
  metrics_table = pd.DataFrame({
152
+ 'Metric': ['MAPE', 'R-squared', 'AdjR-squared'],
153
+ 'Value': [mape, r2, adjr2]
154
  })
155
+ # st.write(metrics_table)
 
 
 
 
 
 
 
156
  fig.update_layout(
157
+ xaxis=dict(title='Date'),
158
+ yaxis=dict(title=target_column),
159
+ xaxis_tickangle=-30
160
  )
 
161
  fig.add_annotation(
162
+ text=f"MAPE: {mape*100:0.1f}%, Adjr2: {adjr2 *100:.1f}%",
163
+ xref="paper",
164
+ yref="paper",
165
+ x=0.95, # Adjust these values to position the annotation
166
+ y=1.2,
167
+ showarrow=False,
168
  )
169
+ # print("{}{}"*20, len(line_values))
170
+ #metrics_table.set_index(['Metric'],inplace=True)
171
+ return metrics_table,line_values, fig
172
 
173
  def plot_residual_predicted(actual, predicted, df):
174
  df_=df.copy()
Model/model_0.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e25f247a6804043e242b2a688b9b5ca840bce3da95bfd52863f33cd1a83ce2e2
3
+ size 3160085
Model/model_1.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8179ad18c0894ab80fc5bc7daf85da4c29a0d79989a04fdfb3fe448bae00c582
3
+ size 3160085
Model/model_2.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc881f53a6a3dbca759c116f200606d946a48a1342dbabf75c84802df9cacd0d
3
+ size 3160100
Model/model_3.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e70ce7759767772d382d41e509022338fb35efc361367d488d876494ff0a915e
3
+ size 3160100
Model/model_4.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4227165221399f4430d82db0afdb68af986789d38efee3cabbba07db2b286759
3
+ size 3160079
Overview_data_test_panel@#app_installs.xlsx ADDED
Binary file (28.1 kB).
 
Overview_data_test_panel@#revenue.xlsx ADDED
Binary file (28.1 kB).
 
Overview_data_test_panelreplace_meapp_installs.xlsx ADDED
Binary file (28.1 kB).
 
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: indigo
5
  colorTo: pink
6
  sdk: streamlit
7
  sdk_version: 1.32.1
8
- app_file: Data_Import.py
9
  pinned: false
10
  ---
11
 
 
5
  colorTo: pink
6
  sdk: streamlit
7
  sdk_version: 1.32.1
8
+ app_file: app.py
9
  pinned: false
10
  ---
11
 
Test/merged_df_contri.csv ADDED
The diff for this file is too large to render.
 
Test/output_df.csv ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Date,ga_app,kwai,fb_level_achieved_tier_1,fb_level_achieved_tier_2,paid_search,programmatic,digital_tactic_others,const
2
+ 2023-08-28,-0.35397136801899115,66.71033836253882,174.81135610042415,3594.2897643195533,150.11933145463811,297.6127449335578,3.5410008622400904,17088.40000435223
3
+ 2023-09-04,-0.27103315177908194,72.74975974406102,194.2113426820265,3860.2811580984967,158.86380529049313,335.6495710921645,3.925829952759959,18800.68227530415
4
+ 2023-09-11,-0.18972320624535735,73.56577516680593,203.22388059872446,3779.8710519990336,154.802475164174,311.6912405544196,4.084632969053109,18923.5133215299
5
+ 2023-09-18,-0.13280624437175015,72.56271222233786,200.12443813391138,3681.8929038825913,155.7098287205689,311.05112450305245,4.051292175357099,19046.344367755646
6
+ 2023-09-25,-0.09296437106022509,73.70727325917034,205.74779198138953,3858.65253006403,155.02816278138727,312.86845990465827,3.9265582664040735,19169.175413981393
7
+ 2023-10-02,-0.06507505974215756,72.34256892327214,205.58713073299748,3726.8536377627233,159.8242700571235,315.5994755570924,3.955202754813552,19292.006460207143
8
+ 2023-10-09,-0.045552541819510295,74.38740927114137,207.0439308259877,3845.7054965140105,166.1387318784968,318.3770263805087,3.9670100767811185,19414.83750643289
9
+ 2023-10-16,-0.031886779273657205,73.92804257031634,209.0350517896794,3749.259107713571,158.5179131618084,308.27664915352324,3.935545074442725,19537.66855265864
10
+ 2023-10-23,-0.02232074549156004,74.2265721786869,214.96921278574305,3766.838626589657,155.11867956784573,298.7838125908522,3.8717920437881834,19660.499598884388
11
+ 2023-10-30,-0.015624521844092026,73.13776666139266,215.11994117361186,3861.8716038759217,150.99199274844668,305.8173177680258,3.8593412414854895,19783.330645110138
12
+ 2023-11-06,-0.010937165290864418,73.92209125196376,208.19044332496705,3939.163063071122,155.63698971642444,320.41327017703395,3.844088730158042,19906.161691335885
13
+ 2023-11-13,-0.007656015703605092,75.65843124761166,208.86440994169482,3793.1062744683286,156.5242431409553,320.3204189984107,4.021312960163909,20028.99273756163
14
+ 2023-11-20,-0.005359210992523565,73.88051276100926,218.40774072300528,3684.900260569517,163.258344706366,322.7402649826382,4.0473156754345965,20151.823783787382
15
+ 2023-11-27,-0.0037514476947664945,72.1846283175467,213.20545855013495,3856.792298375503,167.13396999671053,332.60329700992924,3.949159871187085,20274.65483001313
16
+ 2023-12-04,-0.002626013386336546,72.23564873518644,203.08444230779233,3848.078121929866,167.24638929455608,325.2003051931162,3.9989148636147225,20397.485876238876
Test/scenario_test_df.csv ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ other_contributions,correction,sales
2
+ 17088.04603298421,-215.4682810582599,4502.552817091212
3
+ 18800.41124215237,74.62945753481836,4551.0520093251835
4
+ 18923.323598323655,-24.472395662971394,4551.711452115181
5
+ 19046.211561511274,-125.71083540064501,4551.1031350384665
6
+ 19169.082449610334,59.723662814169074,4550.207113442869
7
+ 19291.941385147402,-62.72601966545335,4546.888305453476
8
+ 19414.791953891072,67.80597281407609,4547.8136321328475
9
+ 19537.636665879367,-49.327276753389015,4552.279586216728
10
+ 19660.477278138897,-34.96735624499706,4548.776052001568
11
+ 19783.315020588292,63.3505618488889,4547.4474016199965
12
+ 19906.150754170594,157.53118273497603,4543.63876353669
13
+ 20028.98508154593,8.48155599979873,4550.013534757365
14
+ 20151.81842457639,-76.79487376436737,4544.029313182335
15
+ 20274.651078565435,90.96984069810424,4554.898971422908
16
+ 20397.48325022549,65.02213269566346,4554.821689628467
Test/x_test_contribution.csv ADDED
The diff for this file is too large to render.
 
Test/x_test_to_save.csv CHANGED
The diff for this file is too large to render.
 
Test/x_train_contribution.csv ADDED
The diff for this file is too large to render.
 
Test/x_train_to_save.csv CHANGED
The diff for this file is too large to render.
 
best_models.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dff906988316f9b0e935e828d967162a5f23b402d69a1de1fcd884225cd6a349
3
- size 3755214
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63e3de089d3f2a199a396228c6c0cf7f5db60c36fe3b7a6fb5cf3e74a92ae304
3
+ size 4095026
classes.py CHANGED
@@ -16,21 +16,15 @@ def class_to_dict(class_instance):
16
  attr_dict["modified_spends"] = class_instance.modified_spends
17
  attr_dict["modified_sales"] = class_instance.modified_sales
18
  attr_dict["response_curve_type"] = class_instance.response_curve_type
19
- attr_dict["response_curve_params"] = (
20
- class_instance.response_curve_params
21
- )
22
  attr_dict["penalty"] = class_instance.penalty
23
  attr_dict["bounds"] = class_instance.bounds
24
  attr_dict["actual_total_spends"] = class_instance.actual_total_spends
25
  attr_dict["actual_total_sales"] = class_instance.actual_total_sales
26
- attr_dict["modified_total_spends"] = (
27
- class_instance.modified_total_spends
28
- )
29
  attr_dict["modified_total_sales"] = class_instance.modified_total_sales
30
  attr_dict["actual_mroi"] = class_instance.get_marginal_roi("actual")
31
- attr_dict["modified_mroi"] = class_instance.get_marginal_roi(
32
- "modified"
33
- )
34
 
35
  elif isinstance(class_instance, Scenario):
36
  attr_dict["type"] = "Scenario"
@@ -43,9 +37,7 @@ def class_to_dict(class_instance):
43
  attr_dict["correction"] = class_instance.correction
44
  attr_dict["actual_total_spends"] = class_instance.actual_total_spends
45
  attr_dict["actual_total_sales"] = class_instance.actual_total_sales
46
- attr_dict["modified_total_spends"] = (
47
- class_instance.modified_total_spends
48
- )
49
  attr_dict["modified_total_sales"] = class_instance.modified_total_sales
50
 
51
  return attr_dict
@@ -95,9 +87,7 @@ class Channel:
95
  self.modified_sales = self.calculate_sales()
96
  self.modified_total_spends = self.modified_spends.sum()
97
  self.modified_total_sales = self.modified_sales.sum()
98
- self.delta_spends = (
99
- self.modified_total_spends - self.actual_total_spends
100
- )
101
  self.delta_sales = self.modified_total_sales - self.actual_total_sales
102
 
103
  def update_penalty(self, penalty):
@@ -119,8 +109,7 @@ class Channel:
119
  x = np.where(
120
  x < self.upper_limit,
121
  x,
122
- self.upper_limit
123
- + (x - self.upper_limit) * self.upper_limit / x,
124
  )
125
  if self.response_curve_type == "s-curve":
126
  if self.power >= 0:
@@ -169,9 +158,7 @@ class Channel:
169
  self.modified_sales = self.calculate_sales()
170
  self.modified_total_spends = self.modified_spends.sum()
171
  self.modified_total_sales = self.modified_sales.sum()
172
- self.delta_spends = (
173
- self.modified_total_spends - self.actual_total_spends
174
- )
175
  self.delta_sales = self.modified_total_sales - self.actual_total_sales
176
 
177
  def intialize(self):
@@ -208,9 +195,7 @@ class Scenario:
208
  self.actual_total_sales = self.calculate_actual_total_sales()
209
  self.modified_total_sales = self.calculate_modified_total_sales()
210
  self.modified_total_spends = self.calculate_modified_total_spends()
211
- self.delta_spends = (
212
- self.modified_total_spends - self.actual_total_spends
213
- )
214
  self.delta_sales = self.modified_total_sales - self.actual_total_sales
215
 
216
  def update_penalty(self, value):
@@ -220,9 +205,7 @@ class Scenario:
220
  def calculate_modified_total_spends(self):
221
  total_actual_spends = 0.0
222
  for channel in self.channels.values():
223
- total_actual_spends += (
224
- channel.actual_total_spends * channel.conversion_rate
225
- )
226
  return total_actual_spends
227
 
228
  def calculate_modified_total_spends(self):
@@ -251,12 +234,47 @@ class Scenario:
251
  self.channels[channel_name].update(modified_spends)
252
  self.modified_total_sales = self.calculate_modified_total_sales()
253
  self.modified_total_spends = self.calculate_modified_total_spends()
254
- self.delta_spends = (
255
- self.modified_total_spends - self.actual_total_spends
256
- )
257
  self.delta_sales = self.modified_total_sales - self.actual_total_sales
258
 
259
- def optimize_spends(self, sales_percent, channels_list, algo="COBYLA"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
  desired_sales = self.actual_total_sales * (1 + sales_percent / 100.0)
261
 
262
  def constraint(x):
@@ -285,7 +303,7 @@ class Scenario:
285
  x0=initial_point,
286
  constraints=constraints,
287
  method=algo,
288
- options={"maxiter": int(2e7), "catol": 1},
289
  )
290
 
291
  for channel_name, modified_spends in zip(channels_list, res.x):
@@ -317,14 +335,11 @@ class Scenario:
317
  for channel_name in channels_list:
318
  _channel_class = self.channels[channel_name]
319
  channel_bounds = _channel_class.bounds
320
- channel_actual_total_spends = (
321
- _channel_class.actual_total_spends
322
- * ((1 + spends_percent / 100))
323
  )
324
  old_spends.append(channel_actual_total_spends)
325
- bounds.append(
326
- (1 + channel_bounds / 100) * channel_actual_total_spends
327
- )
328
 
329
  def objective_function(x):
330
  for channel_name, modified_spends in zip(channels_list, x):
@@ -332,12 +347,12 @@ class Scenario:
332
  return -1 * self.modified_total_sales
333
 
334
  res = minimize(
335
- lambda x : objective_function(x) / 1e8,
336
  method="trust-constr",
337
  x0=old_spends,
338
  constraints=constraint,
339
  bounds=bounds,
340
- options={"maxiter": int(1e7), 'xtol' : 100},
341
  )
342
  # res = dual_annealing(
343
  # objective_function,
@@ -361,81 +376,91 @@ class Scenario:
361
  channel_data = []
362
 
363
  summary_rows = []
364
- actual_list.append({
365
- "name": "Total",
366
- "Spends": self.actual_total_spends,
367
- "Sales": self.actual_total_sales,
368
- })
369
- modified_list.append({
370
- "name": "Total",
371
- "Spends": self.modified_total_spends,
372
- "Sales": self.modified_total_sales,
373
- })
 
 
 
 
374
  for channel in self.channels.values():
375
  name_mod = channel.name.replace("_", " ")
376
  if name_mod.lower().endswith(" imp"):
377
  name_mod = name_mod.replace("Imp", " Impressions")
378
- summary_rows.append([
379
- name_mod,
380
- channel.actual_total_spends,
381
- channel.modified_total_spends,
382
- channel.actual_total_sales,
383
- channel.modified_total_sales,
384
- round(
385
- channel.actual_total_sales / channel.actual_total_spends, 2
386
- ),
387
- round(
388
- channel.modified_total_sales
389
- / channel.modified_total_spends,
390
- 2,
391
- ),
392
- channel.get_marginal_roi("actual"),
393
- channel.get_marginal_roi("modified"),
394
- ])
395
  data[channel.name] = channel.modified_spends
396
  data["Date"] = channel.dates
397
  data["Sales"] = (
398
  data.get("Sales", np.zeros((len(channel.dates),)))
399
  + channel.modified_sales
400
  )
401
- actual_list.append({
402
- "name": channel.name,
403
- "Spends": channel.actual_total_spends,
404
- "Sales": channel.actual_total_sales,
405
- "ROI": round(
406
- channel.actual_total_sales / channel.actual_total_spends, 2
407
- ),
408
- })
409
- modified_list.append({
410
- "name": channel.name,
411
- "Spends": channel.modified_total_spends,
412
- "Sales": channel.modified_total_sales,
413
- "ROI": round(
414
- channel.modified_total_sales
415
- / channel.modified_total_spends,
416
- 2,
417
- ),
418
- "Marginal ROI": channel.get_marginal_roi("modified"),
419
- })
420
-
421
- channel_data.append({
422
- "channel": channel.name,
423
- "spends_act": channel.actual_total_spends,
424
- "spends_mod": channel.modified_total_spends,
425
- "sales_act": channel.actual_total_sales,
426
- "sales_mod": channel.modified_total_sales,
427
- })
428
- summary_rows.append([
429
- "Total",
430
- self.actual_total_spends,
431
- self.modified_total_spends,
432
- self.actual_total_sales,
433
- self.modified_total_sales,
434
- round(self.actual_total_sales / self.actual_total_spends, 2),
435
- round(self.modified_total_sales / self.modified_total_spends, 2),
436
- 0.0,
437
- 0.0,
438
- ])
 
 
 
 
 
 
 
439
  details["Actual"] = actual_list
440
  details["Modified"] = modified_list
441
  columns_index = pd.MultiIndex.from_product(
@@ -467,8 +492,7 @@ class Scenario:
467
  def from_dict(cls, attr_dict):
468
  channels_list = attr_dict["channels"]
469
  channels = {
470
- channel["name"]: class_from_dict(channel)
471
- for channel in channels_list
472
  }
473
  return Scenario(
474
  name=attr_dict["name"],
 
16
  attr_dict["modified_spends"] = class_instance.modified_spends
17
  attr_dict["modified_sales"] = class_instance.modified_sales
18
  attr_dict["response_curve_type"] = class_instance.response_curve_type
19
+ attr_dict["response_curve_params"] = class_instance.response_curve_params
 
 
20
  attr_dict["penalty"] = class_instance.penalty
21
  attr_dict["bounds"] = class_instance.bounds
22
  attr_dict["actual_total_spends"] = class_instance.actual_total_spends
23
  attr_dict["actual_total_sales"] = class_instance.actual_total_sales
24
+ attr_dict["modified_total_spends"] = class_instance.modified_total_spends
 
 
25
  attr_dict["modified_total_sales"] = class_instance.modified_total_sales
26
  attr_dict["actual_mroi"] = class_instance.get_marginal_roi("actual")
27
+ attr_dict["modified_mroi"] = class_instance.get_marginal_roi("modified")
 
 
28
 
29
  elif isinstance(class_instance, Scenario):
30
  attr_dict["type"] = "Scenario"
 
37
  attr_dict["correction"] = class_instance.correction
38
  attr_dict["actual_total_spends"] = class_instance.actual_total_spends
39
  attr_dict["actual_total_sales"] = class_instance.actual_total_sales
40
+ attr_dict["modified_total_spends"] = class_instance.modified_total_spends
 
 
41
  attr_dict["modified_total_sales"] = class_instance.modified_total_sales
42
 
43
  return attr_dict
 
87
  self.modified_sales = self.calculate_sales()
88
  self.modified_total_spends = self.modified_spends.sum()
89
  self.modified_total_sales = self.modified_sales.sum()
90
+ self.delta_spends = self.modified_total_spends - self.actual_total_spends
 
 
91
  self.delta_sales = self.modified_total_sales - self.actual_total_sales
92
 
93
  def update_penalty(self, penalty):
 
109
  x = np.where(
110
  x < self.upper_limit,
111
  x,
112
+ self.upper_limit + (x - self.upper_limit) * self.upper_limit / x,
 
113
  )
114
  if self.response_curve_type == "s-curve":
115
  if self.power >= 0:
 
158
  self.modified_sales = self.calculate_sales()
159
  self.modified_total_spends = self.modified_spends.sum()
160
  self.modified_total_sales = self.modified_sales.sum()
161
+ self.delta_spends = self.modified_total_spends - self.actual_total_spends
 
 
162
  self.delta_sales = self.modified_total_sales - self.actual_total_sales
163
 
164
  def intialize(self):
 
195
  self.actual_total_sales = self.calculate_actual_total_sales()
196
  self.modified_total_sales = self.calculate_modified_total_sales()
197
  self.modified_total_spends = self.calculate_modified_total_spends()
198
+ self.delta_spends = self.modified_total_spends - self.actual_total_spends
 
 
199
  self.delta_sales = self.modified_total_sales - self.actual_total_sales
200
 
201
  def update_penalty(self, value):
 
205
  def calculate_modified_total_spends(self):
206
  total_actual_spends = 0.0
207
  for channel in self.channels.values():
208
+ total_actual_spends += channel.actual_total_spends * channel.conversion_rate
 
 
209
  return total_actual_spends
210
 
211
  def calculate_modified_total_spends(self):
 
234
  self.channels[channel_name].update(modified_spends)
235
  self.modified_total_sales = self.calculate_modified_total_sales()
236
  self.modified_total_spends = self.calculate_modified_total_spends()
237
+ self.delta_spends = self.modified_total_spends - self.actual_total_spends
 
 
238
  self.delta_sales = self.modified_total_sales - self.actual_total_sales
239
 
240
+ # def optimize_spends(self, sales_percent, channels_list, algo="COBYLA"):
241
+ # desired_sales = self.actual_total_sales * (1 + sales_percent / 100.0)
242
+
243
+ # def constraint(x):
244
+ # for ch, spends in zip(channels_list, x):
245
+ # self.update(ch, spends)
246
+ # return self.modified_total_sales - desired_sales
247
+
248
+ # bounds = []
249
+ # for ch in channels_list:
250
+ # bounds.append(
251
+ # (1 + np.array([-50.0, 100.0]) / 100.0)
252
+ # * self.channels[ch].actual_total_spends
253
+ # )
254
+
255
+ # initial_point = []
256
+ # for bound in bounds:
257
+ # initial_point.append(bound[0])
258
+
259
+ # power = np.ceil(np.log(sum(initial_point)) / np.log(10))
260
+
261
+ # constraints = [NonlinearConstraint(constraint, -1.0, 1.0)]
262
+
263
+ # res = minimize(
264
+ # lambda x: sum(x) / 10 ** (power),
265
+ # bounds=bounds,
266
+ # x0=initial_point,
267
+ # constraints=constraints,
268
+ # method=algo,
269
+ # options={"maxiter": int(2e7), "catol": 1},
270
+ # )
271
+
272
+ # for channel_name, modified_spends in zip(channels_list, res.x):
273
+ # self.update(channel_name, modified_spends)
274
+
275
+ # return zip(channels_list, res.x)
276
+
277
+ def optimize_spends(self, sales_percent, channels_list, algo="trust-constr"):
278
  desired_sales = self.actual_total_sales * (1 + sales_percent / 100.0)
279
 
280
  def constraint(x):
 
303
  x0=initial_point,
304
  constraints=constraints,
305
  method=algo,
306
+ options={"maxiter": int(2e7), "xtol": 100},
307
  )
308
 
309
  for channel_name, modified_spends in zip(channels_list, res.x):
 
335
  for channel_name in channels_list:
336
  _channel_class = self.channels[channel_name]
337
  channel_bounds = _channel_class.bounds
338
+ channel_actual_total_spends = _channel_class.actual_total_spends * (
339
+ (1 + spends_percent / 100)
 
340
  )
341
  old_spends.append(channel_actual_total_spends)
342
+ bounds.append((1 + channel_bounds / 100) * channel_actual_total_spends)
 
 
343
 
344
  def objective_function(x):
345
  for channel_name, modified_spends in zip(channels_list, x):
 
347
  return -1 * self.modified_total_sales
348
 
349
  res = minimize(
350
+ lambda x: objective_function(x) / 1e8,
351
  method="trust-constr",
352
  x0=old_spends,
353
  constraints=constraint,
354
  bounds=bounds,
355
+ options={"maxiter": int(1e7), "xtol": 100},
356
  )
357
  # res = dual_annealing(
358
  # objective_function,
 
376
  channel_data = []
377
 
378
  summary_rows = []
379
+ actual_list.append(
380
+ {
381
+ "name": "Total",
382
+ "Spends": self.actual_total_spends,
383
+ "Sales": self.actual_total_sales,
384
+ }
385
+ )
386
+ modified_list.append(
387
+ {
388
+ "name": "Total",
389
+ "Spends": self.modified_total_spends,
390
+ "Sales": self.modified_total_sales,
391
+ }
392
+ )
393
  for channel in self.channels.values():
394
  name_mod = channel.name.replace("_", " ")
395
  if name_mod.lower().endswith(" imp"):
396
  name_mod = name_mod.replace("Imp", " Impressions")
397
+ summary_rows.append(
398
+ [
399
+ name_mod,
400
+ channel.actual_total_spends,
401
+ channel.modified_total_spends,
402
+ channel.actual_total_sales,
403
+ channel.modified_total_sales,
404
+ round(channel.actual_total_sales / channel.actual_total_spends, 2),
405
+ round(
406
+ channel.modified_total_sales / channel.modified_total_spends,
407
+ 2,
408
+ ),
409
+ channel.get_marginal_roi("actual"),
410
+ channel.get_marginal_roi("modified"),
411
+ ]
412
+ )
 
413
  data[channel.name] = channel.modified_spends
414
  data["Date"] = channel.dates
415
  data["Sales"] = (
416
  data.get("Sales", np.zeros((len(channel.dates),)))
417
  + channel.modified_sales
418
  )
419
+ actual_list.append(
420
+ {
421
+ "name": channel.name,
422
+ "Spends": channel.actual_total_spends,
423
+ "Sales": channel.actual_total_sales,
424
+ "ROI": round(
425
+ channel.actual_total_sales / channel.actual_total_spends, 2
426
+ ),
427
+ }
428
+ )
429
+ modified_list.append(
430
+ {
431
+ "name": channel.name,
432
+ "Spends": channel.modified_total_spends,
433
+ "Sales": channel.modified_total_sales,
434
+ "ROI": round(
435
+ channel.modified_total_sales / channel.modified_total_spends,
436
+ 2,
437
+ ),
438
+ "Marginal ROI": channel.get_marginal_roi("modified"),
439
+ }
440
+ )
441
+
442
+ channel_data.append(
443
+ {
444
+ "channel": channel.name,
445
+ "spends_act": channel.actual_total_spends,
446
+ "spends_mod": channel.modified_total_spends,
447
+ "sales_act": channel.actual_total_sales,
448
+ "sales_mod": channel.modified_total_sales,
449
+ }
450
+ )
451
+ summary_rows.append(
452
+ [
453
+ "Total",
454
+ self.actual_total_spends,
455
+ self.modified_total_spends,
456
+ self.actual_total_sales,
457
+ self.modified_total_sales,
458
+ round(self.actual_total_sales / self.actual_total_spends, 2),
459
+ round(self.modified_total_sales / self.modified_total_spends, 2),
460
+ 0.0,
461
+ 0.0,
462
+ ]
463
+ )
464
  details["Actual"] = actual_list
465
  details["Modified"] = modified_list
466
  columns_index = pd.MultiIndex.from_product(
 
492
  def from_dict(cls, attr_dict):
493
  channels_list = attr_dict["channels"]
494
  channels = {
495
+ channel["name"]: class_from_dict(channel) for channel in channels_list
 
496
  }
497
  return Scenario(
498
  name=attr_dict["name"],
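The optimize_spends rewrite above swaps the COBYLA call for scipy's trust-constr with a scaled objective and a loose xtol. A self-contained sketch of the same pattern on a toy response curve (the curve, budgets and bounds here are illustrative, not the repo's channel model):

import numpy as np
from scipy.optimize import LinearConstraint, minimize

def total_sales(spends):
    # Toy diminishing-returns response curve per channel.
    return float(np.sum(3.0 * np.sqrt(spends)))

budget = LinearConstraint(np.ones(3), lb=300.0, ub=300.0)   # fixed total budget
res = minimize(
    lambda x: -total_sales(x) / 1e2,                        # maximise sales, scaled down
    x0=np.array([100.0, 100.0, 100.0]),
    method="trust-constr",
    bounds=[(50.0, 200.0)] * 3,
    constraints=[budget],
    options={"maxiter": 10_000, "xtol": 1e-6},
)
optimised_spends = res.x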
data_import.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d759e0caf40a5cf6ddfe5c391289fa964363652dba2ffe919fa1ab7c6b4399ec
3
+ size 2246178
data_test_overview_panel_#total_approved_accounts_revenue.xlsx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:763047805d36dca3502a6ed9c6dcee9a0c99c945ee92bb61a7c0f6647486a96c
3
+ size 1637428
final_df_transformed.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d775eda5ee0172e1511622b69b301023cdf2c5dbe74bb62d79264fe926eee1b
3
+ size 19479046
metrics_level_data/Overview_data_test_panel@#app_installs.xlsx ADDED
Binary file (28.1 kB).
 
metrics_level_data/Overview_data_test_panel@#revenue.xlsx ADDED
Binary file (28.1 kB).
 
model_output.csv CHANGED
@@ -1,11 +1,6 @@
1
- ,Model_object,Model_iteration,Feature_set,MAPE,R2,ADJR2
2
- 0,Model/model_0.pkl,0,"['paid_search_clicks', 'kwai_clicks', 'fb_level_achieved_tier_2_clicks_lag_2', 'fb_level_achieved_tier_1_impressions', 'ga_app_clicks', 'digital_tactic_others_clicks_lag_2', 'programmatic_clicks']",0.2101108376942587,0.8443530956877969,0.8442167683191552
3
- 1,Model/model_1.pkl,1,"['paid_search_clicks', 'kwai_clicks', 'fb_level_achieved_tier_2_clicks_lag_2', 'fb_level_achieved_tier_1_impressions', 'ga_app_clicks', 'digital_tactic_others_clicks_lag_2', 'programmatic_clicks_lag_3']",0.21209032951119616,0.8459839652330053,0.8458490663036549
4
- 2,Model/model_2.pkl,2,"['paid_search_clicks', 'kwai_clicks', 'fb_level_achieved_tier_2_clicks_lag_2', 'fb_level_achieved_tier_1_impressions', 'ga_app_clicks', 'digital_tactic_others_clicks_lag_2', 'programmatic_impressions']",0.21016185105024765,0.8443545867054447,0.8442182606427493
5
- 3,Model/model_3.pkl,3,"['paid_search_clicks', 'kwai_clicks', 'fb_level_achieved_tier_2_clicks_lag_2', 'fb_level_achieved_tier_1_impressions', 'ga_app_clicks', 'digital_tactic_others_clicks_lag_2', 'programmatic_impressions_lag_3']",0.21224939270932452,0.8462289218635773,0.8460942374858302
6
- 4,Model/model_4.pkl,4,"['paid_search_clicks', 'kwai_clicks', 'fb_level_achieved_tier_2_clicks_lag_2', 'fb_level_achieved_tier_1_impressions', 'ga_app_clicks', 'digital_tactic_others_clicks', 'programmatic_clicks']",0.21018683127739526,0.8421437296960563,0.8420054671970414
7
- 5,Model/model_5.pkl,5,"['paid_search_clicks', 'kwai_clicks', 'fb_level_achieved_tier_2_clicks_lag_2', 'fb_level_achieved_tier_1_impressions', 'ga_app_clicks', 'digital_tactic_others_clicks', 'programmatic_clicks_lag_3']",0.21221059311555665,0.8436849097221487,0.843547997105539
8
- 6,Model/model_6.pkl,6,"['paid_search_clicks', 'kwai_clicks', 'fb_level_achieved_tier_2_clicks_lag_2', 'fb_level_achieved_tier_1_impressions', 'ga_app_clicks', 'digital_tactic_others_clicks', 'programmatic_impressions']",0.21023311688137142,0.8421414101917525,0.8420031456611397
9
- 7,Model/model_7.pkl,7,"['paid_search_clicks', 'kwai_clicks', 'fb_level_achieved_tier_2_clicks_lag_2', 'fb_level_achieved_tier_1_impressions', 'ga_app_clicks', 'digital_tactic_others_clicks', 'programmatic_impressions_lag_3']",0.21230002407340917,0.8438639613954715,0.843727205605903
10
- 8,Model/model_8.pkl,8,"['paid_search_clicks', 'kwai_clicks', 'fb_level_achieved_tier_2_clicks_lag_2', 'fb_level_achieved_tier_1_impressions', 'ga_app_clicks', 'digital_tactic_others_impressions_lag_2', 'programmatic_clicks']",0.21138525009178905,0.8446253227642725,0.8444892338327598
11
- 9,Model/model_9.pkl,9,"['paid_search_clicks', 'kwai_clicks', 'fb_level_achieved_tier_2_clicks_lag_2', 'fb_level_achieved_tier_1_impressions', 'ga_app_clicks', 'digital_tactic_others_impressions_lag_2', 'programmatic_clicks_lag_3']",0.2123701406564611,0.8464957579981922,0.8463613073357782
 
1
+ ,Model_object,Model_iteration,Feature_set,MAPE,R2,ADJR2,pos_count
2
+ 0,Model/model_0.pkl,0,"['paid_search_clicks_adstock_0_7_lag_1', 'kwai_clicks_adstock_0_7_lag_2', 'fb_level_achieved_tier_2_clicks_adstock_0_7_lag_2', 'fb_level_achieved_tier_1_impressions_adstock_0_7_lag_2', 'ga_app_clicks_adstock_0_7_lag_2', 'digital_tactic_others_clicks_adstock_0_7_lag_2', 'programmatic_clicks_adstock_0_7_lag_2']",0.217990735975396,0.8737098317237447,0.8735992172119913,8
3
+ 1,Model/model_1.pkl,1,"['paid_search_clicks_adstock_0_7_lag_1', 'kwai_clicks_adstock_0_7_lag_2', 'fb_level_achieved_tier_2_clicks_adstock_0_7_lag_2', 'fb_level_achieved_tier_1_impressions_adstock_0_7_lag_2', 'ga_app_clicks_adstock_0_7_lag_2', 'digital_tactic_others_clicks_adstock_0_7_lag_2', 'programmatic_clicks_adstock_0_7_lag_1']",0.2179731139181846,0.873704484501189,0.8735938653059323,8
4
+ 2,Model/model_2.pkl,2,"['paid_search_clicks_adstock_0_7_lag_1', 'kwai_clicks_adstock_0_7_lag_2', 'fb_level_achieved_tier_2_clicks_adstock_0_7_lag_2', 'fb_level_achieved_tier_1_impressions_adstock_0_7_lag_2', 'ga_app_clicks_adstock_0_7_lag_2', 'digital_tactic_others_clicks_adstock_0_7_lag_2', 'programmatic_impressions_adstock_0_7_lag_2']",0.22282859947602898,0.8741134168513375,0.8740031558300612,7
5
+ 3,Model/model_3.pkl,3,"['paid_search_clicks_adstock_0_7_lag_1', 'kwai_clicks_adstock_0_7_lag_2', 'fb_level_achieved_tier_2_clicks_adstock_0_7_lag_2', 'fb_level_achieved_tier_1_impressions_adstock_0_7_lag_2', 'ga_app_clicks_adstock_0_7_lag_2', 'digital_tactic_others_clicks_adstock_0_7_lag_2', 'programmatic_impressions_adstock_0_7_lag_1']",0.22288787053617995,0.8740146663445868,0.8739043188301239,8
6
+ 4,Model/model_4.pkl,4,"['paid_search_clicks_adstock_0_7_lag_1', 'kwai_clicks_adstock_0_7_lag_2', 'fb_level_achieved_tier_2_clicks_adstock_0_7_lag_2', 'fb_level_achieved_tier_1_impressions_adstock_0_7_lag_2', 'ga_app_clicks_adstock_0_7_lag_2', 'digital_tactic_others_clicks_adstock_0_7_lag_2', 'programmatic_cost_adstock_0_7_lag_2']",0.21714189338473494,0.8736897844153089,0.8735791523446015,8
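
Note on the metric columns above: MAPE, R2 and ADJR2 are standard regression diagnostics recorded per model iteration. A minimal sketch of how such values can be computed from a fitted model's predictions (the y_true/y_pred arrays and the feature count k below are invented placeholders, not values from this repository):

import numpy as np
from sklearn.metrics import mean_absolute_percentage_error, r2_score

# Placeholder actuals and predictions for one model iteration
y_true = np.array([120.0, 150.0, 90.0, 200.0, 170.0, 130.0, 110.0, 95.0, 180.0, 160.0])
y_pred = np.array([110.0, 160.0, 100.0, 190.0, 165.0, 140.0, 105.0, 90.0, 175.0, 150.0])
k = 7  # number of predictors in the feature set, e.g. the seven media columns listed above

mape = mean_absolute_percentage_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)
n = len(y_true)
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - k - 1)  # penalises R2 for the number of predictors
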
 
 
 
 
 
pages/10_Optimized_Result_Analysis.py CHANGED
@@ -14,15 +14,7 @@ import plotly.express as px
14
  import numpy as np
15
  import plotly.graph_objects as go
16
  import pandas as pd
17
- from plotly.subplots import make_subplots
18
 
19
- def format_number(x):
20
- if x >= 1_000_000:
21
- return f'{x / 1_000_000:.2f}M'
22
- elif x >= 1_000:
23
- return f'{x / 1_000:.2f}K'
24
- else:
25
- return f'{x:.2f}'
26
 
27
  def summary_plot(data, x, y, title, text_column, color, format_as_percent=False, format_as_decimal=False):
28
  fig = px.bar(data, x=x, y=y, orientation='h',
@@ -104,13 +96,11 @@ spends_data=pd.read_excel('Overview_data_test.xlsx')
104
 
105
  with open('summary_df.pkl', 'rb') as file:
106
  summary_df_sorted = pickle.load(file)
107
- #st.write(summary_df_sorted)
108
 
109
  selected_scenario= st.selectbox('Select Saved Scenarios',['S1','S2'])
110
 
111
  st.header('Optimized Spends Overview')
112
  ___columns=st.columns(3)
113
- summary_df_sorted=summary_df_sorted.sort_values(by=['Optimized_spend'],ascending=False)
114
  with ___columns[2]:
115
  fig=summary_plot(summary_df_sorted, x='Delta_percent', y='Channel_name', title='Delta', text_column='Delta_percent',color='Channel_name')
116
  st.plotly_chart(fig,use_container_width=True)
@@ -344,75 +334,31 @@ with st.expander("Return Forecast by Media Channel"):
344
 
345
  summary_df_sorted=summary_df_sorted.merge(effectiveness_df,left_on="Channel_name",right_on='Channel')
346
 
347
- #
348
- summary_df_sorted['Efficiency'] = summary_df_sorted['ResponseMetricValue'] / summary_df_sorted['Optimized_spend']
349
- summary_df_sorted=summary_df_sorted.sort_values(by='Optimized_spend',ascending=True)
350
- #st.dataframe(summary_df_sorted)
351
-
352
- channel_colors = px.colors.qualitative.Plotly
353
-
354
- fig = make_subplots(rows=1, cols=3, subplot_titles=('Optimized Spends', 'Effectiveness', 'Efficiency'), horizontal_spacing=0.05)
355
-
356
- for i, channel in enumerate(summary_df_sorted['Channel_name'].unique()):
357
- channel_df = summary_df_sorted[summary_df_sorted['Channel_name'] == channel]
358
- channel_color = channel_colors[i % len(channel_colors)]
359
-
360
- fig.add_trace(go.Bar(x=channel_df['Optimized_spend'],
361
- y=channel_df['Channel_name'],
362
- text=channel_df['Optimized_spend'].apply(format_number),
363
- marker_color=channel_color,
364
- orientation='h'), row=1, col=1)
365
-
366
- fig.add_trace(go.Bar(x=channel_df['ResponseMetricValue'],
367
- y=channel_df['Channel_name'],
368
- text=channel_df['ResponseMetricValue'].apply(format_number),
369
- marker_color=channel_color,
370
- orientation='h', showlegend=False), row=1, col=2)
371
-
372
- fig.add_trace(go.Bar(x=channel_df['Efficiency'],
373
- y=channel_df['Channel_name'],
374
- text=channel_df['Efficiency'].apply(format_number),
375
- marker_color=channel_color,
376
- orientation='h', showlegend=False), row=1, col=3)
377
-
378
- fig.update_layout(
379
- height=600,
380
- width=900,
381
- title='Media Channel Performance',
382
- showlegend=False
383
- )
384
-
385
- fig.update_yaxes(showticklabels=False ,row=1, col=2 )
386
- fig.update_yaxes(showticklabels=False, row=1, col=3)
387
-
388
- fig.update_xaxes(showticklabels=False, row=1, col=1)
389
- fig.update_xaxes(showticklabels=False, row=1, col=2)
390
- fig.update_xaxes(showticklabels=False, row=1, col=3)
391
-
392
-
393
- st.plotly_chart(fig, use_container_width=True)
394
-
395
 
396
-
397
- # columns= st.columns(3)
398
- # with columns[0]:
399
- # fig=summary_plot(summary_df_sorted, x='Optimized_spend', y='Channel_name', title='', text_column='Optimized_spend',color='Channel_name')
400
- # st.plotly_chart(fig,use_container_width=True)
401
- # with columns[1]:
402
 
403
- # # effectiveness=(selected_metric.groupby(by=['MediaChannelName'])['ResponseMetricValue'].sum()).values
404
- # # effectiveness_df=pd.DataFrame({'Channel':st.session_state['raw_data']['MediaChannelName'].unique(),"ResponseMetricValue":effectiveness})
405
- # # # effectiveness.reset_index(inplace=True)
406
- # # # st.dataframe(effectiveness.head())
407
-
408
-
409
- # fig=summary_plot(summary_df_sorted, x='ResponseMetricValue', y='Channel_name', title='Effectiveness', text_column='ResponseMetricValue',color='Channel_name')
410
- # st.plotly_chart(fig,use_container_width=True)
411
-
412
- # with columns[2]:
413
- # fig=summary_plot(summary_df_sorted, x='Efficiency', y='Channel_name', title='Efficiency', text_column='Efficiency',color='Channel_name',format_as_decimal=True)
414
- # st.plotly_chart(fig,use_container_width=True)
415
-
 
416
 
417
  # Create figure with subplots
418
  # fig = make_subplots(rows=1, cols=2)
 
14
  import numpy as np
15
  import plotly.graph_objects as go
16
  import pandas as pd
 
17
 
 
 
 
 
 
 
 
18
 
19
  def summary_plot(data, x, y, title, text_column, color, format_as_percent=False, format_as_decimal=False):
20
  fig = px.bar(data, x=x, y=y, orientation='h',
 
96
 
97
  with open('summary_df.pkl', 'rb') as file:
98
  summary_df_sorted = pickle.load(file)
 
99
 
100
  selected_scenario= st.selectbox('Select Saved Scenarios',['S1','S2'])
101
 
102
  st.header('Optimized Spends Overview')
103
  ___columns=st.columns(3)
 
104
  with ___columns[2]:
105
  fig=summary_plot(summary_df_sorted, x='Delta_percent', y='Channel_name', title='Delta', text_column='Delta_percent',color='Channel_name')
106
  st.plotly_chart(fig,use_container_width=True)
 
334
 
335
  summary_df_sorted=summary_df_sorted.merge(effectiveness_df,left_on="Channel_name",right_on='Channel')
336
 
337
+ # st.dataframe(summary_df_sorted.head(2))
338
+ summary_df_sorted['Efficiency']=summary_df_sorted['ResponseMetricValue']/summary_df_sorted['Optimized_spend']
339
+ # # # st.dataframe(summary_df_sorted.head(2))
340
+ # st.dataframe(summary_df_sorted.head(2))
 
 
 
 
 
 
 
 
 
 
 
 
 
341
 
342
+ columns= st.columns(3)
343
+ with columns[0]:
344
+ fig=summary_plot(summary_df_sorted, x='Optimized_spend', y='Channel_name', title='', text_column='Optimized_spend',color='Channel_name')
345
+ st.plotly_chart(fig,use_container_width=True)
346
+ with columns[1]:
 
347
 
348
+ # effectiveness=(selected_metric.groupby(by=['MediaChannelName'])['ResponseMetricValue'].sum()).values
349
+ # effectiveness_df=pd.DataFrame({'Channel':st.session_state['raw_data']['MediaChannelName'].unique(),"ResponseMetricValue":effectiveness})
350
+ # # effectiveness.reset_index(inplace=True)
351
+ # # st.dataframe(effectiveness.head())
352
+ fig=summary_plot(summary_df_sorted, x='ResponseMetricValue', y='Channel_name', title='Effectiveness', text_column='ResponseMetricValue',color='Channel_name')
353
+ st.plotly_chart(fig,use_container_width=True)
354
+
355
+ with columns[2]:
356
+ fig=summary_plot(summary_df_sorted, x='Efficiency', y='Channel_name', title='Efficiency', text_column='Efficiency',color='Channel_name',format_as_decimal=True)
357
+ st.plotly_chart(fig,use_container_width=True)
358
+
359
+ import plotly.express as px
360
+ import plotly.graph_objects as go
361
+ from plotly.subplots import make_subplots
362
 
363
  # Create figure with subplots
364
  # fig = make_subplots(rows=1, cols=2)
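
As a side note, the Efficiency column this page derives is simply response per unit of optimized spend, plotted as a horizontal bar per channel. A self-contained sketch of that calculation and chart (channel names and numbers are invented; the page itself reads them from summary_df_sorted):

import pandas as pd
import plotly.express as px

# Hypothetical channel summary; the real page builds this from summary_df_sorted
df = pd.DataFrame({
    "Channel_name": ["Paid Search", "Programmatic", "Kwai"],
    "Optimized_spend": [120_000.0, 80_000.0, 40_000.0],
    "ResponseMetricValue": [600_000.0, 320_000.0, 100_000.0],
})
df["Efficiency"] = df["ResponseMetricValue"] / df["Optimized_spend"]  # response per unit spend

fig = px.bar(df, x="Efficiency", y="Channel_name", orientation="h",
             title="Efficiency", text="Efficiency", color="Channel_name")
fig.update_traces(texttemplate="%{text:.2f}", textposition="outside")
# In the Streamlit page the figure is rendered with st.plotly_chart(fig, use_container_width=True)
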
pages/1_Data_Validation.py CHANGED
@@ -9,7 +9,7 @@ from streamlit_pandas_profiling import st_profile_report
9
  import streamlit as st
10
  import streamlit.components.v1 as components
11
  import sweetviz as sv
12
- from utilities import set_header,initialize_data,load_local_css
13
  from st_aggrid import GridOptionsBuilder,GridUpdateMode
14
  from st_aggrid import GridOptionsBuilder
15
  from st_aggrid import AgGrid
@@ -17,8 +17,7 @@ import base64
17
  import os
18
  import tempfile
19
  from ydata_profiling import ProfileReport
20
-
21
- from streamlit_pandas_profiling import st_profile_report
22
 
23
  st.set_page_config(
24
  page_title="Data Validation",
@@ -31,68 +30,52 @@ set_header()
31
 
32
 
33
 
34
- #preprocessing
35
- # with open('Categorised_data.pkl', 'rb') as file:
36
- # Categorised_data = pickle.load(file)
37
- # with open("edited_dataframe.pkl", 'rb') as file:
38
-
39
-
40
- # df = pickle.load(file)
41
- # date=df.index
42
- # df.reset_index(inplace=True)
43
- # df['date'] = pd.to_datetime(date)
44
-
45
-
46
- #prospects=pd.read_excel('EDA_Data.xlsx',sheet_name='Prospects')
47
- #spends=pd.read_excel('EDA_Data.xlsx',sheet_name='SPEND INPUT')
48
- #spends.columns=['Week','Streaming (Spends)','TV (Spends)','Search (Spends)','Digital (Spends)']
49
- #df=pd.concat([df,spends],axis=1)
50
 
51
- #df['date'] =pd.to_datetime(df['date']).dt.strftime('%m/%d/%Y')
52
- #df['Prospects']=prospects['Prospects']
53
- #df.drop(['Week'],axis=1,inplace=True)
54
 
55
- # Deserialize and load the objects from the pickle file
56
- # Deserialize and load the objects from the pickle file
57
  with open('data_import.pkl', 'rb') as f:
58
  data = pickle.load(f)
59
 
60
- # Accessing the loaded objects
61
  st.session_state['cleaned_data']= data['final_df']
62
  st.session_state['category_dict'] = data['bin_dict']
63
 
64
  st.title('Data Validation and Insights')
65
 
66
 
67
- # with open("Pickle_files/main_df",'rb') as f:
68
- # st.session_state['cleaned_data']= pickle.load(f)
69
- # with open("Pickle_files/category_dict",'rb') as c:
70
- # st.session_state['category_dict']=pickle.load(c)
71
-
72
- # st.write(st.session_state['cleaned_data'])
73
-
74
  target_variables=[st.session_state['category_dict'][key] for key in st.session_state['category_dict'].keys() if key =='Response Metrics']
75
 
76
-
77
  target_column = st.selectbox('Select the Target Feature/Dependent Variable (will be used in all charts as reference)',list(*target_variables))
78
  st.session_state['target_column']=target_column
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
 
81
- fig=line_plot_target(st.session_state['cleaned_data'], target=target_column, title=f'{target_column} Over Time')
82
- st.plotly_chart(fig, use_container_width=True)
83
 
84
 
85
- media_channel=list(*[st.session_state['category_dict'][key] for key in st.session_state['category_dict'].keys() if key =='Media'])
86
- # st.write(media_channel)
87
 
88
- Non_media_channel=[col for col in st.session_state['cleaned_data'].columns if col not in media_channel]
89
 
90
 
91
- st.markdown('### Annual Data Summary')
92
- st.dataframe(summary(st.session_state['cleaned_data'], media_channel+[target_column], spends=None,Target=True), use_container_width=True)
93
 
94
- if st.checkbox('Show raw data'):
95
- st.write(pd.concat([pd.to_datetime(st.session_state['cleaned_data']['date']).dt.strftime('%m/%d/%Y'),st.session_state['cleaned_data'].select_dtypes(np.number).applymap(format_numbers)],axis=1))
96
  col1 = st.columns(1)
97
 
98
  if "selected_feature" not in st.session_state:
@@ -114,29 +97,30 @@ def generate_profile_report(df):
114
  return report_path
115
 
116
 
117
- st.header('Univariate and Bivariate Analysis')
118
- eda_columns=st.columns(2)
119
- with eda_columns[0]:
120
- if st.button('Generate Profile Report'):
121
- with st.spinner('Generating Report'):
122
- report_file = generate_profile_report(st.session_state['cleaned_data'])
123
-
124
- if os.path.exists(report_file):
125
- with open(report_file, 'rb') as f:
126
- st.success('Report Generated')
127
- st.download_button(
128
- label="Download EDA Report",
129
- data=f.read(),
130
- file_name="pandas_profiling_report.html",
131
- mime="text/html"
132
- )
133
- else:
134
- st.warning("Report generation failed. Unable to find the report file.")
 
135
 
136
  with eda_columns[1]:
137
- if st.button('Generate Sweetviz Report'):
138
  with st.spinner('Generating Report'):
139
- report_file = generate_report_with_target(st.session_state['cleaned_data'], target_column)
140
 
141
  if os.path.exists(report_file):
142
  with open(report_file, 'rb') as f:
@@ -152,130 +136,116 @@ with eda_columns[1]:
152
 
153
 
154
 
155
- st.warning('Work in Progress')
156
-
157
- # selected_media = st.selectbox('Select media', np.unique([Categorised_data[col]['VB'] for col in media_channel]))
158
- # # selected_feature=st.multiselect('Select Metric', df.columns[df.columns.str.contains(selected_media,case=False)])
159
- # st.session_state["selected_feature"]=st.selectbox('Select Metric',[col for col in media_channel if Categorised_data[col]['VB'] in selected_media ] )
160
- # spends_features=[col for col in df.columns if 'spends' in col.lower() or 'cost' in col.lower()]
161
- # spends_feature=[col for col in spends_features if col.split('_')[0] in st.session_state["selected_feature"].split('_')[0]]
162
- # #st.write(spends_features)
163
- # #st.write(spends_feature)
164
- # #st.write(selected_feature)
165
-
166
-
167
- # val_variables=[col for col in media_channel if col!='date']
168
- # if len(spends_feature)==0:
169
- # st.warning('No spends varaible available for the selected metric in data')
170
-
171
- # else:
172
- # st.write(f'Selected spends variable {spends_feature[0]} if wrong please name the varaibles properly')
173
- # # Create the dual-axis line plot
174
- # fig_row1 = line_plot(df, x_col='date', y1_cols=[st.session_state["selected_feature"]], y2_cols=[target_column], title=f'Analysis of {st.session_state["selected_feature"]} and {[target_column][0]} Over Time')
175
- # st.plotly_chart(fig_row1, use_container_width=True)
176
- # st.markdown('### Annual Data Summary')
177
- # st.dataframe(summary(df,[st.session_state["selected_feature"]],spends=spends_feature[0]),use_container_width=True)
178
- # if st.button('Validate'):
179
- # st.session_state['Validation'].append(st.session_state["selected_feature"])
180
-
181
- # if st.checkbox('Validate all'):
182
- # st.session_state['Validation'].extend(val_variables)
183
- # st.success('All media variables are validated ✅')
184
- # if len(set(st.session_state['Validation']).intersection(val_variables))!=len(val_variables):
185
- # #st.write(st.session_state['Validation'])
186
- # validation_data=pd.DataFrame({'Variables':val_variables,
187
- # 'Validated':[1 if col in st.session_state['Validation'] else 0 for col in val_variables],
188
- # 'Bucket':[Categorised_data[col]['VB'] for col in val_variables]})
189
- # gd=GridOptionsBuilder.from_dataframe(validation_data)
190
- # gd.configure_pagination(enabled=True)
191
- # gd.configure_selection(use_checkbox=True,selection_mode='multiple')
192
- # #gd.configure_selection_toggle_all(None, show_toggle_all=True)
193
- # #gd.configure_columns_auto_size_mode(GridOptionsBuilder.configure_columns)
194
- # gridoptions=gd.build()
195
- # #st.text(st.session_state['Validation'])
196
- # table = AgGrid(validation_data,gridOptions=gridoptions,update_mode=GridUpdateMode.SELECTION_CHANGED,fit_columns_on_grid_load=True)
197
- # #st.table(table)
198
- # selected_rows = table["selected_rows"]
199
- # st.session_state['Validation'].extend([col['Variables'] for col in selected_rows])
200
- # not_validated_variables = [col for col in val_variables if col not in st.session_state["Validation"]]
201
- # if not_validated_variables:
202
- # not_validated_message = f'The following variables are not validated:\n{" , ".join(not_validated_variables)}'
203
- # st.warning(not_validated_message)
204
-
205
-
206
-
207
- # st.header('2. Non Media Variables')
208
- # selected_columns_row = [col for col in df.columns if ("imp" not in col.lower()) and ('cli' not in col.lower() ) and ('spend' not in col.lower()) and col!='date']
209
- # selected_columns_row4 = st.selectbox('Select Channel',selected_columns_row )
210
- # if not selected_columns_row4:
211
- # st.warning('Please select at least one.')
212
- # else:
213
- # # Create the dual-axis line plot
214
- # fig_row4 = line_plot(df, x_col='date', y1_cols=[selected_columns_row4], y2_cols=[target_column], title=f'Analysis of {selected_columns_row4} and {target_column} Over Time')
215
- # st.plotly_chart(fig_row4, use_container_width=True)
216
- # selected_non_media=selected_columns_row4
217
- # sum_df = df[['date', selected_non_media,target_column]]
218
- # sum_df['Year']=pd.to_datetime(df['date']).dt.year
219
- # #st.dataframe(df)
220
- # #st.dataframe(sum_df.head(2))
221
- # sum_df=sum_df.groupby('Year').agg('sum')
222
- # sum_df.loc['Grand Total']=sum_df.sum()
223
- # sum_df=sum_df.applymap(format_numbers)
224
- # sum_df.fillna('-',inplace=True)
225
- # sum_df=sum_df.replace({"0.0":'-','nan':'-'})
226
- # st.markdown('### Annual Data Summary')
227
- # st.dataframe(sum_df,use_container_width=True)
228
-
229
- # # if st.checkbox('Validate',key='2'):
230
- # # st.session_state['Validation'].append(selected_columns_row4)
231
- # # val_variables=[col for col in media_channel if col!='date']
232
- # # if st.checkbox('Validate all'):
233
- # # st.session_state['Validation'].extend(val_variables)
234
- # # validation_data=pd.DataFrame({'Variables':val_variables,
235
- # # 'Validated':[1 if col in st.session_state['Validation'] else 0 for col in val_variables],
236
- # # 'Bucket':[Categorised_data[col]['VB'] for col in val_variables]})
237
- # # gd=GridOptionsBuilder.from_dataframe(validation_data)
238
- # # gd.configure_pagination(enabled=True)
239
- # # gd.configure_selection(use_checkbox=True,selection_mode='multiple')
240
- # # #gd.configure_selection_toggle_all(None, show_toggle_all=True)
241
- # # #gd.configure_columns_auto_size_mode(GridOptionsBuilder.configure_columns)
242
- # # gridoptions=gd.build()
243
- # # #st.text(st.session_state['Validation'])
244
- # # table = AgGrid(validation_data,gridOptions=gridoptions,update_mode=GridUpdateMode.SELECTION_CHANGED,fit_columns_on_grid_load=True)
245
- # # #st.table(table)
246
- # # selected_rows = table["selected_rows"]
247
- # # st.session_state['Validation'].extend([col['Variables'] for col in selected_rows])
248
- # # not_validated_variables = [col for col in val_variables if col not in st.session_state["Validation"]]
249
- # # if not_validated_variables:
250
- # # not_validated_message = f'The following variables are not validated:\n{" , ".join(not_validated_variables)}'
251
- # # st.warning(not_validated_message)
252
-
253
- # options = list(df.select_dtypes(np.number).columns)
254
- # st.markdown(' ')
255
- # st.markdown(' ')
256
- # st.markdown('# Exploratory Data Analysis')
257
- # st.markdown(' ')
258
-
259
- # selected_options = []
260
- # num_columns = 4
261
- # num_rows = -(-len(options) // num_columns) # Ceiling division to calculate rows
262
-
263
- # # Create a grid of checkboxes
264
- # st.header('Select Features for Correlation Plot')
265
- # tick=False
266
- # if st.checkbox('Select all'):
267
- # tick=True
268
- # selected_options = []
269
- # for row in range(num_rows):
270
- # cols = st.columns(num_columns)
271
- # for col in cols:
272
- # if options:
273
- # option = options.pop(0)
274
- # selected = col.checkbox(option,value=tick)
275
- # if selected:
276
- # selected_options.append(option)
277
- # # Display selected options
278
- # #st.write('You selected:', selected_options)
279
- # st.pyplot(correlation_plot(df,selected_options,target_column))
280
-
281
 
 
9
  import streamlit as st
10
  import streamlit.components.v1 as components
11
  import sweetviz as sv
12
+ from utilities import set_header,load_local_css
13
  from st_aggrid import GridOptionsBuilder,GridUpdateMode
14
  from st_aggrid import GridOptionsBuilder
15
  from st_aggrid import AgGrid
 
17
  import os
18
  import tempfile
19
  from ydata_profiling import ProfileReport
20
+ import re
 
21
 
22
  st.set_page_config(
23
  page_title="Data Validation",
 
30
 
31
 
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
 
 
 
34
 
 
 
35
  with open('data_import.pkl', 'rb') as f:
36
  data = pickle.load(f)
37
 
 
38
  st.session_state['cleaned_data']= data['final_df']
39
  st.session_state['category_dict'] = data['bin_dict']
40
 
41
  st.title('Data Validation and Insights')
42
 
43
 
 
 
 
 
 
 
 
44
  target_variables=[st.session_state['category_dict'][key] for key in st.session_state['category_dict'].keys() if key =='Response Metrics']
45
 
 
46
  target_column = st.selectbox('Select the Target Feature/Dependent Variable (will be used in all charts as reference)',list(*target_variables))
47
  st.session_state['target_column']=target_column
48
+ panels=st.session_state['category_dict']['Panel Level 1'][0]
49
+ selected_panels=st.multiselect('Please choose the panels you wish to analyze. If no panels are selected, insights will be derived from the overall data.',st.session_state['cleaned_data'][panels].unique())
50
+ aggregation_dict = {item: 'sum' if key == 'Media' else 'mean' for key, value in st.session_state['category_dict'].items() for item in value if item not in ['date','Panel_1']}
51
+
52
+ with st.expander('**Response Metric Analysis**'):
53
+
54
+ if len(selected_panels)>0:
55
+ st.session_state['Cleaned_data_panel']=st.session_state['cleaned_data'][st.session_state['cleaned_data']['Panel_1'].isin(selected_panels)]
56
+
57
+ st.session_state['Cleaned_data_panel']=st.session_state['Cleaned_data_panel'].groupby(by='date').agg(aggregation_dict)
58
+ st.session_state['Cleaned_data_panel']=st.session_state['Cleaned_data_panel'].reset_index()
59
+ else:
60
+ st.session_state['Cleaned_data_panel']=st.session_state['cleaned_data'].groupby(by='date').agg(aggregation_dict)
61
+ st.session_state['Cleaned_data_panel']=st.session_state['Cleaned_data_panel'].reset_index()
62
 
63
 
64
+ fig=line_plot_target(st.session_state['Cleaned_data_panel'], target=target_column, title=f'{target_column} Over Time')
65
+ st.plotly_chart(fig, use_container_width=True)
66
 
67
 
68
+ media_channel=list(*[st.session_state['category_dict'][key] for key in st.session_state['category_dict'].keys() if key =='Media'])
69
+ # st.write(media_channel)
70
 
71
+ Non_media_variables=list(*[st.session_state['category_dict'][key] for key in st.session_state['category_dict'].keys() if key =='Exogenous' or key=='Internal'])
72
 
73
 
74
+ st.markdown('### Annual Data Summary')
75
+ st.dataframe(summary(st.session_state['Cleaned_data_panel'], media_channel+[target_column], spends=None,Target=True), use_container_width=True)
76
 
77
+ if st.checkbox('Show raw data'):
78
+ st.write(pd.concat([pd.to_datetime(st.session_state['Cleaned_data_panel']['date']).dt.strftime('%m/%d/%Y'),st.session_state['Cleaned_data_panel'].select_dtypes(np.number).applymap(format_numbers)],axis=1))
79
  col1 = st.columns(1)
80
 
81
  if "selected_feature" not in st.session_state:
 
97
  return report_path
98
 
99
 
100
+ #st.header()
101
+ with st.expander('Univariate and Bivariate Report'):
102
+ eda_columns=st.columns(2)
103
+ with eda_columns[0]:
104
+ if st.button('Generate Profile Report',help='Univariate report which includes all statistical analysis'):
105
+ with st.spinner('Generating Report'):
106
+ report_file = generate_profile_report(st.session_state['Cleaned_data_panel'])
107
+
108
+ if os.path.exists(report_file):
109
+ with open(report_file, 'rb') as f:
110
+ st.success('Report Generated')
111
+ st.download_button(
112
+ label="Download EDA Report",
113
+ data=f.read(),
114
+ file_name="pandas_profiling_report.html",
115
+ mime="text/html"
116
+ )
117
+ else:
118
+ st.warning("Report generation failed. Unable to find the report file.")
119
 
120
  with eda_columns[1]:
121
+ if st.button('Generate Sweetviz Report',help='Bivariate report for selected response metric'):
122
  with st.spinner('Generating Report'):
123
+ report_file = generate_report_with_target(st.session_state['Cleaned_data_panel'], target_column)
124
 
125
  if os.path.exists(report_file):
126
  with open(report_file, 'rb') as f:
 
136
 
137
 
138
 
139
+ #st.warning('Work in Progress')
140
+ with st.expander('Media Variables Analysis'):
141
+ # Get the selected feature
142
+ st.session_state["selected_feature"]= st.selectbox('Select media', [col for col in media_channel if 'cost' not in col.lower() and 'spend' not in col.lower()])
143
+
144
+ # Filter spends features based on the selected feature
145
+ spends_features = [col for col in st.session_state['Cleaned_data_panel'].columns if any(keyword in col.lower() for keyword in ['cost', 'spend'])]
146
+ spends_feature = [col for col in spends_features if re.split(r'_cost|_spend', col.lower())[0] in st.session_state["selected_feature"]]
147
+
148
+ if 'validation' not in st.session_state:
149
+ st.session_state['validation']=[]
150
+
151
+
152
+ val_variables=[col for col in media_channel if col!='date']
153
+ if len(spends_feature)==0:
154
+ st.warning('No spends varaible available for the selected metric in data')
155
+
156
+ else:
157
+ fig_row1 = line_plot(st.session_state['Cleaned_data_panel'], x_col='date', y1_cols=[st.session_state["selected_feature"]], y2_cols=[target_column], title=f'Analysis of {st.session_state["selected_feature"]} and {[target_column][0]} Over Time')
158
+ st.plotly_chart(fig_row1, use_container_width=True)
159
+ st.markdown('### Summary')
160
+ st.dataframe(summary(st.session_state['cleaned_data'],[st.session_state["selected_feature"]],spends=spends_feature[0]),use_container_width=True)
161
+
162
+ cols2=st.columns(2)
163
+ with cols2[0]:
164
+ if st.button('Validate'):
165
+ st.session_state['validation'].append(st.session_state["selected_feature"])
166
+ with cols2[1]:
167
+ if st.checkbox('Validate all'):
168
+ st.session_state['validation'].extend(val_variables)
169
+ st.success('All media variables are validated ✅')
170
+
171
+ if len(set(st.session_state['validation']).intersection(val_variables))!=len(val_variables):
172
+ validation_data=pd.DataFrame({'Validate':[True if col in st.session_state['validation'] else False for col in val_variables],
173
+ 'Variables':val_variables
174
+ })
175
+ cols3=st.columns([1,30])
176
+ with cols3[1]:
177
+ validation_df=st.data_editor(validation_data,
178
+ # column_config={
179
+ # 'Validate':st.column_config.CheckboxColumn(wi)
180
+
181
+ # },
182
+ column_config={
183
+ "Validate": st.column_config.CheckboxColumn(
184
+ default=False,
185
+ width=100,
186
+ ),
187
+ 'Variables':st.column_config.TextColumn(
188
+ width=1000
189
+
190
+ )
191
+ },hide_index=True)
192
+
193
+ selected_rows = validation_df[validation_df['Validate']==True]['Variables']
194
+
195
+ #st.write(selected_rows)
196
+
197
+ st.session_state['validation'].extend(selected_rows)
198
+
199
+ not_validated_variables = [col for col in val_variables if col not in st.session_state["validation"]]
200
+ if not_validated_variables:
201
+ not_validated_message = f'The following variables are not validated:\n{" , ".join(not_validated_variables)}'
202
+ st.warning(not_validated_message)
203
+
204
+
205
+
206
+ with st.expander('Non Media Variables Analysis'):
207
+ selected_columns_row4 = st.selectbox('Select Channel',Non_media_variables,index=1)
208
+ # # Create the dual-axis line plot
209
+ fig_row4 = line_plot(st.session_state['Cleaned_data_panel'], x_col='date', y1_cols=[selected_columns_row4], y2_cols=[target_column], title=f'Analysis of {selected_columns_row4} and {target_column} Over Time')
210
+ st.plotly_chart(fig_row4, use_container_width=True)
211
+ selected_non_media=selected_columns_row4
212
+ sum_df = st.session_state['Cleaned_data_panel'][['date', selected_non_media,target_column]]
213
+ sum_df['Year']=pd.to_datetime(st.session_state['Cleaned_data_panel']['date']).dt.year
214
+ #st.dataframe(df)
215
+ #st.dataframe(sum_df.head(2))
216
+ sum_df=sum_df.groupby('Year').agg('sum')
217
+ sum_df.loc['Grand Total']=sum_df.sum()
218
+ sum_df=sum_df.applymap(format_numbers)
219
+ sum_df.fillna('-',inplace=True)
220
+ sum_df=sum_df.replace({"0.0":'-','nan':'-'})
221
+ st.markdown('### Summary')
222
+ st.dataframe(sum_df,use_container_width=True)
223
+
224
+
225
+ with st.expander('Correlation Analysis'):
226
+ options = list(st.session_state['Cleaned_data_panel'].select_dtypes(np.number).columns)
227
+
228
+ # selected_options = []
229
+ # num_columns = 4
230
+ # num_rows = -(-len(options) // num_columns) # Ceiling division to calculate rows
231
+
232
+ # # Create a grid of checkboxes
233
+ # st.header('Select Features for Correlation Plot')
234
+ # tick=False
235
+ # if st.checkbox('Select all'):
236
+ # tick=True
237
+ # selected_options = []
238
+ # for row in range(num_rows):
239
+ # cols = st.columns(num_columns)
240
+ # for col in cols:
241
+ # if options:
242
+ # option = options.pop(0)
243
+ # selected = col.checkbox(option,value=tick)
244
+ # if selected:
245
+ # selected_options.append(option)
246
+ # # Display selected options
247
+
248
+ selected_options=st.multiselect('Select Variables For correlation plot',[var for var in options if var!= target_column],default=options[3])
249
+
250
+ st.pyplot(correlation_plot(st.session_state['Cleaned_data_panel'],selected_options,target_column))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
 
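
The panel handling in the page above rests on one rule: when specific panels are selected, media columns are summed and all other metrics are averaged, then everything is re-aggregated by date. A small standalone sketch of that aggregation (toy column names; the page builds aggregation_dict from category_dict instead):

import pandas as pd

df = pd.DataFrame({
    "date": ["2024-01-01", "2024-01-01", "2024-01-08", "2024-01-08"],
    "Panel_1": ["P1", "P2", "P1", "P2"],
    "paid_search_clicks": [100, 50, 80, 40],    # Media -> summed across panels
    "unemployment_rate": [4.0, 5.0, 4.2, 5.2],  # Exogenous -> averaged across panels
})

aggregation_dict = {"paid_search_clicks": "sum", "unemployment_rate": "mean"}
selected_panels = ["P1", "P2"]

subset = df[df["Panel_1"].isin(selected_panels)]
aggregated = subset.groupby("date").agg(aggregation_dict).reset_index()
print(aggregated)
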
pages/2_Transformations.py ADDED
@@ -0,0 +1,522 @@
1
+ # Importing necessary libraries
2
+ import streamlit as st
3
+
4
+ st.set_page_config(
5
+ page_title="Transformations",
6
+ page_icon=":shark:",
7
+ layout="wide",
8
+ initial_sidebar_state="collapsed",
9
+ )
10
+
11
+ import pickle
12
+ import numpy as np
13
+ import pandas as pd
14
+ from utilities import set_header, load_local_css
15
+ import streamlit_authenticator as stauth
16
+ import yaml
17
+ from yaml import SafeLoader
18
+
19
+ load_local_css("styles.css")
20
+ set_header()
21
+
22
+ # Check for authentication status
23
+ for k, v in st.session_state.items():
24
+ if k not in ["logout", "login", "config"] and not k.startswith(
25
+ "FormSubmitter"
26
+ ):
27
+ st.session_state[k] = v
28
+ with open("config.yaml") as file:
29
+ config = yaml.load(file, Loader=SafeLoader)
30
+ st.session_state["config"] = config
31
+ authenticator = stauth.Authenticate(
32
+ config["credentials"],
33
+ config["cookie"]["name"],
34
+ config["cookie"]["key"],
35
+ config["cookie"]["expiry_days"],
36
+ config["preauthorized"],
37
+ )
38
+ st.session_state["authenticator"] = authenticator
39
+ name, authentication_status, username = authenticator.login("Login", "main")
40
+ auth_status = st.session_state.get("authentication_status")
41
+
42
+ if auth_status == True:
43
+ authenticator.logout("Logout", "main")
44
+ is_state_initialized = st.session_state.get("initialized", False)
45
+
46
+ if not is_state_initialized:
47
+
48
+ if 'session_name' not in st.session_state:
49
+ st.session_state['session_name']=None
50
+
51
+
52
+ # Deserialize and load the objects from the pickle file
53
+ with open("data_import.pkl", "rb") as f:
54
+ data = pickle.load(f)
55
+
56
+ # Accessing the loaded objects
57
+ final_df_loaded = data["final_df"]
58
+ bin_dict_loaded = data["bin_dict"]
59
+
60
+ # Initialize session state
61
+ if "transformed_columns_dict" not in st.session_state:
62
+ st.session_state["transformed_columns_dict"] = {} # Default empty dictionary
63
+
64
+ if "final_df" not in st.session_state:
65
+ st.session_state["final_df"] = final_df_loaded # Default as original dataframe
66
+
67
+ if "summary_string" not in st.session_state:
68
+ st.session_state["summary_string"] = None # Default as None
69
+
70
+ # Extract original columns for specified categories
71
+ original_columns = {
72
+ category: bin_dict_loaded[category]
73
+ for category in ["Media", "Internal", "Exogenous"]
74
+ if category in bin_dict_loaded
75
+ }
76
+
77
+ # Retrieve Panel columns
78
+ panel_1 = bin_dict_loaded.get("Panel Level 1")
79
+ panel_2 = bin_dict_loaded.get("Panel Level 2")
80
+
81
+ # # For testing on non panel level
82
+ # final_df_loaded = final_df_loaded.drop("Panel_1", axis=1)
83
+ # final_df_loaded = final_df_loaded.groupby("date").mean().reset_index()
84
+ # panel_1 = None
85
+
86
+ # Apply transformations on panel level
87
+ st.write("")
88
+ if panel_1:
89
+ panel = panel_1 + panel_2 if panel_2 else panel_1
90
+ else:
91
+ panel = []
92
+
93
+
94
+ # Function to build transformation widgets
95
+ def transformation_widgets(category, transform_params, date_granularity):
96
+ # Transformation Options
97
+ transformation_options = {
98
+ "Media": ["Lag", "Moving Average", "Saturation", "Power", "Adstock"],
99
+ "Internal": ["Lead", "Lag", "Moving Average"],
100
+ "Exogenous": ["Lead", "Lag", "Moving Average"],
101
+ }
102
+
103
+ with st.expander(f"{category} Transformations"):
104
+
105
+ # Let users select which transformations to apply
106
+ transformations_to_apply = st.multiselect(
107
+ "Select transformations to apply",
108
+ options=transformation_options[category],
109
+ default=[],
110
+ key=f"transformation_{category}",
111
+ )
112
+
113
+ # Determine the number of transformations to put in each column
114
+ transformations_per_column = (
115
+ len(transformations_to_apply) // 2 + len(transformations_to_apply) % 2
116
+ )
117
+
118
+ # Create two columns
119
+ col1, col2 = st.columns(2)
120
+
121
+ # Assign transformations to each column
122
+ transformations_col1 = transformations_to_apply[:transformations_per_column]
123
+ transformations_col2 = transformations_to_apply[transformations_per_column:]
124
+
125
+ # Define a helper function to create widgets for each transformation
126
+ def create_transformation_widgets(column, transformations):
127
+ with column:
128
+ for transformation in transformations:
129
+ # Conditionally create widgets for selected transformations
130
+ if transformation == "Lead":
131
+ st.markdown(f"**Lead ({date_granularity})**")
132
+ lead = st.slider(
133
+ "Lead periods",
134
+ 1,
135
+ 10,
136
+ (1, 2),
137
+ 1,
138
+ key=f"lead_{category}",
139
+ label_visibility="collapsed",
140
+ )
141
+ start = lead[0]
142
+ end = lead[1]
143
+ step = 1
144
+ transform_params[category]["Lead"] = np.arange(
145
+ start, end + step, step
146
+ )
147
+
148
+ if transformation == "Lag":
149
+ st.markdown(f"**Lag ({date_granularity})**")
150
+ lag = st.slider(
151
+ "Lag periods",
152
+ 1,
153
+ 10,
154
+ (1, 2),
155
+ 1,
156
+ key=f"lag_{category}",
157
+ label_visibility="collapsed",
158
+ )
159
+ start = lag[0]
160
+ end = lag[1]
161
+ step = 1
162
+ transform_params[category]["Lag"] = np.arange(
163
+ start, end + step, step
164
+ )
165
+
166
+ if transformation == "Moving Average":
167
+ st.markdown(f"**Moving Average ({date_granularity})**")
168
+ window = st.slider(
169
+ "Window size for Moving Average",
170
+ 1,
171
+ 10,
172
+ (1, 2),
173
+ 1,
174
+ key=f"ma_{category}",
175
+ label_visibility="collapsed",
176
+ )
177
+ start = window[0]
178
+ end = window[1]
179
+ step = 1
180
+ transform_params[category]["Moving Average"] = np.arange(
181
+ start, end + step, step
182
+ )
183
+
184
+ if transformation == "Saturation":
185
+ st.markdown("**Saturation (%)**")
186
+ saturation_point = st.slider(
187
+ f"Saturation Percentage",
188
+ 0,
189
+ 100,
190
+ (10, 20),
191
+ 10,
192
+ key=f"sat_{category}",
193
+ label_visibility="collapsed",
194
+ )
195
+ start = saturation_point[0]
196
+ end = saturation_point[1]
197
+ step = 10
198
+ transform_params[category]["Saturation"] = np.arange(
199
+ start, end + step, step
200
+ )
201
+
202
+ if transformation == "Power":
203
+ st.markdown("**Power**")
204
+ power = st.slider(
205
+ f"Power",
206
+ 0,
207
+ 10,
208
+ (2, 4),
209
+ 1,
210
+ key=f"power_{category}",
211
+ label_visibility="collapsed",
212
+ )
213
+ start = power[0]
214
+ end = power[1]
215
+ step = 1
216
+ transform_params[category]["Power"] = np.arange(
217
+ start, end + step, step
218
+ )
219
+
220
+ if transformation == "Adstock":
221
+ st.markdown("**Adstock**")
222
+ rate = st.slider(
223
+ f"Factor ({category})",
224
+ 0.0,
225
+ 1.0,
226
+ (0.5, 0.7),
227
+ 0.05,
228
+ key=f"adstock_{category}",
229
+ label_visibility="collapsed",
230
+ )
231
+ start = rate[0]
232
+ end = rate[1]
233
+ step = 0.05
234
+ adstock_range = [
235
+ round(a, 3) for a in np.arange(start, end + step, step)
236
+ ]
237
+ transform_params[category]["Adstock"] = adstock_range
238
+
239
+ # Create widgets in each column
240
+ create_transformation_widgets(col1, transformations_col1)
241
+ create_transformation_widgets(col2, transformations_col2)
242
+
243
+
244
+ # Function to apply Lag transformation
245
+ def apply_lag(df, lag):
246
+ return df.shift(lag)
247
+
248
+
249
+ # Function to apply Lead transformation
250
+ def apply_lead(df, lead):
251
+ return df.shift(-lead)
252
+
253
+
254
+ # Function to apply Moving Average transformation
255
+ def apply_moving_average(df, window_size):
256
+ return df.rolling(window=window_size).mean()
257
+
258
+
259
+ # Function to apply Saturation transformation
260
+ def apply_saturation(df, saturation_percent_100):
261
+ # Convert saturation percentage from 100-based to fraction
262
+ saturation_percent = saturation_percent_100 / 100.0
263
+
264
+ # Calculate saturation point and steepness
265
+ column_max = df.max()
266
+ column_min = df.min()
267
+ saturation_point = (column_min + column_max) / 2
268
+
269
+ numerator = np.log(
270
+ (1 / (saturation_percent if saturation_percent != 1 else 1 - 1e-9)) - 1
271
+ )
272
+ denominator = np.log(saturation_point / max(column_max, 1e-9))
273
+
274
+ steepness = numerator / max(
275
+ denominator, 1e-9
276
+ ) # Avoid division by zero with a small constant
277
+
278
+ # Apply the saturation transformation
279
+ transformed_series = df.apply(
280
+ lambda x: (1 / (1 + (saturation_point / x) ** steepness)) * x
281
+ )
282
+
283
+ return transformed_series
284
+
285
+
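
For intuition, apply_saturation above damps a series with a logistic (S-shaped) multiplier whose steepness is derived from the chosen saturation percentage. A simpler sketch of the same general idea, not the exact parameterisation used above (the half_saturation value is an invented example):

import pandas as pd

def hill_saturation(series: pd.Series, half_saturation: float, slope: float = 2.0) -> pd.Series:
    # Generic S-curve damping: values well below the half-saturation point are damped
    # heavily, values well above it pass through almost unchanged
    return series / (1.0 + (half_saturation / series) ** slope)

spend = pd.Series([10.0, 50.0, 100.0, 200.0, 400.0])
print(hill_saturation(spend, half_saturation=100.0))
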
286
+ # Function to apply Power transformation
287
+ def apply_power(df, power):
288
+ return df**power
289
+
290
+
291
+ # Function to apply Adstock transformation
292
+ def apply_adstock(df, factor):
293
+ x = 0
294
+ # Use the walrus operator to update x iteratively with the Adstock formula
295
+ adstock_var = [x := x * factor + v for v in df]
296
+ ans = pd.Series(adstock_var, index=df.index)
297
+ return ans
298
+
299
+
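
The adstock list comprehension above is a one-line carry-over recursion. An equivalent explicit version with a tiny worked example (numbers invented) makes the decay visible:

import pandas as pd

def adstock(series: pd.Series, factor: float) -> pd.Series:
    # Each period retains `factor` of the previous adstocked value and adds the current value
    carried, out = 0.0, []
    for v in series:
        carried = carried * factor + v
        out.append(carried)
    return pd.Series(out, index=series.index)

spend = pd.Series([100.0, 0.0, 0.0, 50.0])
print(adstock(spend, factor=0.5))
# -> 100.0, 50.0, 25.0, 62.5 : a single burst of spend keeps contributing in later periods
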
300
+ # Function to generate transformed column names
301
+ @st.cache_resource(show_spinner=False)
302
+ def generate_transformed_columns(original_columns, transform_params):
303
+ transformed_columns, summary = {}, {}
304
+
305
+ for category, columns in original_columns.items():
306
+ for column in columns:
307
+ transformed_columns[column] = []
308
+ summary_details = (
309
+ []
310
+ ) # List to hold transformation details for the current column
311
+
312
+ if category in transform_params:
313
+ for transformation, values in transform_params[category].items():
314
+ # Generate transformed column names for each value
315
+ for value in values:
316
+ transformed_name = f"{column}@{transformation}_{value}"
317
+ transformed_columns[column].append(transformed_name)
318
+
319
+ # Format the values list as a string with commas and "and" before the last item
320
+ if len(values) > 1:
321
+ formatted_values = (
322
+ ", ".join(map(str, values[:-1])) + " and " + str(values[-1])
323
+ )
324
+ else:
325
+ formatted_values = str(values[0])
326
+
327
+ # Add transformation details
328
+ summary_details.append(f"{transformation} ({formatted_values})")
329
+
330
+ # Only add to summary if there are transformation details for the column
331
+ if summary_details:
332
+ formatted_summary = "⮕ ".join(summary_details)
333
+ # Use <strong> tags to make the column name bold
334
+ summary[column] = f"<strong>{column}</strong>: {formatted_summary}"
335
+
336
+ # Generate a comprehensive summary string for all columns
337
+ summary_items = [
338
+ f"{idx + 1}. {details}" for idx, details in enumerate(summary.values())
339
+ ]
340
+
341
+ summary_string = "\n".join(summary_items)
342
+
343
+ return transformed_columns, summary_string
344
+
345
+
346
+ # Function to apply transformations to DataFrame slices based on specified categories and parameters
347
+ @st.cache_resource(show_spinner=False)
348
+ def apply_category_transformations(df, bin_dict, transform_params, panel):
349
+ # Dictionary for function mapping
350
+ transformation_functions = {
351
+ "Lead": apply_lead,
352
+ "Lag": apply_lag,
353
+ "Moving Average": apply_moving_average,
354
+ "Saturation": apply_saturation,
355
+ "Power": apply_power,
356
+ "Adstock": apply_adstock,
357
+ }
358
+
359
+ # Initialize category_df as an empty DataFrame
360
+ category_df = pd.DataFrame()
361
+
362
+ # Iterate through each category specified in transform_params
363
+ for category in ["Media", "Internal", "Exogenous"]:
364
+ if (
365
+ category not in transform_params
366
+ or category not in bin_dict
367
+ or not transform_params[category]
368
+ ):
369
+ continue # Skip categories without transformations
370
+
371
+ # Slice the DataFrame based on the columns specified in bin_dict for the current category
372
+ df_slice = df[bin_dict[category] + panel]
373
+
374
+ # Iterate through each transformation and its parameters for the current category
375
+ for transformation, parameters in transform_params[category].items():
376
+ transformation_function = transformation_functions[transformation]
377
+
378
+ # Check if there is panel data to group by
379
+ if len(panel) > 0:
380
+ # Apply the transformation to each group
381
+ category_df = pd.concat(
382
+ [
383
+ df_slice.groupby(panel)
384
+ .transform(transformation_function, p)
385
+ .add_suffix(f"@{transformation}_{p}")
386
+ for p in parameters
387
+ ],
388
+ axis=1,
389
+ )
390
+
391
+ # Replace all NaN or null values in category_df with 0
392
+ category_df.fillna(0, inplace=True)
393
+
394
+ # Update df_slice
395
+ df_slice = pd.concat(
396
+ [df[panel], category_df],
397
+ axis=1,
398
+ )
399
+
400
+ else:
401
+ for p in parameters:
402
+ # Apply the transformation function to each column
403
+ temp_df = df_slice.apply(
404
+ lambda x: transformation_function(x, p), axis=0
405
+ ).rename(lambda x: f"{x}@{transformation}_{p}", axis="columns")
406
+ # Concatenate the transformed DataFrame slice to the category DataFrame
407
+ category_df = pd.concat([category_df, temp_df], axis=1)
408
+
409
+ # Replace all NaN or null values in category_df with 0
410
+ category_df.fillna(0, inplace=True)
411
+
412
+ # Update df_slice
413
+ df_slice = pd.concat(
414
+ [df[panel], category_df],
415
+ axis=1,
416
+ )
417
+
418
+ # If category_df has been modified, concatenate it with the panel and response metrics from the original DataFrame
419
+ if not category_df.empty:
420
+ final_df = pd.concat([df, category_df], axis=1)
421
+ else:
422
+ # If no transformations were applied, use the original DataFrame
423
+ final_df = df
424
+
425
+ return final_df
426
+
427
+
428
+ # Function to infer the granularity of the date column in a DataFrame
429
+ @st.cache_resource(show_spinner=False)
430
+ def infer_date_granularity(df):
431
+ # Find the most common difference
432
+ common_freq = pd.Series(df["date"].unique()).diff().dt.days.dropna().mode()[0]
433
+
434
+ # Map the most common difference to a granularity
435
+ if common_freq == 1:
436
+ return "daily"
437
+ elif common_freq == 7:
438
+ return "weekly"
439
+ elif 28 <= common_freq <= 31:
440
+ return "monthly"
441
+ else:
442
+ return "irregular"
443
+
444
+
445
+ #########################################################################################################################################################
446
+ # User input for transformations
447
+ #########################################################################################################################################################
448
+
449
+
450
+ # Infer date granularity
451
+ date_granularity = infer_date_granularity(final_df_loaded)
452
+
453
+ # Initialize the main dictionary to store the transformation parameters for each category
454
+ transform_params = {"Media": {}, "Internal": {}, "Exogenous": {}}
455
+
456
+ # User input for transformations
457
+ st.markdown("### Select Transformations to Apply")
458
+ for category in ["Media", "Internal", "Exogenous"]:
459
+ # Skip Internal
460
+ if category == "Internal":
461
+ continue
462
+
463
+ transformation_widgets(category, transform_params, date_granularity)
464
+
465
+
466
+ #########################################################################################################################################################
467
+ # Apply transformations
468
+ #########################################################################################################################################################
469
+
470
+
471
+ # Apply category-based transformations to the DataFrame
472
+ if st.button("Accept and Proceed", use_container_width=True):
473
+ with st.spinner("Applying transformations..."):
474
+ final_df = apply_category_transformations(
475
+ final_df_loaded, bin_dict_loaded, transform_params, panel
476
+ )
477
+
478
+ # Generate a dictionary mapping original column names to lists of transformed column names
479
+ transformed_columns_dict, summary_string = generate_transformed_columns(
480
+ original_columns, transform_params
481
+ )
482
+
483
+ # Store into transformed dataframe and summary session state
484
+ st.session_state["final_df"] = final_df
485
+ st.session_state["summary_string"] = summary_string
486
+
487
+
488
+ #########################################################################################################################################################
489
+ # Display the transformed DataFrame and summary
490
+ #########################################################################################################################################################
491
+
492
+
493
+ # Display the transformed DataFrame in the Streamlit app
494
+ st.markdown("### Transformed DataFrame")
495
+ st.dataframe(st.session_state["final_df"], hide_index=True)
496
+
497
+ # Total rows and columns
498
+ total_rows, total_columns = st.session_state["final_df"].shape
499
+ st.markdown(
500
+ f"<p style='text-align: justify;'>The transformed DataFrame contains <strong>{total_rows}</strong> rows and <strong>{total_columns}</strong> columns.</p>",
501
+ unsafe_allow_html=True,
502
+ )
503
+
504
+ # Display the summary of transformations as markdown
505
+ if st.session_state["summary_string"]:
506
+ with st.expander("Summary of Transformations"):
507
+ st.markdown("### Summary of Transformations")
508
+ st.markdown(st.session_state["summary_string"], unsafe_allow_html=True)
509
+
510
+ @st.cache_resource(show_spinner=False)
511
+ def save_to_pickle(file_path, final_df):
512
+ # Open the file in write-binary mode and dump the objects
513
+ with open(file_path, "wb") as f:
514
+ pickle.dump({"final_df_transformed": final_df}, f)
515
+ # Data is now saved to file
516
+
517
+ if st.button("Accept and Save", use_container_width=True):
518
+
519
+ save_to_pickle(
520
+ "final_df_transformed.pkl", st.session_state["final_df"]
521
+ )
522
+ st.toast("💾 Saved Successfully!")
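
One detail worth calling out from this page: transformed columns are named "<column>@<transformation>_<value>", and when panel columns are present each transformation is computed within a panel via groupby().transform(). A minimal sketch of that pattern with a hypothetical one-period lag (column and panel values are invented):

import pandas as pd

df = pd.DataFrame({
    "Panel_1": ["P1", "P1", "P2", "P2"],
    "tv_spend": [100.0, 200.0, 50.0, 80.0],
})

# Lag of 1 period computed per panel, named in the "<column>@<transformation>_<value>" style
lagged = (
    df.groupby("Panel_1")[["tv_spend"]]
      .transform(lambda s: s.shift(1))
      .add_suffix("@Lag_1")
      .fillna(0)
)
print(pd.concat([df, lagged], axis=1))
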
pages/4_Model_Build.py ADDED
@@ -0,0 +1,826 @@
1
+ '''
2
+ MMO Build Sprint 3
3
+ additions : adding more variables to session state for saved model : random effect, predicted train & test
4
+
5
+ MMO Build Sprint 4
6
+ additions : ability to run models for different response metrics
7
+ '''
8
+
9
+ import streamlit as st
10
+ import pandas as pd
11
+ import plotly.express as px
12
+ import plotly.graph_objects as go
13
+ from Eda_functions import format_numbers
14
+ import numpy as np
15
+ import pickle
16
+ from st_aggrid import AgGrid
17
+ from st_aggrid import GridOptionsBuilder, GridUpdateMode
18
+ from utilities import set_header, load_local_css
19
+ from st_aggrid import GridOptionsBuilder
20
+ import time
21
+ import itertools
22
+ import statsmodels.api as sm
23
+ import numpy as np
24
+ import re
25
+ import itertools
26
+ from sklearn.metrics import mean_absolute_error, r2_score, mean_absolute_percentage_error
27
+ from sklearn.preprocessing import MinMaxScaler
28
+ import os
29
+ import matplotlib.pyplot as plt
30
+ from statsmodels.stats.outliers_influence import variance_inflation_factor
31
+
32
+ st.set_option('deprecation.showPyplotGlobalUse', False)
33
+ import statsmodels.api as sm
34
+ import statsmodels.formula.api as smf
35
+
36
+ from datetime import datetime
37
+ import seaborn as sns
38
+ from Data_prep_functions import *
39
+
40
+
41
+
42
+ def get_random_effects(media_data, panel_col, mdf):
43
+ random_eff_df = pd.DataFrame(columns=[panel_col, "random_effect"])
44
+
45
+ for i, market in enumerate(media_data[panel_col].unique()):
46
+ print(i, end='\r')
47
+ intercept = mdf.random_effects[market].values[0]
48
+ random_eff_df.loc[i, 'random_effect'] = intercept
49
+ random_eff_df.loc[i, panel_col] = market
50
+
51
+ return random_eff_df
52
+
53
+
54
+ def mdf_predict(X_df, mdf, random_eff_df):
55
+ X = X_df.copy()
56
+ X['fixed_effect'] = mdf.predict(X)
57
+ X = pd.merge(X, random_eff_df, on=panel_col, how='left')
58
+ X['pred'] = X['fixed_effect'] + X['random_effect']
59
+ # X.to_csv('Test/megred_df.csv',index=False)
60
+ X.drop(columns=['fixed_effect', 'random_effect'], inplace=True)
61
+ return X['pred']
62
+
63
+
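
These two helpers appear to assume mdf is a statsmodels MixedLM fit with a random intercept per panel: mdf.predict gives the fixed-effect part and the per-panel intercept is added back by hand. A self-contained sketch of that decomposition on toy data (all values invented):

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(0)
panels = np.repeat(["P1", "P2", "P3"], 30)
media = rng.uniform(0, 100, size=90)
shift = pd.Series(panels).map({"P1": 5.0, "P2": -3.0, "P3": 0.0}).to_numpy()
df = pd.DataFrame({"panel": panels, "media": media,
                   "y": 2.0 * media + shift + rng.normal(0, 1, size=90)})

# Random-intercept model: one fixed slope for media, one intercept offset per panel
mdf = smf.mixedlm("y ~ media", df, groups=df["panel"]).fit()

fixed = pd.Series(np.asarray(mdf.predict(df)), index=df.index)  # fixed-effect part
random_intercepts = df["panel"].map(lambda g: mdf.random_effects[g].values[0])
pred = fixed + random_intercepts  # same decomposition as get_random_effects + mdf_predict above
print(pred.head())
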
64
+ st.set_page_config(
65
+ page_title="Model Build",
66
+ page_icon=":shark:",
67
+ layout="wide",
68
+ initial_sidebar_state='collapsed'
69
+ )
70
+
71
+ load_local_css('styles.css')
72
+ set_header()
73
+
74
+ st.title('1. Build Your Model')
75
+
76
+ with open("data_import.pkl", "rb") as f:
77
+ data = pickle.load(f)
78
+
79
+ st.session_state['bin_dict'] = data["bin_dict"]
80
+
81
+ #st.write(data["bin_dict"])
82
+
83
+ with open("final_df_transformed.pkl", "rb") as f:
84
+ data = pickle.load(f)
85
+
86
+ # Accessing the loaded objects
87
+ media_data = data["final_df_transformed"]
88
+
89
+ # Sprint4 - available response metrics is a list of all response metrics in the data
90
+ ## these will be put in a drop down
91
+
92
+ st.session_state['media_data']=media_data
93
+
94
+ if 'available_response_metrics' not in st.session_state:
95
+ # st.session_state['available_response_metrics'] = ['Total Approved Accounts - Revenue',
96
+ # 'Total Approved Accounts - Appsflyer',
97
+ # 'Account Requests - Appsflyer',
98
+ # 'App Installs - Appsflyer']
99
+
100
+ st.session_state['available_response_metrics']= st.session_state['bin_dict']["Response Metrics"]
101
+ # Sprint4
102
+ if "is_tuned_model" not in st.session_state:
103
+ st.session_state["is_tuned_model"] = {}
104
+ for resp_metric in st.session_state['available_response_metrics'] :
105
+ resp_metric=resp_metric.lower().replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_")
106
+ st.session_state["is_tuned_model"][resp_metric] = False
107
+
108
+ # Sprint4 - used_response_metrics is a list of resp metrics for which user has created & saved a model
109
+ if 'used_response_metrics' not in st.session_state:
110
+ st.session_state['used_response_metrics'] = []
111
+
112
+ # Sprint4 - saved_model_names
113
+ if 'saved_model_names' not in st.session_state:
114
+ st.session_state['saved_model_names'] = []
115
+
116
+ # if "model_save_flag" not in st.session_state:
117
+ # st.session_state["model_save_flag"]=False
118
+ # def reset_save():
119
+ # st.session_state["model_save_flag"]=False
120
+ # def set_save():
121
+ # st.session_state["model_save_flag"]=True
122
+ # Sprint4 - select a response metric
123
+
124
+
125
+ sel_target_col = st.selectbox("Select the response metric",
126
+ st.session_state['available_response_metrics'])
127
+ # , on_change=reset_save())
128
+ target_col = sel_target_col.lower().replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_")
129
+
130
+ new_name_dct={col:col.lower().replace('.','_').lower().replace('@','_').replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_") for col in media_data.columns}
131
+
132
+ media_data.columns=[col.lower().replace('.','_').replace('@','_').replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_") for col in media_data.columns]
133
+
134
+ #st.write(st.session_state['bin_dict'])
135
+ panel_col = [col.lower().replace('.','_').replace('@','_').replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_") for col in st.session_state['bin_dict']['Panel Level 1']][0]  # set the panel column
136
+ date_col = 'date'
137
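+ # The same rename chain is repeated several times above and below; a small helper
+ # (illustrative only, not used by this page) would keep the cleaning rule in one place:
+ # def clean_name(col):
+ #     return (col.lower().replace('.', '_').replace('@', '_').replace(' ', '_')
+ #             .replace('-', '').replace(':', '').replace('__', '_'))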
+
138
+ #st.write(media_data)
139
+
140
+ is_panel = len(panel_col) > 0
141
+
142
+ if 'is_panel' not in st.session_state:
143
+ st.session_state['is_panel']=False
144
+
145
+
146
+
147
+ # if st.toggle('Apply Transformations on DMA/Panel Level'):
148
+ # media_data = pd.read_csv(r'C:\Users\SrishtiVerma\Mastercard\Sprint2\upf_data_converted_randomized_resp_metrics.csv')
149
+ # media_data.columns = [i.lower().replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_") for i in
150
+ # media_data.columns]
151
+ # dma = st.selectbox('Select the Level of data ',
152
+ # [col for col in media_data.columns if col.lower() in ['dma', 'panel', 'markets']])
153
+ # # is_panel = True
154
+ # # st.session_state['is_panel']=True
155
+ #
156
+ # else:
157
+ # # """ code to aggregate data on date """
158
+ # media_data = pd.read_excel(r'C:\Users\SrishtiVerma\Mastercard\Sprint1\Tactic Level Models\Tactic_level_data_imp_clicks_spends.xlsx')
159
+ # media_data.columns = [i.lower().replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_") for i in
160
+ # media_data.columns]
161
+ # dma = None
162
+ # # is_panel = False
163
+ # # st.session_state['is_panel']=False
164
+
165
+ #media_data = st.session_state["final_df"]
166
+
167
+
168
+
169
+ # st.write(media_data.columns)
170
+
171
+ media_data.sort_values(date_col, inplace=True)
172
+ media_data.reset_index(drop=True, inplace=True)
173
+
174
+ date = media_data[date_col]
175
+ st.session_state['date'] = date
176
+ # revenue=media_data[target_col]
177
+ y = media_data[target_col]
178
+
179
+ if is_panel:
180
+ spends_data = media_data[
181
+ [c for c in media_data.columns if "_cost" in c.lower() or "_spend" in c.lower()] + [date_col, panel_col]]
182
+ # Sprint3 - spends for resp curves
183
+ else:
184
+ spends_data = media_data[
185
+ [c for c in media_data.columns if "_cost" in c.lower() or "_spend" in c.lower()] + [date_col]]
186
+
187
+ y = media_data[target_col]
188
+ # media_data.drop([target_col],axis=1,inplace=True)
189
+ media_data.drop([date_col], axis=1, inplace=True)
190
+ media_data.reset_index(drop=True, inplace=True)
191
+
192
+ # dma_dict={ dm:media_data[media_data[dma]==dm] for dm in media_data[dma].unique()}
193
+
194
+ # st.markdown('## Select the Range of Transformations')
195
+ columns = st.columns(2)
196
+
197
+ old_shape = media_data.shape
198
+
199
+ if "old_shape" not in st.session_state:
200
+ st.session_state['old_shape'] = old_shape
201
+
202
+ # with columns[0]:
203
+ # slider_value_adstock = st.slider('Select Adstock Range (only applied to media)', 0.0, 1.0, (0.2, 0.4), step=0.1,
204
+ # format="%.2f")
205
+ # with columns[1]:
206
+ # slider_value_lag = st.slider('Select Lag Range (applied to media, seasonal, macroeconomic variables)', 1, 7, (1, 3),
207
+ # step=1)
208
+
209
+
210
+ # with columns[2]:
211
+ # slider_value_power=st.slider('Select Power range (only applied to media )',0,4,(1,2),step=1)
212
+
213
+ # with columns[1]:
214
+ # st.number_input('Select the range of half saturation point ',min_value=1,max_value=5)
215
+ # st.number_input('Select the range of ')
216
+
217
+ # Section 1 - Transformations Functions
218
+ # def lag(data, features, lags, dma=None):
219
+ # if dma:
220
+ #
221
+ # transformed_data = pd.concat(
222
+ # [data.groupby([dma])[features].shift(lag).add_suffix(f'_lag_{lag}') for lag in lags], axis=1)
223
+ # # transformed_data = transformed_data.fillna(method='bfill')
224
+ # transformed_data = transformed_data.bfill() # Sprint4 - fillna getting deprecated
225
+ # return pd.concat([transformed_data, data], axis=1)
226
+ #
227
+ # else:
228
+ #
229
+ # # ''' data should be aggregated on date'''
230
+ #
231
+ # transformed_data = pd.concat([data[features].shift(lag).add_suffix(f'_lag_{lag}') for lag in lags], axis=1)
232
+ # # transformed_data = transformed_data.fillna(method='bfill')
233
+ # transformed_data = transformed_data.bfill()
234
+ #
235
+ # return pd.concat([transformed_data, data], axis=1)
236
+ #
237
+ #
238
+ # # adstock
239
+ # def adstock(df, alphas, cutoff, features, dma=None):
240
+ # if dma:
241
+ # transformed_data = pd.DataFrame()
242
+ # for d in df[dma].unique():
243
+ # dma_sub_df = df[df[dma] == d]
244
+ # n = len(dma_sub_df)
245
+ #
246
+ # weights = np.array(
247
+ # [[[alpha ** (i - j) if i >= j and j >= i - cutoff else 0. for j in range(n)] for i in range(n)] for
248
+ # alpha in alphas])
249
+ # X = dma_sub_df[features].to_numpy()
250
+ #
251
+ # res = pd.DataFrame(np.hstack(weights @ X),
252
+ # columns=[f'{col}_adstock_{alpha}' for alpha in alphas for col in features])
253
+ #
254
+ # transformed_data = pd.concat([transformed_data, res], axis=0)
255
+ # transformed_data.reset_index(drop=True, inplace=True)
256
+ # return pd.concat([transformed_data, df], axis=1)
257
+ #
258
+ # else:
259
+ #
260
+ # n = len(df)
261
+ #
262
+ # weights = np.array(
263
+ # [[[alpha ** (i - j) if i >= j and j >= i - cutoff else 0. for j in range(n)] for i in range(n)] for alpha in
264
+ # alphas])
265
+ #
266
+ # X = df[features].to_numpy()
267
+ # res = pd.DataFrame(np.hstack(weights @ X),
268
+ # columns=[f'{col}_adstock_{alpha}' for alpha in alphas for col in features])
269
+ # return pd.concat([res, df], axis=1)
270
+
271
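+ # The commented block above builds adstock through explicit weight matrices with a cutoff;
+ # the recursive (no-cutoff) form of the same geometric decay is a useful reference sketch:
+ # def geometric_adstock(x, alpha):
+ #     out, carry = [], 0.0
+ #     for v in x:
+ #         carry = v + alpha * carry
+ #         out.append(carry)
+ #     return pd.Series(out, index=x.index)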
+
272
+ # Section 2 - Begin Transformations
273
+
274
+ if 'media_data' not in st.session_state:
275
+ st.session_state['media_data'] = pd.DataFrame()
276
+
277
+ # Sprint3
278
+ if "orig_media_data" not in st.session_state:
279
+ st.session_state['orig_media_data'] = pd.DataFrame()
280
+
281
+ # Sprint3 additions
282
+ if 'random_effects' not in st.session_state:
283
+ st.session_state['random_effects'] = pd.DataFrame()
284
+ if 'pred_train' not in st.session_state:
285
+ st.session_state['pred_train'] = []
286
+ if 'pred_test' not in st.session_state:
287
+ st.session_state['pred_test'] = []
288
+ # end of Sprint3 additions
289
+
290
+ # variables_to_be_transformed=[col for col in media_data.columns if col.lower() not in ['dma','panel'] ] # change for buckets
291
+ # variables_to_be_transformed = [col for col in media_data.columns if
292
+ # '_clicks' in col.lower() or '_impress' in col.lower()] # srishti - change
293
+ #
294
+ # with columns[0]:
295
+ # if st.button('Apply Transformations'):
296
+ # with st.spinner('Applying Transformations'):
297
+ # transformed_data_lag = lag(media_data, features=variables_to_be_transformed,
298
+ # lags=np.arange(slider_value_lag[0], slider_value_lag[1] + 1, 1), dma=dma)
299
+ #
300
+ # # variables_to_be_transformed=[col for col in list(transformed_data_lag.columns) if col not in ['Date','DMA','Panel']] #change for buckets
301
+ # variables_to_be_transformed = [col for col in media_data.columns if
302
+ # '_clicks' in col.lower() or '_impress' in col.lower()] # srishti - change
303
+ #
304
+ # transformed_data_adstock = adstock(df=transformed_data_lag,
305
+ # alphas=np.arange(slider_value_adstock[0], slider_value_adstock[1], 0.1),
306
+ # cutoff=8, features=variables_to_be_transformed, dma=dma)
307
+ #
308
+ # # st.success('Done')
309
+ # st.success("Transformations complete!")
310
+ #
311
+ # st.write(f'old shape {old_shape}, new shape {transformed_data_adstock.shape}')
312
+ #
313
+ # transformed_data_adstock.columns = [c.replace(".", "_") for c in
314
+ # transformed_data_adstock.columns] # srishti
315
+ # st.session_state['media_data'] = transformed_data_adstock # srishti
316
+ # # Sprint3
317
+ # orig_media_data = media_data.copy()
318
+ # orig_media_data[date_col] = date
319
+ # orig_media_data[target_col] = y
320
+ # st.session_state['orig_media_data'] = orig_media_data # srishti
321
+ #
322
+ # # with st.spinner('Applying Transformations'):
323
+ # # time.sleep(2)
324
+ # # st.success("Transformations complete!")
325
+ #
326
+ # # if st.session_state['media_data'].shape[1]>old_shape[1]:
327
+ # # with columns[0]:
328
+ # # st.write(f'Total no.of variables before transformation: {old_shape[1]}, Total no.of variables after transformation: {st.session_state["media_data"].shape[1]}')
329
+ # # st.write(f'Total no.of variables after transformation: {st.session_state["media_data"].shape[1]}')
330
+
331
+ # Section 3 - Create combinations
332
+
333
+ # bucket=['paid_search', 'kwai','indicacao','infleux', 'influencer','FB: Level Achieved - Tier 1 Impressions',
334
+ # ' FB: Level Achieved - Tier 2 Impressions','paid_social_others',
335
+ # ' GA App: Will And Cid Pequena Baixo Risco Clicks',
336
+ # 'digital_tactic_others',"programmatic"
337
+ # ]
338
+
339
+ # srishti - bucket names changed
340
+ bucket = ['paid_search', 'kwai', 'indicacao', 'infleux', 'influencer', 'fb_level_achieved_tier_2',
341
+ 'fb_level_achieved_tier_1', 'paid_social_others',
342
+ 'ga_app',
343
+ 'digital_tactic_others', "programmatic"
344
+ ]
345
+
346
+ with columns[0]:
347
+ if st.button('Create Combinations of Variables'):
348
+
349
+ top_3_correlated_features = []
350
+ # # for col in st.session_state['media_data'].columns[:19]:
351
+ # original_cols = [c for c in st.session_state['media_data'].columns if
352
+ # "_clicks" in c.lower() or "_impressions" in c.lower()]
353
+ #original_cols = [c for c in original_cols if "_lag" not in c.lower() and "_adstock" not in c.lower()]
354
+
355
+ original_cols=st.session_state['bin_dict']['Media'] + st.session_state['bin_dict']['Internal']
356
+
357
+ original_cols=[col.lower().replace('.','_').replace('@','_').replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_") for col in original_cols]
358
+
359
+ #st.write(original_cols)
360
+ # for col in st.session_state['media_data'].columns[:19]:
361
+ for col in original_cols: # srishti - new
362
+ corr_df = pd.concat([st.session_state['media_data'].filter(regex=col),
363
+ y], axis=1).corr()[target_col].iloc[:-1]
364
+ top_3_correlated_features.append(list(corr_df.sort_values(ascending=False).head(2).index))
365
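+ # (Despite the list's name, only the top 2 transformed variants per original column
+ # are kept here, via head(2).)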
+ flattened_list = [item for sublist in top_3_correlated_features for item in sublist]
366
+ # all_features_set={var:[col for col in flattened_list if var in col] for var in bucket}
367
+ all_features_set = {var: [col for col in flattened_list if var in col] for var in bucket if
368
+ len([col for col in flattened_list if var in col]) > 0} # srishti
369
+
370
+ channels_all = [values for values in all_features_set.values()]
371
+ st.session_state['combinations'] = list(itertools.product(*channels_all))
372
+ # if 'combinations' not in st.session_state:
373
+ # st.session_state['combinations']=combinations_all
374
+
375
+ st.session_state['final_selection'] = st.session_state['combinations']
376
+ st.success('Done')
377
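+ # Illustrative note: with one candidate list per channel bucket, itertools.product yields
+ # every cross-bucket feature set, so the count is the product of the list lengths, e.g.
+ # list(itertools.product(['a1', 'a2'], ['b1']))  ->  [('a1', 'b1'), ('a2', 'b1')]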
+
378
+ # revenue.reset_index(drop=True,inplace=True)
379
+ y.reset_index(drop=True, inplace=True)
380
+ if 'Model_results' not in st.session_state:
381
+ st.session_state['Model_results'] = {'Model_object': [],
382
+ 'Model_iteration': [],
383
+ 'Feature_set': [],
384
+ 'MAPE': [],
385
+ 'R2': [],
386
+ 'ADJR2': [],
387
+ 'pos_count': []
388
+ }
389
+
390
+
391
+ def reset_model_result_dct():
392
+ st.session_state['Model_results'] = {'Model_object': [],
393
+ 'Model_iteration': [],
394
+ 'Feature_set': [],
395
+ 'MAPE': [],
396
+ 'R2': [],
397
+ 'ADJR2': [],
398
+ 'pos_count': []
399
+ }
400
+
401
+ # if st.button('Build Model'):
402
+
403
+
404
+ if 'iterations' not in st.session_state:
405
+ st.session_state['iterations'] = 0
406
+
407
+ if 'final_selection' not in st.session_state:
408
+ st.session_state['final_selection'] = False
409
+
410
+ save_path = r"Model/"
411
+ with columns[1]:
412
+ if st.session_state['final_selection']:
413
+ st.write(f'Total combinations created {format_numbers(len(st.session_state["final_selection"]))}')
414
+
415
+ if st.checkbox('Build all iterations'):
416
+ iterations = len(st.session_state['final_selection'])
417
+ else:
418
+ iterations = st.number_input('Select the number of iterations to perform', min_value=0, step=100,
419
+ value=st.session_state['iterations'], on_change=reset_model_result_dct)
420
+ # st.write("iterations=", iterations)
421
+
422
+
423
+ if st.button('Build Model', on_click=reset_model_result_dct):
424
+ st.session_state['iterations'] = iterations
425
+
426
+ # Section 4 - Model
427
+ # st.session_state['media_data'] = st.session_state['media_data'].fillna(method='ffill')
428
+ st.session_state['media_data'] = st.session_state['media_data'].ffill()
429
+ st.markdown(
430
+ 'Data Split -- Training Period: May 9th, 2023 - October 5th, 2023, Testing Period: October 6th, 2023 - November 7th, 2023')
431
+ progress_bar = st.progress(0) # Initialize the progress bar
432
+ # time_remaining_text = st.empty() # Create an empty space for time remaining text
433
+ start_time = time.time() # Record the start time
434
+ progress_text = st.empty()
435
+
436
+ # time_elapsed_text = st.empty()
437
+ # for i, selected_features in enumerate(st.session_state["final_selection"][40000:40000 + int(iterations)]):
438
+ # st.write(st.session_state["final_selection"])
439
+ # for i, selected_features in enumerate(st.session_state["final_selection"]):
440
+
441
+ if is_panel == True:
442
+ for i, selected_features in enumerate(st.session_state["final_selection"][0:int(iterations)]): # srishti
443
+ df = st.session_state['media_data']
444
+
445
+ fet = [var for var in selected_features if len(var) > 0]
446
+ inp_vars_str = " + ".join(fet) # new
447
+
448
+ X = df[fet]
449
+ y = df[target_col]
450
+ ss = MinMaxScaler()
451
+ X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
452
+
453
+ X[target_col] = y # Sprint2
454
+ X[panel_col] = df[panel_col] # Sprint2
455
+
456
+ X_train = X.iloc[:8000]
457
+ X_test = X.iloc[8000:]
458
+ y_train = y.iloc[:8000]
459
+ y_test = y.iloc[8000:]
460
+
461
+ print(X_train.shape)
462
+ # model = sm.OLS(y_train, X_train).fit()
463
+ md_str = target_col + " ~ " + inp_vars_str
464
+ # md = smf.mixedlm("total_approved_accounts_revenue ~ {}".format(inp_vars_str),
465
+ # data=X_train[[target_col] + fet],
466
+ # groups=X_train[panel_col])
467
+ md = smf.mixedlm(md_str,
468
+ data=X_train[[target_col] + fet],
469
+ groups=X_train[panel_col])
470
+ mdf = md.fit()
471
+ predicted_values = mdf.fittedvalues
472
+
473
+ coefficients = mdf.fe_params.to_dict()
474
+ model_positive = [col for col in coefficients.keys() if coefficients[col] > 0]
475
+
476
+ pvalues = [var for var in list(mdf.pvalues) if var <= 0.06]
477
+
478
+ if (len(model_positive) / len(selected_features)) > 0 and (
479
+ len(pvalues) / len(selected_features)) >= 0: # srishti - changed just for testing, revert later
480
+ # predicted_values = model.predict(X_train)
481
+ mape = mean_absolute_percentage_error(y_train, predicted_values)
482
+ r2 = r2_score(y_train, predicted_values)
483
+ adjr2 = 1 - (1 - r2) * (len(y_train) - 1) / (len(y_train) - len(selected_features) - 1)
484
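+ # i.e. adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), with n training rows
+ # and p selected features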
+
485
+ filename = os.path.join(save_path, f"model_{i}.pkl")
486
+ with open(filename, "wb") as f:
487
+ pickle.dump(mdf, f)
488
+ # with open(r"C:\Users\ManojP\Documents\MMM\simopt\Model\model.pkl", 'rb') as file:
489
+ # model = pickle.load(file)
490
+
491
+ st.session_state['Model_results']['Model_object'].append(filename)
492
+ st.session_state['Model_results']['Model_iteration'].append(i)
493
+ st.session_state['Model_results']['Feature_set'].append(fet)
494
+ st.session_state['Model_results']['MAPE'].append(mape)
495
+ st.session_state['Model_results']['R2'].append(r2)
496
+ st.session_state['Model_results']['pos_count'].append(len(model_positive))
497
+ st.session_state['Model_results']['ADJR2'].append(adjr2)
498
+
499
+ current_time = time.time()
500
+ time_taken = current_time - start_time
501
+ time_elapsed_minutes = time_taken / 60
502
+ completed_iterations_text = f"{i + 1}/{iterations}"
503
+ progress_bar.progress((i + 1) / int(iterations))
504
+ progress_text.text(
505
+ f'Completed iterations: {completed_iterations_text}, Time Elapsed (min): {time_elapsed_minutes:.2f}')
506
+ st.write(
507
+ f'Out of {st.session_state["iterations"]} iterations: {len(st.session_state["Model_results"]["Model_object"])} valid models')
508
+
509
+ else:
510
+
511
+ for i, selected_features in enumerate(st.session_state["final_selection"][0:int(iterations)]): # srishti
512
+ df = st.session_state['media_data']
513
+
514
+ fet = [var for var in selected_features if len(var) > 0]
515
+ inp_vars_str = " + ".join(fet)
516
+
517
+ X = df[fet]
518
+ y = df[target_col]
519
+ ss = MinMaxScaler()
520
+ X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
521
+ X = sm.add_constant(X)
522
+ X_train = X.iloc[:130]
523
+ X_test = X.iloc[130:]
524
+ y_train = y.iloc[:130]
525
+ y_test = y.iloc[130:]
526
+
527
+ model = sm.OLS(y_train, X_train).fit()
528
+
529
+
530
+ coefficients = model.params.to_list()
531
+ model_positive = [coef for coef in coefficients if coef > 0]
532
+ predicted_values = model.predict(X_train)
533
+ pvalues = [var for var in list(model.pvalues) if var <= 0.06]
534
+
535
+ # if (len(model_possitive) / len(selected_features)) > 0.9 and (len(pvalues) / len(selected_features)) >= 0.8:
536
+ if (len(model_positive) / len(selected_features)) > 0 and (len(pvalues) / len(
537
+ selected_features)) >= 0.5: # srishti - changed just for testing, revert later VALID MODEL CRITERIA
538
+ # predicted_values = model.predict(X_train)
539
+ mape = mean_absolute_percentage_error(y_train, predicted_values)
540
+ adjr2 = model.rsquared_adj
541
+ r2 = model.rsquared
542
+
543
+ filename = os.path.join(save_path, f"model_{i}.pkl")
544
+ with open(filename, "wb") as f:
545
+ pickle.dump(model, f)
546
+ # with open(r"C:\Users\ManojP\Documents\MMM\simopt\Model\model.pkl", 'rb') as file:
547
+ # model = pickle.load(file)
548
+
549
+ st.session_state['Model_results']['Model_object'].append(filename)
550
+ st.session_state['Model_results']['Model_iteration'].append(i)
551
+ st.session_state['Model_results']['Feature_set'].append(fet)
552
+ st.session_state['Model_results']['MAPE'].append(mape)
553
+ st.session_state['Model_results']['R2'].append(r2)
554
+ st.session_state['Model_results']['ADJR2'].append(adjr2)
555
+ st.session_state['Model_results']['pos_count'].append(len(model_positive))
556
+
557
+ current_time = time.time()
558
+ time_taken = current_time - start_time
559
+ time_elapsed_minutes = time_taken / 60
560
+ completed_iterations_text = f"{i + 1}/{iterations}"
561
+ progress_bar.progress((i + 1) / int(iterations))
562
+ progress_text.text(
563
+ f'Completed iterations: {completed_iterations_text}, Time Elapsed (min): {time_elapsed_minutes:.2f}')
564
+ st.write(
565
+ f'Out of {st.session_state["iterations"]} iterations: {len(st.session_state["Model_results"]["Model_object"])} valid models')
566
+
567
+ pd.DataFrame(st.session_state['Model_results']).to_csv('model_output.csv')
568
+
569
+
570
+ def to_percentage(value):
571
+ return f'{value * 100:.1f}%'
572
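+ # e.g. to_percentage(0.1234) -> '12.3%'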
+
573
+ ## Section 5 - Select Model
574
+ st.title('2. Select Models')
575
+ if 'tick' not in st.session_state:
576
+ st.session_state['tick'] = False
577
+ if st.checkbox('Show results of top 10 models (based on MAPE and Adj. R2)', value=st.session_state['tick']):
578
+ st.session_state['tick'] = True
579
+ st.write('Select one model iteration to generate performance metrics for it:')
580
+ data = pd.DataFrame(st.session_state['Model_results'])
581
+ data = data[data['pos_count']==data['pos_count'].max()].reset_index(drop=True) # Sprint4 -- Srishti -- only show models with the lowest num of neg coeffs
582
+ data.sort_values(by=['ADJR2'], ascending=False, inplace=True)
583
+ data.drop_duplicates(subset='Model_iteration', inplace=True)
584
+ top_10 = data.head(10)
585
+ top_10['Rank'] = np.arange(1, len(top_10) + 1, 1)
586
+ top_10[['MAPE', 'R2', 'ADJR2']] = np.round(top_10[['MAPE', 'R2', 'ADJR2']], 4).applymap(to_percentage)
587
+ top_10_table = top_10[['Rank', 'Model_iteration', 'MAPE', 'ADJR2', 'R2']]
588
+ # top_10_table.columns=[['Rank','Model Iteration Index','MAPE','Adjusted R2','R2']]
589
+ gd = GridOptionsBuilder.from_dataframe(top_10_table)
590
+ gd.configure_pagination(enabled=True)
591
+
592
+ gd.configure_selection(
593
+ use_checkbox=True,
594
+ selection_mode="single",
595
+ pre_select_all_rows=False,
596
+ pre_selected_rows=[1],
597
+ )
598
+
599
+ gridoptions = gd.build()
600
+
601
+ table = AgGrid(top_10, gridOptions=gridoptions, update_mode=GridUpdateMode.SELECTION_CHANGED)
602
+
603
+ selected_rows = table.selected_rows
604
+ # if st.session_state["selected_rows"] != selected_rows:
605
+ # st.session_state["build_rc_cb"] = False
606
+ st.session_state["selected_rows"] = selected_rows
607
+ if 'Model' not in st.session_state:
608
+ st.session_state['Model'] = {}
609
+
610
+ # Section 6 - Display Results
611
+
612
+ if len(selected_rows) > 0:
613
+ st.header('2.1 Results Summary')
614
+
615
+ model_object = data[data['Model_iteration'] == selected_rows[0]['Model_iteration']]['Model_object']
616
+ features_set = data[data['Model_iteration'] == selected_rows[0]['Model_iteration']]['Feature_set']
617
+
618
+ with open(str(model_object.values[0]), 'rb') as file:
619
+ # print(file)
620
+ model = pickle.load(file)
621
+ st.write(model.summary())
622
+ st.header('2.2 Actual vs. Predicted Plot')
623
+
624
+ if is_panel :
625
+ df = st.session_state['media_data']
626
+ X = df[features_set.values[0]]
627
+ y = df[target_col]
628
+
629
+ ss = MinMaxScaler()
630
+ X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
631
+
632
+ # Sprint2 changes
633
+ X[target_col] = y # new
634
+ X[panel_col] = df[panel_col]
635
+ X[date_col] = date
636
+
637
+ X_train = X.iloc[:8000]
638
+ X_test = X.iloc[8000:].reset_index(drop=True)
639
+ y_train = y.iloc[:8000]
640
+ y_test = y.iloc[8000:].reset_index(drop=True)
641
+
642
+ test_spends = spends_data[8000:] # Sprint3 - test spends for resp curves
643
+ random_eff_df = get_random_effects(media_data, panel_col, model)
644
+ train_pred = model.fittedvalues
645
+ test_pred = mdf_predict(X_test, model, random_eff_df)
646
+ print("__" * 20, test_pred.isna().sum())
647
+
648
+ else :
649
+ df = st.session_state['media_data']
650
+ X = df[features_set.values[0]]
651
+ y = df[target_col]
652
+
653
+ ss = MinMaxScaler()
654
+ X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
655
+ X = sm.add_constant(X)
656
+
657
+ X[date_col] = date
658
+
659
+ X_train = X.iloc[:130]
660
+ X_test = X.iloc[130:].reset_index(drop=True)
661
+ y_train = y.iloc[:130]
662
+ y_test = y.iloc[130:].reset_index(drop=True)
663
+
664
+ test_spends = spends_data[130:] # Sprint3 - test spends for resp curves
665
+ train_pred = model.predict(X_train[features_set.values[0]+['const']])
666
+ test_pred = model.predict(X_test[features_set.values[0]+['const']])
667
+
668
+
669
+ # save train/test actuals and predictions to the Test folder - srishti
670
+ x_test_to_save = X_test.copy()
671
+ x_test_to_save['Actuals'] = y_test
672
+ x_test_to_save['Predictions'] = test_pred
673
+
674
+ x_train_to_save = X_train.copy()
675
+ x_train_to_save['Actuals'] = y_train
676
+ x_train_to_save['Predictions'] = train_pred
677
+
678
+ x_train_to_save.to_csv('Test/x_train_to_save.csv', index=False)
679
+ x_test_to_save.to_csv('Test/x_test_to_save.csv', index=False)
680
+
681
+ st.session_state['X'] = X_train
682
+ st.session_state['features_set'] = features_set.values[0]
683
+ print("**" * 20, "selected model features : ", features_set.values[0])
684
+ metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(X_train[date_col], y_train, train_pred,
685
+ model, target_column=sel_target_col,
686
+ is_panel=is_panel) # Sprint2
687
+
688
+ st.plotly_chart(actual_vs_predicted_plot, use_container_width=True)
689
+
690
+ st.markdown('## 2.3 Residual Analysis')
691
+ columns = st.columns(2)
692
+ with columns[0]:
693
+ fig = plot_residual_predicted(y_train, train_pred, X_train) # Sprint2
694
+ st.plotly_chart(fig)
695
+
696
+ with columns[1]:
697
+ st.empty()
698
+ fig = qqplot(y_train, train_pred) # Sprint2
699
+ st.plotly_chart(fig)
700
+
701
+ with columns[0]:
702
+ fig = residual_distribution(y_train, train_pred) # Sprint2
703
+ st.pyplot(fig)
704
+
705
+ vif_data = pd.DataFrame()
706
+ # X=X.drop('const',axis=1)
707
+ X_train_orig = X_train.copy() # Sprint2 -- creating a copy of xtrain. Later deleting panel, target & date from xtrain
708
+ del_col_list = list(set([target_col, panel_col, date_col]).intersection(list(X_train.columns)))
709
+ X_train.drop(columns=del_col_list, inplace=True) # Sprint2
710
+
711
+ vif_data["Variable"] = X_train.columns
712
+ vif_data["VIF"] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
713
+ vif_data.sort_values(by=['VIF'], ascending=False, inplace=True)
714
+ vif_data = np.round(vif_data)
715
+ vif_data['VIF'] = vif_data['VIF'].astype(float)
716
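+ # For reference: VIF_j = 1 / (1 - R_j^2), where R_j^2 comes from regressing feature j on
+ # the remaining features; the colour bands below treat VIF < 3 as low, 3-10 as moderate
+ # and > 10 as high multicollinearity.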
+ st.header('2.4 Variance Inflation Factor (VIF)')
717
+ # st.dataframe(vif_data)
718
+ color_mapping = {
719
+ 'darkgreen': (vif_data['VIF'] < 3),
720
+ 'orange': (vif_data['VIF'] >= 3) & (vif_data['VIF'] <= 10),
721
+ 'darkred': (vif_data['VIF'] > 10)
722
+ }
723
+
724
+ # Create a horizontal bar plot
725
+ fig, ax = plt.subplots()
726
+ fig.set_figwidth(10) # Adjust the width of the figure as needed
727
+
728
+ # Sort the bars by descending VIF values
729
+ vif_data = vif_data.sort_values(by='VIF', ascending=False)
730
+
731
+ # Iterate through the color mapping and plot bars with corresponding colors
732
+ for color, condition in color_mapping.items():
733
+ subset = vif_data[condition]
734
+ bars = ax.barh(subset["Variable"], subset["VIF"], color=color, label=color)
735
+
736
+ # Add text annotations on top of the bars
737
+ for bar in bars:
738
+ width = bar.get_width()
739
+ ax.annotate(f'{width:}', xy=(width, bar.get_y() + bar.get_height() / 2), xytext=(5, 0),
740
+ textcoords='offset points', va='center')
741
+
742
+ # Customize the plot
743
+ ax.set_xlabel('VIF Values')
744
+ # ax.set_title('2.4 Variance Inflation Factor (VIF)')
745
+ # ax.legend(loc='upper right')
746
+
747
+ # Display the plot in Streamlit
748
+ st.pyplot(fig)
749
+
750
+ with st.expander('Results Summary Test data'):
751
+ # ss = MinMaxScaler()
752
+ # X_test = pd.DataFrame(ss.fit_transform(X_test), columns=X_test.columns)
753
+ st.header('2.2 Actual vs. Predicted Plot')
754
+
755
+ metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(X_test[date_col], y_test,
756
+ test_pred, model,
757
+ target_column=sel_target_col,
758
+ is_panel=is_panel) # Sprint2
759
+
760
+ st.plotly_chart(actual_vs_predicted_plot, use_container_width=True)
761
+
762
+ st.markdown('## 2.3 Residual Analysis')
763
+ columns = st.columns(2)
764
+ with columns[0]:
765
+ fig = plot_residual_predicted(y_test, test_pred, X_test) # Sprint2
766
+ st.plotly_chart(fig)
767
+
768
+ with columns[1]:
769
+ st.empty()
770
+ fig = qqplot(y_test, test_pred) # Sprint2
771
+ st.plotly_chart(fig)
772
+
773
+ with columns[0]:
774
+ fig = residual_distribution(y_test, test_pred) # Sprint2
775
+ st.pyplot(fig)
776
+
777
+ value = False
778
+ save_button_model = st.checkbox('Save this model to tune', key='build_rc_cb') # , on_click=set_save())
779
+
780
+ if save_button_model:
781
+ mod_name = st.text_input('Enter model name')
782
+ if len(mod_name) > 0:
783
+ mod_name = mod_name + "__" + target_col # Sprint4 - adding target col to model name
784
+ if is_panel :
785
+ pred_train= model.fittedvalues
786
+ pred_test= mdf_predict(X_test, model, random_eff_df)
787
+ else :
788
+ st.session_state['features_set'] = st.session_state['features_set'] + ['const']
789
+ pred_train= model.predict(X_train_orig[st.session_state['features_set']])
790
+ pred_test= model.predict(X_test[st.session_state['features_set']])
791
+
792
+ st.session_state['Model'][mod_name] = {"Model_object": model,
793
+ 'feature_set': st.session_state['features_set'],
794
+ 'X_train': X_train_orig,
795
+ 'X_test': X_test,
796
+ 'y_train': y_train,
797
+ 'y_test': y_test,
798
+ 'pred_train':pred_train,
799
+ 'pred_test': pred_test
800
+ }
801
+ st.session_state['X_train'] = X_train_orig
802
+ # st.session_state['X_test'] = X_test
803
+ # st.session_state['y_train'] = y_train
804
+ # st.session_state['y_test'] = y_test
805
+ st.session_state['X_test_spends'] = test_spends
806
+ # st.session_state['base_model'] = model
807
+ # st.session_state['base_model_feature_set'] = st.session_state['features_set']
808
+ st.session_state['saved_model_names'].append(mod_name)
809
+ # Sprint3 additions
810
+ if is_panel :
811
+ random_eff_df = get_random_effects(media_data, panel_col, model)
812
+ st.session_state['random_effects'] = random_eff_df
813
+
814
+ # st.session_state['pred_train'] = model.fittedvalues
815
+ # st.session_state['pred_test'] = mdf_predict(X_test, model, random_eff_df)
816
+ # # End of Sprint3 additions
817
+
818
+ with open("best_models.pkl", "wb") as f:
819
+ pickle.dump(st.session_state['Model'], f)
820
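+ # Reload sketch (illustrative): other pages can read this bundle back with
+ # with open("best_models.pkl", "rb") as f:
+ #     saved = pickle.load(f)
+ # and then use saved[mod_name]["Model_object"], saved[mod_name]["feature_set"], etc.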
+ st.success(mod_name + ' model saved! Proceed to the next page to tune the model')
821
+ urm = st.session_state['used_response_metrics']
822
+ urm.append(sel_target_col)
823
+ st.session_state['used_response_metrics'] = list(set(urm))
824
+ mod_name = ""
825
+ # Sprint4 - add the formatted name of the target col to used resp metrics
826
+ value = False
pages/4_Saved_Model_Results.py CHANGED
@@ -7,16 +7,14 @@ import statsmodels.api as sm
7
  from sklearn.metrics import mean_absolute_percentage_error
8
  import sys
9
  import os
10
- from utilities import (set_header,
11
- load_local_css,
12
- load_authenticator)
13
  import seaborn as sns
14
  import matplotlib.pyplot as plt
15
  import sweetviz as sv
16
  import tempfile
17
  from sklearn.preprocessing import MinMaxScaler
18
  from st_aggrid import AgGrid
19
- from st_aggrid import GridOptionsBuilder,GridUpdateMode
20
  from st_aggrid import GridOptionsBuilder
21
  import sys
22
  import re
@@ -24,390 +22,586 @@ import re
24
  sys.setrecursionlimit(10**6)
25
 
26
  original_stdout = sys.stdout
27
- sys.stdout = open('temp_stdout.txt', 'w')
28
  sys.stdout.close()
29
  sys.stdout = original_stdout
30
 
31
- st.set_page_config(layout='wide')
32
- load_local_css('styles.css')
33
  set_header()
34
 
35
  for k, v in st.session_state.items():
36
- if k not in ['logout', 'login','config'] and not k.startswith('FormSubmitter'):
37
  st.session_state[k] = v
38
 
39
- authenticator = st.session_state.get('authenticator')
40
  if authenticator is None:
41
  authenticator = load_authenticator()
42
 
43
- name, authentication_status, username = authenticator.login('Login', 'main')
44
- auth_status = st.session_state.get('authentication_status')
45
 
46
  if auth_status == True:
47
- is_state_initiaized = st.session_state.get('initialized',False)
48
  if not is_state_initiaized:
49
- a=1
50
-
51
 
52
  def plot_residual_predicted(actual, predicted, df_):
53
- df_['Residuals'] = actual - pd.Series(predicted)
54
- df_['StdResidual'] = (df_['Residuals'] - df_['Residuals'].mean()) / df_['Residuals'].std()
55
-
56
- # Create a Plotly scatter plot
57
- fig = px.scatter(df_, x=predicted, y='StdResidual', opacity=0.5,color_discrete_sequence=["#11B6BD"])
58
-
59
- # Add horizontal lines
60
- fig.add_hline(y=0, line_dash="dash", line_color="darkorange")
61
- fig.add_hline(y=2, line_color="red")
62
- fig.add_hline(y=-2, line_color="red")
63
-
64
- fig.update_xaxes(title='Predicted')
65
- fig.update_yaxes(title='Standardized Residuals (Actual - Predicted)')
66
-
67
- # Set the same width and height for both figures
68
- fig.update_layout(title='Residuals over Predicted Values', autosize=False, width=600, height=400)
69
-
70
- return fig
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
  def residual_distribution(actual, predicted):
73
- Residuals = actual - pd.Series(predicted)
74
-
75
- # Create a Seaborn distribution plot
76
- sns.set(style="whitegrid")
77
- plt.figure(figsize=(6, 4))
78
- sns.histplot(Residuals, kde=True, color="#11B6BD")
79
-
80
- plt.title(' Distribution of Residuals')
81
- plt.xlabel('Residuals')
82
- plt.ylabel('Probability Density')
83
-
84
- return plt
85
-
86
-
87
  def qqplot(actual, predicted):
88
- Residuals = actual - pd.Series(predicted)
89
- Residuals = pd.Series(Residuals)
90
- Resud_std = (Residuals - Residuals.mean()) / Residuals.std()
91
-
92
- # Create a QQ plot using Plotly with custom colors
93
- fig = go.Figure()
94
- fig.add_trace(go.Scatter(x=sm.ProbPlot(Resud_std).theoretical_quantiles,
95
- y=sm.ProbPlot(Resud_std).sample_quantiles,
96
- mode='markers',
97
- marker=dict(size=5, color="#11B6BD"),
98
- name='QQ Plot'))
99
-
100
- # Add the 45-degree reference line
101
- diagonal_line = go.Scatter(
102
- x=[-2, 2], # Adjust the x values as needed to fit the range of your data
103
- y=[-2, 2], # Adjust the y values accordingly
104
- mode='lines',
105
- line=dict(color='red'), # Customize the line color and style
106
- name=' '
107
  )
108
- fig.add_trace(diagonal_line)
109
-
110
- # Customize the layout
111
- fig.update_layout(title='QQ Plot of Residuals',title_x=0.5, autosize=False, width=600, height=400,
112
- xaxis_title='Theoretical Quantiles', yaxis_title='Sample Quantiles')
113
-
114
- return fig
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
 
116
 
117
  def plot_actual_vs_predicted(date, y, predicted_values, model):
118
 
119
  fig = go.Figure()
120
 
121
- fig.add_trace(go.Scatter(x=date, y=y, mode='lines', name='Actual', line=dict(color='blue')))
122
- fig.add_trace(go.Scatter(x=date, y=predicted_values, mode='lines', name='Predicted', line=dict(color='orange')))
123
-
 
 
 
 
 
 
 
 
 
 
 
 
124
  # Calculate MAPE
125
- mape = mean_absolute_percentage_error(y, predicted_values)*100
126
-
127
  # Calculate R-squared
128
  rss = np.sum((y - predicted_values) ** 2)
129
  tss = np.sum((y - np.mean(y)) ** 2)
130
  r_squared = 1 - (rss / tss)
131
-
132
  # Get the number of predictors
133
  num_predictors = model.df_model
134
-
135
  # Get the number of samples
136
  num_samples = len(y)
137
-
138
  # Calculate Adjusted R-squared
139
- adj_r_squared = 1 - ((1 - r_squared) * ((num_samples - 1) / (num_samples - num_predictors - 1)))
140
- metrics_table = pd.DataFrame({
141
- 'Metric': ['MAPE', 'R-squared', 'AdjR-squared'],
142
- 'Value': [mape, r_squared, adj_r_squared]})
 
 
 
 
 
143
  fig.update_layout(
144
- xaxis=dict(title='Date'),
145
- yaxis=dict(title='Value'),
146
- title=f'MAPE : {mape:.2f}%, AdjR2: {adj_r_squared:.2f}',
147
- xaxis_tickangle=-30
148
  )
149
 
150
- return metrics_table,fig
 
151
  def contributions(X, model):
152
  X1 = X.copy()
153
  for j, col in enumerate(X1.columns):
154
  X1[col] = X1[col] * model.params.values[j]
155
 
156
- return np.round((X1.sum() / sum(X1.sum()) * 100).sort_values(ascending=False), 2)
 
 
157
 
158
- transformed_data=pd.read_csv('transformed_data.csv')
159
 
160
  # hard coded for now, need to get features set from model
161
 
162
- feature_set_dct={'app_installs_-_appsflyer':['paid_search_clicks',
163
- 'fb:_level_achieved_-_tier_1_impressions_lag2',
164
- 'fb:_level_achieved_-_tier_2_clicks_lag2',
165
- 'paid_social_others_impressions_adst.1',
166
- 'ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag2',
167
- 'digital_tactic_others_clicks',
168
- 'kwai_clicks_adst.3',
169
- 'programmaticclicks',
170
- 'indicacao_clicks_adst.1',
171
- 'infleux_clicks_adst.4',
172
- 'influencer_clicks'],
173
-
174
- 'account_requests_-_appsflyer':['paid_search_impressions',
175
- 'fb:_level_achieved_-_tier_1_clicks_adst.1',
176
- 'fb:_level_achieved_-_tier_2_clicks_adst.1',
177
- 'paid_social_others_clicks_lag2',
178
- 'ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag5_adst.1',
179
- 'digital_tactic_others_clicks_adst.1',
180
- 'kwai_clicks_adst.2',
181
- 'programmaticimpressions_lag4_adst.1',
182
- 'indicacao_clicks',
183
- 'infleux_clicks_adst.2',
184
- 'influencer_clicks'],
185
-
186
- 'total_approved_accounts_-_appsflyer':['paid_search_clicks',
187
- 'fb:_level_achieved_-_tier_1_impressions_lag2_adst.1',
188
- 'fb:_level_achieved_-_tier_2_impressions_lag2',
189
- 'paid_social_others_clicks_lag2_adst.2',
190
- 'ga_app:_will_and_cid_pequena_baixo_risco_impressions_lag4',
191
- 'digital_tactic_others_clicks',
192
- 'kwai_impressions_adst.2',
193
- 'programmaticclicks_adst.5',
194
- 'indicacao_clicks_adst.1',
195
- 'infleux_clicks_adst.3',
196
- 'influencer_clicks'],
197
-
198
- 'total_approved_accounts_-_revenue':['paid_search_impressions_adst.5',
199
- 'kwai_impressions_lag2_adst.3',
200
- 'indicacao_clicks_adst.3',
201
- 'infleux_clicks_adst.3',
202
- 'programmaticclicks_adst.4',
203
- 'influencer_clicks_adst.3',
204
- 'fb:_level_achieved_-_tier_1_impressions_adst.2',
205
- 'fb:_level_achieved_-_tier_2_impressions_lag3_adst.5',
206
- 'paid_social_others_impressions_adst.3',
207
- 'ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag3_adst.5',
208
- 'digital_tactic_others_clicks_adst.2']
209
-
210
- }
211
-
212
- #""" the above part should be modified so that we are fetching features set from the saved model"""
213
-
214
-
215
-
216
- def contributions(X, model,target):
 
 
 
217
  X1 = X.copy()
218
  for j, col in enumerate(X1.columns):
219
  X1[col] = X1[col] * model.params.values[j]
220
-
221
- contributions= np.round((X1.sum() / sum(X1.sum()) * 100).sort_values(ascending=False), 2)
222
- contributions=pd.DataFrame(contributions,columns=target).reset_index().rename(columns={'index':'Channel'})
223
- contributions['Channel']=[ re.split(r'_imp|_cli', col)[0] for col in contributions['Channel']]
224
-
 
 
 
 
 
 
 
 
225
  return contributions
226
-
227
 
228
- def model_fit(features_set,target):
229
  X = transformed_data[features_set]
230
- y= transformed_data[target]
231
  ss = MinMaxScaler()
232
  X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
233
  X = sm.add_constant(X)
234
- X_train=X.iloc[:150]
235
- X_test=X.iloc[150:]
236
- y_train=y.iloc[:150]
237
- y_test=y.iloc[150:]
238
  model = sm.OLS(y_train, X_train).fit()
239
  predicted_values_train = model.predict(X_train)
240
  r2 = model.rsquared
241
  adjr2 = model.rsquared_adj
242
  train_mape = mean_absolute_percentage_error(y_train, predicted_values_train)
243
- test_mape=mean_absolute_percentage_error(y_test, model.predict(X_test))
244
- summary=model.summary()
245
- train_contributions=contributions(X_train,model,[target])
246
- return pd.DataFrame({'Model':target,'R2':np.round(r2,2),'ADJr2':np.round(adjr2,2),'Train Mape':np.round(train_mape,2),
247
- 'Test Mape':np.round(test_mape,2),'Summary':summary,'Model_object':model
248
- },index=[0]), train_contributions
 
 
 
 
 
 
 
 
 
 
 
 
249
 
250
- metrics_table=pd.DataFrame()
251
 
252
- if 'contribution_df' not in st.session_state:
253
- st.session_state["contribution_df"]=pd.DataFrame()
254
 
255
- for target,feature_set in feature_set_dct.items():
256
- metrics_table= pd.concat([metrics_table,model_fit(features_set=feature_set,target=target)[0]])
257
- if st.session_state["contribution_df"].empty:
258
- st.session_state["contribution_df"]= model_fit(features_set=feature_set,target=target)[1]
259
- else:
260
- st.session_state["contribution_df"]=pd.merge(st.session_state["contribution_df"],model_fit(features_set=feature_set,target=target)[1])
 
 
 
 
 
 
 
261
 
262
  # st.write(st.session_state["contribution_df"])
263
-
264
-
265
- metrics_table.reset_index(drop=True,inplace=True)
266
-
267
 
 
268
 
269
-
270
-
271
-
272
-
273
-
274
- eda_columns=st.columns(2)
275
  with eda_columns[1]:
276
- eda=st.button('Generate EDA Report',help="Click to generate a bivariate report for the selected response metric from the table below.")
277
-
278
-
 
279
 
280
  # st.markdown('Model Metrics')
281
-
282
- st.title('Contribution Overview')
283
 
284
- contribution_selections=st.multiselect('Select the models to compare contributions',[col for col in st.session_state['contribution_df'].columns if col.lower() != 'channel' ],default=[col for col in st.session_state['contribution_df'].columns if col.lower() != 'channel' ][-1])
285
- trace_data=[]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
286
 
287
  for selection in contribution_selections:
288
 
289
- trace=go.Bar(x=st.session_state['contribution_df']['Channel'], y=st.session_state['contribution_df'][selection],name=selection,text=np.round(st.session_state['contribution_df'][selection],0).astype(int).astype(str)+'%',textposition='outside')
 
 
 
 
 
 
 
 
 
290
  trace_data.append(trace)
291
 
292
  layout = go.Layout(
293
- title='Metrics Contribution by Channel',
294
- xaxis=dict(title='Channel Name'),
295
- yaxis=dict(title='Metrics Contribution'),
296
- barmode='group'
297
- )
298
  fig = go.Figure(data=trace_data, layout=layout)
299
- st.plotly_chart(fig,use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
300
 
301
- st.title('Analysis of Models Result')
302
- #st.markdown()
303
- gd_table=metrics_table.iloc[:,:-2]
304
- gd=GridOptionsBuilder.from_dataframe(gd_table)
305
- #gd.configure_pagination(enabled=True)
306
- gd.configure_selection(use_checkbox=True)
307
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
308
 
309
- gridoptions=gd.build()
310
- table = AgGrid(gd_table,gridOptions=gridoptions,fit_columns_on_grid_load=True,height=200)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
  # table=metrics_table.iloc[:,:-2]
312
  # table.insert(0, "Select", False)
313
  # selection_table=st.data_editor(table,column_config={"Select": st.column_config.CheckboxColumn(required=True)})
314
-
315
-
316
 
317
- if len(table.selected_rows)==0:
318
- st.warning("Click on the checkbox to view comprehensive results of the selected model.")
 
 
319
  st.stop()
320
- else:
321
- target_column=table.selected_rows[0]['Model']
322
- feature_set=feature_set_dct[target_column]
323
 
324
  with eda_columns[1]:
325
  if eda:
 
326
  def generate_report_with_target(channel_data, target_feature):
327
- report = sv.analyze([channel_data, "Dataset"], target_feat=target_feature,verbose=False)
 
 
328
  temp_dir = tempfile.mkdtemp()
329
  report_path = os.path.join(temp_dir, "report.html")
330
- report.show_html(filepath=report_path, open_browser=False) # Generate the report as an HTML file
 
 
331
  return report_path
332
-
333
- report_data=transformed_data[feature_set]
334
- report_data[target_column]=transformed_data[target_column]
335
  report_file = generate_report_with_target(report_data, target_column)
336
-
337
  if os.path.exists(report_file):
338
- with open(report_file, 'rb') as f:
339
  st.download_button(
340
  label="Download EDA Report",
341
  data=f.read(),
342
  file_name="report.html",
343
- mime="text/html"
344
  )
345
  else:
346
  st.warning("Report generation failed. Unable to find the report file.")
347
 
348
-
349
-
350
- model=metrics_table[metrics_table['Model']==target_column]['Model_object'].iloc[0]
351
- st.header('Model Summary')
352
  st.write(model.summary())
353
- X=transformed_data[feature_set]
354
- ss=MinMaxScaler()
355
- X=pd.DataFrame(ss.fit_transform(X),columns=X.columns)
356
- X=sm.add_constant(X)
357
- y=transformed_data[target_column]
358
- X_train=X.iloc[:150]
359
- X_test=X.iloc[150:]
360
- y_train=y.iloc[:150]
361
- y_test=y.iloc[150:]
362
- X.index=transformed_data['date']
363
- y.index=transformed_data['date']
364
-
365
- metrics_table_train,fig_train= plot_actual_vs_predicted(X_train.index, y_train, model.predict(X_train), model)
366
- metrics_table_test,fig_test= plot_actual_vs_predicted(X_test.index, y_test, model.predict(X_test), model)
367
-
368
- metrics_table_train=metrics_table_train.set_index('Metric').transpose()
369
- metrics_table_train.index=['Train']
370
- metrics_table_test=metrics_table_test.set_index('Metric').transpose()
371
- metrics_table_test.index=['test']
372
- metrics_table=np.round(pd.concat([metrics_table_train,metrics_table_test]),2)
373
-
374
- st.markdown('Result Overview')
375
- st.dataframe(np.round(metrics_table,2),use_container_width=True)
376
-
377
- st.subheader('Actual vs Predicted Plot Train')
378
-
379
- st.plotly_chart(fig_train,use_container_width=True)
380
- st.subheader('Actual vs Predicted Plot Test')
381
- st.plotly_chart(fig_test,use_container_width=True)
382
-
383
- st.markdown('## Residual Analysis')
384
- columns=st.columns(2)
385
-
386
-
387
- Xtrain1=X_train.copy()
 
 
 
388
  with columns[0]:
389
- fig=plot_residual_predicted(y_train,model.predict(Xtrain1),Xtrain1)
390
  st.plotly_chart(fig)
391
 
392
  with columns[1]:
393
  st.empty()
394
- fig = qqplot(y_train,model.predict(X_train))
395
  st.plotly_chart(fig)
396
 
397
  with columns[0]:
398
- fig=residual_distribution(y_train,model.predict(X_train))
399
  st.pyplot(fig)
400
 
401
 
402
-
403
  elif auth_status == False:
404
- st.error('Username/Password is incorrect')
405
  try:
406
- username_forgot_pw, email_forgot_password, random_password = authenticator.forgot_password('Forgot password')
 
 
407
  if username_forgot_pw:
408
- st.success('New password sent securely')
409
  # Random password to be transferred to the user securely
410
  elif username_forgot_pw == False:
411
- st.error('Username not found')
412
  except Exception as e:
413
  st.error(e)
 
7
  from sklearn.metrics import mean_absolute_percentage_error
8
  import sys
9
  import os
10
+ from utilities import set_header, load_local_css, load_authenticator
 
 
11
  import seaborn as sns
12
  import matplotlib.pyplot as plt
13
  import sweetviz as sv
14
  import tempfile
15
  from sklearn.preprocessing import MinMaxScaler
16
  from st_aggrid import AgGrid
17
+ from st_aggrid import GridOptionsBuilder, GridUpdateMode
18
  from st_aggrid import GridOptionsBuilder
19
  import sys
20
  import re
 
22
  sys.setrecursionlimit(10**6)
23
 
24
  original_stdout = sys.stdout
25
+ sys.stdout = open("temp_stdout.txt", "w")
26
  sys.stdout.close()
27
  sys.stdout = original_stdout
28
 
29
+ st.set_page_config(layout="wide")
30
+ load_local_css("styles.css")
31
  set_header()
32
 
33
  for k, v in st.session_state.items():
34
+ if k not in ["logout", "login", "config"] and not k.startswith("FormSubmitter"):
35
  st.session_state[k] = v
36
 
37
+ authenticator = st.session_state.get("authenticator")
38
  if authenticator is None:
39
  authenticator = load_authenticator()
40
 
41
+ name, authentication_status, username = authenticator.login("Login", "main")
42
+ auth_status = st.session_state.get("authentication_status")
43
 
44
  if auth_status == True:
45
+ is_state_initiaized = st.session_state.get("initialized", False)
46
  if not is_state_initiaized:
47
+ a = 1
 
48
 
49
  def plot_residual_predicted(actual, predicted, df_):
50
+ df_["Residuals"] = actual - pd.Series(predicted)
51
+ df_["StdResidual"] = (df_["Residuals"] - df_["Residuals"].mean()) / df_[
52
+ "Residuals"
53
+ ].std()
54
+
55
+ # Create a Plotly scatter plot
56
+ fig = px.scatter(
57
+ df_,
58
+ x=predicted,
59
+ y="StdResidual",
60
+ opacity=0.5,
61
+ color_discrete_sequence=["#11B6BD"],
62
+ )
63
+
64
+ # Add horizontal lines
65
+ fig.add_hline(y=0, line_dash="dash", line_color="darkorange")
66
+ fig.add_hline(y=2, line_color="red")
67
+ fig.add_hline(y=-2, line_color="red")
68
+
69
+ fig.update_xaxes(title="Predicted")
70
+ fig.update_yaxes(title="Standardized Residuals (Actual - Predicted)")
71
+
72
+ # Set the same width and height for both figures
73
+ fig.update_layout(
74
+ title="Residuals over Predicted Values",
75
+ autosize=False,
76
+ width=600,
77
+ height=400,
78
+ )
79
+
80
+ return fig
81
 
82
  def residual_distribution(actual, predicted):
83
+ Residuals = actual - pd.Series(predicted)
84
+
85
+ # Create a Seaborn distribution plot
86
+ sns.set(style="whitegrid")
87
+ plt.figure(figsize=(6, 4))
88
+ sns.histplot(Residuals, kde=True, color="#11B6BD")
89
+
90
+ plt.title(" Distribution of Residuals")
91
+ plt.xlabel("Residuals")
92
+ plt.ylabel("Probability Density")
93
+
94
+ return plt
95
+
 
96
  def qqplot(actual, predicted):
97
+ Residuals = actual - pd.Series(predicted)
98
+ Residuals = pd.Series(Residuals)
99
+ Resud_std = (Residuals - Residuals.mean()) / Residuals.std()
100
+
101
+ # Create a QQ plot using Plotly with custom colors
102
+ fig = go.Figure()
103
+ fig.add_trace(
104
+ go.Scatter(
105
+ x=sm.ProbPlot(Resud_std).theoretical_quantiles,
106
+ y=sm.ProbPlot(Resud_std).sample_quantiles,
107
+ mode="markers",
108
+ marker=dict(size=5, color="#11B6BD"),
109
+ name="QQ Plot",
 
 
 
 
 
 
110
  )
111
+ )
112
+
113
+ # Add the 45-degree reference line
114
+ diagonal_line = go.Scatter(
115
+ x=[-2, 2], # Adjust the x values as needed to fit the range of your data
116
+ y=[-2, 2], # Adjust the y values accordingly
117
+ mode="lines",
118
+ line=dict(color="red"), # Customize the line color and style
119
+ name=" ",
120
+ )
121
+ fig.add_trace(diagonal_line)
122
+
123
+ # Customize the layout
124
+ fig.update_layout(
125
+ title="QQ Plot of Residuals",
126
+ title_x=0.5,
127
+ autosize=False,
128
+ width=600,
129
+ height=400,
130
+ xaxis_title="Theoretical Quantiles",
131
+ yaxis_title="Sample Quantiles",
132
+ )
133
 
134
+ return fig
135
 
136
  def plot_actual_vs_predicted(date, y, predicted_values, model):
137
 
138
  fig = go.Figure()
139
 
140
+ fig.add_trace(
141
+ go.Scatter(
142
+ x=date, y=y, mode="lines", name="Actual", line=dict(color="blue")
143
+ )
144
+ )
145
+ fig.add_trace(
146
+ go.Scatter(
147
+ x=date,
148
+ y=predicted_values,
149
+ mode="lines",
150
+ name="Predicted",
151
+ line=dict(color="orange"),
152
+ )
153
+ )
154
+
155
  # Calculate MAPE
156
+ mape = mean_absolute_percentage_error(y, predicted_values) * 100
157
+
158
  # Calculate R-squared
159
  rss = np.sum((y - predicted_values) ** 2)
160
  tss = np.sum((y - np.mean(y)) ** 2)
161
  r_squared = 1 - (rss / tss)
162
+
163
  # Get the number of predictors
164
  num_predictors = model.df_model
165
+
166
  # Get the number of samples
167
  num_samples = len(y)
168
+
169
  # Calculate Adjusted R-squared
170
+ adj_r_squared = 1 - (
171
+ (1 - r_squared) * ((num_samples - 1) / (num_samples - num_predictors - 1))
172
+ )
173
+ metrics_table = pd.DataFrame(
174
+ {
175
+ "Metric": ["MAPE", "R-squared", "AdjR-squared"],
176
+ "Value": [mape, r_squared, adj_r_squared],
177
+ }
178
+ )
179
  fig.update_layout(
180
+ xaxis=dict(title="Date"),
181
+ yaxis=dict(title="Value"),
182
+ title=f"MAPE : {mape:.2f}%, AdjR2: {adj_r_squared:.2f}",
183
+ xaxis_tickangle=-30,
184
  )
185
 
186
+ return metrics_table, fig
187
+
188
  def contributions(X, model):
189
  X1 = X.copy()
190
  for j, col in enumerate(X1.columns):
191
  X1[col] = X1[col] * model.params.values[j]
192
 
193
+ return np.round(
194
+ (X1.sum() / sum(X1.sum()) * 100).sort_values(ascending=False), 2
195
+ )
196
 
197
+ transformed_data = pd.read_csv("transformed_data.csv")
198
 
199
  # hard coded for now, need to get features set from model
200
 
201
+ feature_set_dct = {
202
+ "app_installs_-_appsflyer": [
203
+ "paid_search_clicks",
204
+ "fb:_level_achieved_-_tier_1_impressions_lag2",
205
+ "fb:_level_achieved_-_tier_2_clicks_lag2",
206
+ "paid_social_others_impressions_adst.1",
207
+ "ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag2",
208
+ "digital_tactic_others_clicks",
209
+ "kwai_clicks_adst.3",
210
+ "programmaticclicks",
211
+ "indicacao_clicks_adst.1",
212
+ "infleux_clicks_adst.4",
213
+ "influencer_clicks",
214
+ ],
215
+ "account_requests_-_appsflyer": [
216
+ "paid_search_impressions",
217
+ "fb:_level_achieved_-_tier_1_clicks_adst.1",
218
+ "fb:_level_achieved_-_tier_2_clicks_adst.1",
219
+ "paid_social_others_clicks_lag2",
220
+ "ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag5_adst.1",
221
+ "digital_tactic_others_clicks_adst.1",
222
+ "kwai_clicks_adst.2",
223
+ "programmaticimpressions_lag4_adst.1",
224
+ "indicacao_clicks",
225
+ "infleux_clicks_adst.2",
226
+ "influencer_clicks",
227
+ ],
228
+ "total_approved_accounts_-_appsflyer": [
229
+ "paid_search_clicks",
230
+ "fb:_level_achieved_-_tier_1_impressions_lag2_adst.1",
231
+ "fb:_level_achieved_-_tier_2_impressions_lag2",
232
+ "paid_social_others_clicks_lag2_adst.2",
233
+ "ga_app:_will_and_cid_pequena_baixo_risco_impressions_lag4",
234
+ "digital_tactic_others_clicks",
235
+ "kwai_impressions_adst.2",
236
+ "programmaticclicks_adst.5",
237
+ "indicacao_clicks_adst.1",
238
+ "infleux_clicks_adst.3",
239
+ "influencer_clicks",
240
+ ],
241
+ "total_approved_accounts_-_revenue": [
242
+ "paid_search_impressions_adst.5",
243
+ "kwai_impressions_lag2_adst.3",
244
+ "indicacao_clicks_adst.3",
245
+ "infleux_clicks_adst.3",
246
+ "programmaticclicks_adst.4",
247
+ "influencer_clicks_adst.3",
248
+ "fb:_level_achieved_-_tier_1_impressions_adst.2",
249
+ "fb:_level_achieved_-_tier_2_impressions_lag3_adst.5",
250
+ "paid_social_others_impressions_adst.3",
251
+ "ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag3_adst.5",
252
+ "digital_tactic_others_clicks_adst.2",
253
+ ],
254
+ }
255
+
256
+ # """ the above part should be modified so that we are fetching features set from the saved model"""
257
+
258
+ def contributions(X, model, target):
259
  X1 = X.copy()
260
  for j, col in enumerate(X1.columns):
261
  X1[col] = X1[col] * model.params.values[j]
262
+
263
+ contributions = np.round(
264
+ (X1.sum() / sum(X1.sum()) * 100).sort_values(ascending=False), 2
265
+ )
266
+ contributions = (
267
+ pd.DataFrame(contributions, columns=target)
268
+ .reset_index()
269
+ .rename(columns={"index": "Channel"})
270
+ )
271
+ contributions["Channel"] = [
272
+ re.split(r"_imp|_cli", col)[0] for col in contributions["Channel"]
273
+ ]
274
+
275
  return contributions
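+ # For reference: each channel's value above is 100 * sum(beta_j * x_j) divided by the same
+ # quantity summed over all channels, i.e. its share of the total modelled (scaled) response.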
 
276
 
277
+ def model_fit(features_set, target):
278
  X = transformed_data[features_set]
279
+ y = transformed_data[target]
280
  ss = MinMaxScaler()
281
  X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
282
  X = sm.add_constant(X)
283
+ X_train = X.iloc[:150]
284
+ X_test = X.iloc[150:]
285
+ y_train = y.iloc[:150]
286
+ y_test = y.iloc[150:]
287
  model = sm.OLS(y_train, X_train).fit()
288
  predicted_values_train = model.predict(X_train)
289
  r2 = model.rsquared
290
  adjr2 = model.rsquared_adj
291
  train_mape = mean_absolute_percentage_error(y_train, predicted_values_train)
292
+ test_mape = mean_absolute_percentage_error(y_test, model.predict(X_test))
293
+ summary = model.summary()
294
+ train_contributions = contributions(X_train, model, [target])
295
+ return (
296
+ pd.DataFrame(
297
+ {
298
+ "Model": target,
299
+ "R2": np.round(r2, 2),
300
+ "ADJr2": np.round(adjr2, 2),
301
+ "Train Mape": np.round(train_mape, 2),
302
+ "Test Mape": np.round(test_mape, 2),
303
+ "Summary": summary,
304
+ "Model_object": model,
305
+ },
306
+ index=[0],
307
+ ),
308
+ train_contributions,
309
+ )
310
 
311
+ metrics_table = pd.DataFrame()
312
 
313
+ if "contribution_df" not in st.session_state:
314
+ st.session_state["contribution_df"] = pd.DataFrame()
315
 
316
+ for target, feature_set in feature_set_dct.items():
317
+ metrics_table = pd.concat(
318
+ [metrics_table, model_fit(features_set=feature_set, target=target)[0]]
319
+ )
320
+ if st.session_state["contribution_df"].empty:
321
+ st.session_state["contribution_df"] = model_fit(
322
+ features_set=feature_set, target=target
323
+ )[1]
324
+ else:
325
+ st.session_state["contribution_df"] = pd.merge(
326
+ st.session_state["contribution_df"],
327
+ model_fit(features_set=feature_set, target=target)[1],
328
+ )
329
 
330
  # st.write(st.session_state["contribution_df"])
 
 
 
 
331
 
332
+ metrics_table.reset_index(drop=True, inplace=True)
333
 
334
+ eda_columns = st.columns(2)
 
 
 
 
 
335
  with eda_columns[1]:
336
+ eda = st.button(
337
+ "Generate EDA Report",
338
+ help="Click to generate a bivariate report for the selected response metric from the table below.",
339
+ )
340
 
341
  # st.markdown('Model Metrics')
 
 
342
 
343
+ st.title("Contribution Overview")
344
+
345
+ contribution_selections = st.multiselect(
346
+ "Select the models to compare contributions",
347
+ [
348
+ col
349
+ for col in st.session_state["contribution_df"].columns
350
+ if col.lower() != "channel"
351
+ ],
352
+ default=[
353
+ col
354
+ for col in st.session_state["contribution_df"].columns
355
+ if col.lower() != "channel"
356
+ ][-1],
357
+ )
358
+ trace_data = []
359
 
360
  for selection in contribution_selections:
361
 
362
+ trace = go.Bar(
363
+ x=st.session_state["contribution_df"]["Channel"],
364
+ y=st.session_state["contribution_df"][selection],
365
+ name=selection,
366
+ text=np.round(st.session_state["contribution_df"][selection], 0)
367
+ .astype(int)
368
+ .astype(str)
369
+ + "%",
370
+ textposition="outside",
371
+ )
372
  trace_data.append(trace)
373
 
374
  layout = go.Layout(
375
+ title="Metrics Contribution by Channel",
376
+ xaxis=dict(title="Channel Name"),
377
+ yaxis=dict(title="Metrics Contribution"),
378
+ barmode="group",
379
+ )
380
  fig = go.Figure(data=trace_data, layout=layout)
381
+ st.plotly_chart(fig, use_container_width=True)
382
+
383
+ ############################################ Waterfall Chart ############################################
384
+ # import plotly.graph_objects as go
385
+
386
+ # # Initialize a Plotly figure
387
+ # fig = go.Figure()
388
+
389
+ # for selection in contribution_selections:
390
+ # # Ensure y_values are numeric
391
+ # y_values = st.session_state["contribution_df"][selection].values.astype(float)
392
+
393
+ # # Generating text labels for each bar, ensuring operations are compatible with string formats
394
+ # text_values = [f"{val}%" for val in np.round(y_values, 0).astype(int)]
395
+
396
+ # fig.add_trace(
397
+ # go.Waterfall(
398
+ # name=selection,
399
+ # orientation="v",
400
+ # measure=["relative"]
401
+ # * len(y_values), # Adjust if you have absolute values at certain points
402
+ # x=st.session_state["contribution_df"]["Channel"].tolist(),
403
+ # text=text_values,
404
+ # textposition="outside",
405
+ # y=y_values,
406
+ # increasing={"marker": {"color": "green"}},
407
+ # decreasing={"marker": {"color": "red"}},
408
+ # totals={"marker": {"color": "blue"}},
409
+ # )
410
+ # )
411
+
412
+ # fig.update_layout(
413
+ # title="Metrics Contribution by Channel",
414
+ # xaxis={"title": "Channel Name"},
415
+ # yaxis={"title": "Metrics Contribution"},
416
+ # height=600,
417
+ # )
418
+
419
+ # # Displaying the waterfall chart in Streamlit
420
+ # st.plotly_chart(fig, use_container_width=True)
421
+
422
+ import plotly.graph_objects as go
423
+
424
+ # Initialize a Plotly figure
425
+ fig = go.Figure()
426
 
427
+ for selection in contribution_selections:
428
+ # Ensure contributions are numeric
429
+ contributions = (
430
+ st.session_state["contribution_df"][selection].values.astype(float).tolist()
431
+ )
432
+ channel_names = st.session_state["contribution_df"]["Channel"].tolist()
433
 
434
+ display_name, display_contribution, base_contribution = [], [], 0
435
+ for channel_name, contribution in zip(channel_names, contributions):
436
+ if channel_name != "const":
437
+ display_name.append(channel_name)
438
+ display_contribution.append(contribution)
439
+ else:
440
+ base_contribution = contribution
441
+
442
+ display_name = ["Base Sales"] + display_name
443
+ display_contribution = [base_contribution] + display_contribution
444
+
445
+ # Generating text labels for each bar, ensuring operations are compatible with string formats
446
+ text_values = [
447
+ f"{val}%" for val in np.round(display_contribution, 0).astype(int)
448
+ ]
449
+
450
+ fig.add_trace(
451
+ go.Waterfall(
452
+ orientation="v",
453
+ measure=["relative"]
454
+ * len(
455
+ display_contribution
456
+ ), # Adjust if you have absolute values at certain points
457
+ x=display_name,
458
+ text=text_values,
459
+ textposition="outside",
460
+ y=display_contribution,
461
+ increasing={"marker": {"color": "green"}},
462
+ decreasing={"marker": {"color": "red"}},
463
+ totals={"marker": {"color": "blue"}},
464
+ )
465
+ )
466
 
467
+ fig.update_layout(
468
+ title="Metrics Contribution by Channel",
469
+ xaxis={"title": "Channel Name"},
470
+ yaxis={"title": "Metrics Contribution"},
471
+ height=600,
472
+ )
473
+
474
+ # Displaying the waterfall chart in Streamlit
475
+ st.plotly_chart(fig, use_container_width=True)
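+ # Note on the waterfall above: every measure is "relative", so Plotly stacks the
+ # bars cumulatively from "Base Sales" (the const term) onwards -- e.g. with a
+ # hypothetical Base Sales of 40% and two channels at 35% and 25%, the running
+ # total climbs 40 -> 75 -> 100.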
476
+
477
+ ############################################ Waterfall Chart ############################################
478
+
479
+ st.title("Analysis of Models Result")
480
+ # st.markdown()
481
+ gd_table = metrics_table.iloc[:, :-2]
482
+
483
+ gd = GridOptionsBuilder.from_dataframe(gd_table)
484
+ # gd.configure_pagination(enabled=True)
485
+ gd.configure_selection(
486
+ use_checkbox=True,
487
+ selection_mode="single",
488
+ pre_select_all_rows=False,
489
+ pre_selected_rows=[1],
490
+ )
491
+
492
+ gridoptions = gd.build()
493
+ table = AgGrid(
494
+ gd_table, gridOptions=gridoptions, fit_columns_on_grid_load=True, height=200
495
+ )
496
  # table=metrics_table.iloc[:,:-2]
497
  # table.insert(0, "Select", False)
498
  # selection_table=st.data_editor(table,column_config={"Select": st.column_config.CheckboxColumn(required=True)})
 
 
499
 
500
+ if len(table.selected_rows) == 0:
501
+ st.warning(
502
+ "Click on the checkbox to view comprehensive results of the selected model."
503
+ )
504
  st.stop()
505
+ else:
506
+ target_column = table.selected_rows[0]["Model"]
507
+ feature_set = feature_set_dct[target_column]
508
 
509
  with eda_columns[1]:
510
  if eda:
511
+
512
  def generate_report_with_target(channel_data, target_feature):
513
+ report = sv.analyze(
514
+ [channel_data, "Dataset"], target_feat=target_feature, verbose=False
515
+ )
516
  temp_dir = tempfile.mkdtemp()
517
  report_path = os.path.join(temp_dir, "report.html")
518
+ report.show_html(
519
+ filepath=report_path, open_browser=False
520
+ ) # Generate the report as an HTML file
521
  return report_path
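+ # Sweetviz note: because target_feat is passed, the generated report pairs each
+ # feature in the selected feature set against the chosen response metric, which
+ # is what the "Generate EDA Report" button describes as a bivariate report.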
522
+
523
+ report_data = transformed_data[feature_set]
524
+ report_data[target_column] = transformed_data[target_column]
525
  report_file = generate_report_with_target(report_data, target_column)
526
+
527
  if os.path.exists(report_file):
528
+ with open(report_file, "rb") as f:
529
  st.download_button(
530
  label="Download EDA Report",
531
  data=f.read(),
532
  file_name="report.html",
533
+ mime="text/html",
534
  )
535
  else:
536
  st.warning("Report generation failed. Unable to find the report file.")
537
 
538
+ model = metrics_table[metrics_table["Model"] == target_column]["Model_object"].iloc[
539
+ 0
540
+ ]
541
+ st.header("Model Summary")
542
  st.write(model.summary())
543
+ X = transformed_data[feature_set]
544
+ ss = MinMaxScaler()
545
+ X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
546
+ X = sm.add_constant(X)
547
+ y = transformed_data[target_column]
548
+ X_train = X.iloc[:150]
549
+ X_test = X.iloc[150:]
550
+ y_train = y.iloc[:150]
551
+ y_test = y.iloc[150:]
552
+ X.index = transformed_data["date"]
553
+ y.index = transformed_data["date"]
554
+
555
+ metrics_table_train, fig_train = plot_actual_vs_predicted(
556
+ X_train.index, y_train, model.predict(X_train), model
557
+ )
558
+ metrics_table_test, fig_test = plot_actual_vs_predicted(
559
+ X_test.index, y_test, model.predict(X_test), model
560
+ )
561
+
562
+ metrics_table_train = metrics_table_train.set_index("Metric").transpose()
563
+ metrics_table_train.index = ["Train"]
564
+ metrics_table_test = metrics_table_test.set_index("Metric").transpose()
565
+ metrics_table_test.index = ["Test"]
566
+ metrics_table = np.round(pd.concat([metrics_table_train, metrics_table_test]), 2)
567
+
568
+ st.markdown("Result Overview")
569
+ st.dataframe(np.round(metrics_table, 2), use_container_width=True)
570
+
571
+ st.subheader("Actual vs Predicted Plot Train")
572
+
573
+ st.plotly_chart(fig_train, use_container_width=True)
574
+ st.subheader("Actual vs Predicted Plot Test")
575
+ st.plotly_chart(fig_test, use_container_width=True)
576
+
577
+ st.markdown("## Residual Analysis")
578
+ columns = st.columns(2)
579
+
580
+ Xtrain1 = X_train.copy()
581
  with columns[0]:
582
+ fig = plot_residual_predicted(y_train, model.predict(Xtrain1), Xtrain1)
583
  st.plotly_chart(fig)
584
 
585
  with columns[1]:
586
  st.empty()
587
+ fig = qqplot(y_train, model.predict(X_train))
588
  st.plotly_chart(fig)
589
 
590
  with columns[0]:
591
+ fig = residual_distribution(y_train, model.predict(X_train))
592
  st.pyplot(fig)
593
 
594
 
 
595
  elif auth_status == False:
596
+ st.error("Username/Password is incorrect")
597
  try:
598
+ username_forgot_pw, email_forgot_password, random_password = (
599
+ authenticator.forgot_password("Forgot password")
600
+ )
601
  if username_forgot_pw:
602
+ st.success("New password sent securely")
603
  # Random password to be transferred to the user securely
604
  elif username_forgot_pw == False:
605
+ st.error("Username not found")
606
  except Exception as e:
607
  st.error(e)
pages/5_Model_Tuning_with_panel.py ADDED
@@ -0,0 +1,527 @@
1
+ '''
2
+ MMO Build Sprint 3
3
+ date :
4
+ changes : capability to tune MixedLM as well as simple LR in the same page
5
+ '''
6
+
7
+ import streamlit as st
8
+ import pandas as pd
9
+ from Eda_functions import format_numbers
10
+ import pickle
11
+ from utilities import set_header, load_local_css
12
+ import statsmodels.api as sm
13
+ import re
14
+ from sklearn.preprocessing import MinMaxScaler
15
+ import matplotlib.pyplot as plt
16
+ from statsmodels.stats.outliers_influence import variance_inflation_factor
17
+
18
+ st.set_option('deprecation.showPyplotGlobalUse', False)
19
+ import statsmodels.formula.api as smf
20
+ from Data_prep_functions import *
21
+
22
+ # for i in ["model_tuned", "X_train_tuned", "X_test_tuned", "tuned_model_features", "tuned_model", "tuned_model_dict"] :
23
+
24
+ st.set_page_config(
25
+ page_title="Model Tuning",
26
+ page_icon=":shark:",
27
+ layout="wide",
28
+ initial_sidebar_state='collapsed'
29
+ )
30
+ load_local_css('styles.css')
31
+ set_header()
32
+
33
+ # Sprint3
34
+ # is_panel = st.session_state['is_panel']
35
+ # panel_col = 'markets' # set the panel column
36
+ date_col = 'date'
37
+
38
+ panel_col = [col.lower().replace('.','_').replace('@','_').replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_") for col in st.session_state['bin_dict']['Panel Level 1'] ] [0]# set the panel column
39
+ is_panel = True if len(panel_col)>0 else False
40
+
41
+
42
+ # flag indicating that no tuned model exists yet
43
+
44
+ # Sprint4 - model tuned dict
45
+ if 'Model_Tuned' not in st.session_state:
46
+ st.session_state['Model_Tuned'] = {}
47
+
48
+ st.title('1. Model Tuning')
49
+ # st.write(st.session_state['base_model_feature_set'])
50
+
51
+ if "X_train" not in st.session_state:
52
+ st.error(
53
+ "Oops! It seems there are no saved models available. Please build and save a model from the previous page to proceed.")
54
+ st.stop()
55
+ # X_train=st.session_state['X_train']
56
+ # X_test=st.session_state['X_test']
57
+ # y_train=st.session_state['y_train']
58
+ # y_test=st.session_state['y_test']
59
+ # df=st.session_state['media_data']
60
+
61
+
62
+ # st.write(X_train.columns)
63
+ # st.write(X_test.columns)
64
+ if "is_tuned_model" not in st.session_state:
65
+ st.session_state["is_tuned_model"] = {}
66
+ # Sprint4 - if used_response_metrics is not blank, then select one of the used_response_metrics, else target is revenue by default
67
+ if "used_response_metrics" in st.session_state and st.session_state['used_response_metrics'] != []:
68
+ sel_target_col = st.selectbox("Select the response metric", st.session_state['used_response_metrics'])
69
+ target_col = sel_target_col.lower().replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_")
70
+
71
+ else:
72
+ sel_target_col = 'Total Approved Accounts - Revenue'
73
+ target_col = 'total_approved_accounts_revenue'
74
+
75
+ # Sprint4 - Look through all saved models, only show saved models of the sel resp metric (target_col)
76
+ saved_models = st.session_state['saved_model_names']
77
+ required_saved_models = [m.split("__")[0] for m in saved_models if m.split("__")[1] == target_col]
78
+ sel_model = st.selectbox("Select the model to tune", required_saved_models)
79
+
80
+ with open("best_models.pkl", 'rb') as file:
81
+ model_dict = pickle.load(file)
82
+
83
+ sel_model_dict = model_dict[sel_model + "__" + target_col] # Sprint4 - get the model obj of the selected model
84
+ # st.write(sel_model_dict)
85
+
86
+ X_train = sel_model_dict['X_train']
87
+ X_test = sel_model_dict['X_test']
88
+ y_train = sel_model_dict['y_train']
89
+ y_test = sel_model_dict['y_test']
90
+ df = st.session_state['media_data']
91
+
92
+ if 'selected_model' not in st.session_state:
93
+ st.session_state['selected_model'] = 0
94
+
95
+ # st.write(model_dict[st.session_state["selected_model"]]['X_train'].columns)
96
+
97
+ st.markdown('### 1.1 Event Flags')
98
+ st.markdown('Helps in quantifying the impact of specific occurrences of events')
99
+ with st.expander('Apply Event Flags'):
100
+ # st.session_state["selected_model"]=st.selectbox('Select Model to apply flags',model_dict.keys())
101
+ model = sel_model_dict['Model_object']
102
+ date = st.session_state['date']
103
+ date = pd.to_datetime(date)
104
+ X_train = sel_model_dict['X_train']
105
+
106
+ # features_set= model_dict[st.session_state["selected_model"]]['feature_set']
107
+ features_set = sel_model_dict["feature_set"]
108
+
109
+ col = st.columns(3)
110
+ min_date = min(date)
111
+ max_date = max(date)
112
+ with col[0]:
113
+ start_date = st.date_input('Select Start Date', min_date, min_value=min_date, max_value=max_date)
114
+ with col[1]:
115
+ end_date = st.date_input('Select End Date', max_date, min_value=min_date, max_value=max_date)
116
+ with col[2]:
117
+ repeat = st.selectbox('Repeat Annually', ['Yes', 'No'], index=1)
118
+ if repeat == 'Yes':
119
+ repeat = True
120
+ else:
121
+ repeat = False
122
+
123
+ if 'Flags' not in st.session_state:
124
+ st.session_state['Flags'] = {}
125
+ # print("**"*50)
126
+ # print(y_train)
127
+ # print("**"*50)
128
+ # print(model.fittedvalues)
129
+ if is_panel: # Sprint3
130
+ met, line_values, fig_flag = plot_actual_vs_predicted(X_train[date_col], y_train,
131
+ model.fittedvalues, model,
132
+ target_column=sel_target_col,
133
+ flag=(start_date, end_date),
134
+ repeat_all_years=repeat, is_panel=True)
135
+ st.plotly_chart(fig_flag, use_container_width=True)
136
+
137
+ # create flag on test
138
+ met, test_line_values, fig_flag = plot_actual_vs_predicted(X_test[date_col], y_test,
139
+ sel_model_dict['pred_test'], model,
140
+ target_column=sel_target_col,
141
+ flag=(start_date, end_date),
142
+ repeat_all_years=repeat, is_panel=True)
143
+
144
+ else:
145
+ pred_train=model.predict(X_train[features_set])
146
+ met, line_values, fig_flag = plot_actual_vs_predicted(X_train[date_col], y_train, pred_train, model,
147
+ flag=(start_date, end_date), repeat_all_years=repeat,is_panel=False)
148
+ st.plotly_chart(fig_flag, use_container_width=True)
149
+
150
+ pred_test=model.predict(X_test[features_set])
151
+ met, test_line_values, fig_flag = plot_actual_vs_predicted(X_test[date_col], y_test, pred_test, model,
152
+ flag=(start_date, end_date), repeat_all_years=repeat,is_panel=False)
153
+ flag_name = 'f1_flag'
154
+ flag_name = st.text_input('Enter Flag Name')
155
+ # Sprint4 - add selected target col to flag name
156
+ if st.button('Update flag'):
157
+ st.session_state['Flags'][flag_name + '__'+ target_col] = {}
158
+ st.session_state['Flags'][flag_name + '__'+ target_col]['train'] = line_values
159
+ st.session_state['Flags'][flag_name + '__'+ target_col]['test'] = test_line_values
160
+ # st.write(st.session_state['Flags'][flag_name])
161
+ st.success(f'{flag_name + "__" + target_col} stored')
162
+
163
+ # Sprint4 - only show flag created for the particular target col
164
+ st.write(st.session_state['Flags'].keys() )
165
+ target_model_flags = [f.split("__")[0] for f in st.session_state['Flags'].keys() if f.split("__")[1] == target_col]
166
+ options = list(target_model_flags)
167
+ selected_options = []
168
+ num_columns = 4
169
+ num_rows = -(-len(options) // num_columns)
170
+
171
+ tick = False
172
+ if st.checkbox('Select all'):
173
+ tick = True
174
+ selected_options = []
175
+ for row in range(num_rows):
176
+ cols = st.columns(num_columns)
177
+ for col in cols:
178
+ if options:
179
+ option = options.pop(0)
180
+ selected = col.checkbox(option, value=tick)
181
+ if selected:
182
+ selected_options.append(option)
183
+
184
+ st.markdown('### 1.2 Select Parameters to Apply')
185
+ parameters = st.columns(3)
186
+ with parameters[0]:
187
+ Trend = st.checkbox("**Trend**")
188
+ st.markdown('Helps account for long-term trends or seasonality that could influence advertising effectiveness')
189
+ with parameters[1]:
190
+ week_number = st.checkbox('**Week_number**')
191
+ st.markdown('Assists in detecting and incorporating weekly patterns or seasonality')
192
+ with parameters[2]:
193
+ sine_cosine = st.checkbox('**Sine and Cosine Waves**')
194
+ st.markdown('Helps in capturing cyclical patterns or seasonality in the data')
195
+ #
196
+ # def get_tuned_model():
197
+ # st.session_state['build_tuned_model']=True
198
+
199
+ if st.button('Build model with Selected Parameters and Flags', key='build_tuned_model'):
200
+ new_features = features_set
201
+ st.header('2.1 Results Summary')
202
+ # date=list(df.index)
203
+ # df = df.reset_index(drop=True)
204
+ # st.write(df.head(2))
205
+ # X_train=df[features_set]
206
+ ss = MinMaxScaler()
207
+ if is_panel == True:
208
+ X_train_tuned = X_train[features_set]
209
+ # X_train_tuned = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
210
+ X_train_tuned[target_col] = X_train[target_col]
211
+ X_train_tuned[date_col] = X_train[date_col]
212
+ X_train_tuned[panel_col] = X_train[panel_col]
213
+
214
+ X_test_tuned = X_test[features_set]
215
+ # X_test_tuned = pd.DataFrame(ss.transform(X), columns=X.columns)
216
+ X_test_tuned[target_col] = X_test[target_col]
217
+ X_test_tuned[date_col] = X_test[date_col]
218
+ X_test_tuned[panel_col] = X_test[panel_col]
219
+
220
+ else:
221
+ X_train_tuned = X_train[features_set]
222
+ # X_train_tuned = pd.DataFrame(ss.fit_transform(X_train_tuned), columns=X_train_tuned.columns)
223
+
224
+ X_test_tuned = X_test[features_set]
225
+ # X_test_tuned = pd.DataFrame(ss.transform(X_test_tuned), columns=X_test_tuned.columns)
226
+
227
+ for flag in selected_options:
228
+ # Sprint4 - added target_col to the flag name
229
+ X_train_tuned[flag] = st.session_state['Flags'][flag + "__" + target_col]['train']
230
+ X_test_tuned[flag] = st.session_state['Flags'][flag + "__" + target_col]['test']
231
+
232
+ # test
233
+ # X_train_tuned.to_csv("Test/X_train_tuned_flag.csv",index=False)
234
+ # X_test_tuned.to_csv("Test/X_test_tuned_flag.csv",index=False)
235
+
236
+ # print("()()"*20,flag, len(st.session_state['Flags'][flag]))
237
+ if Trend:
238
+ # Sprint3 - group by panel and calculate the trend of each panel separately. Add trend to the new feature set
239
+ if is_panel:
240
+ newdata = pd.DataFrame()
241
+ panel_wise_end_point_train = {}
242
+ for panel, groupdf in X_train_tuned.groupby(panel_col):
243
+ groupdf.sort_values(date_col, inplace=True)
244
+ groupdf['Trend'] = np.arange(1, len(groupdf) + 1, 1)
245
+ newdata = pd.concat([newdata, groupdf])
246
+ panel_wise_end_point_train[panel] = len(groupdf)
247
+ X_train_tuned = newdata.copy()
248
+
249
+ test_newdata = pd.DataFrame()
250
+ for panel, test_groupdf in X_test_tuned.groupby(panel_col):
251
+ test_groupdf.sort_values(date_col, inplace=True)
252
+ start = panel_wise_end_point_train[panel] + 1
253
+ end = start + len(test_groupdf) # should be + 1? - Sprint4
254
+ # print("??"*20, panel, len(test_groupdf), len(np.arange(start, end, 1)), start)
255
+ test_groupdf['Trend'] = np.arange(start, end, 1)
256
+ test_newdata = pd.concat([test_newdata, test_groupdf])
257
+ X_test_tuned = test_newdata.copy()
258
+
259
+ new_features = new_features + ['Trend']
260
+
261
+ else:
262
+ X_train_tuned['Trend'] = np.arange(1, len(X_train_tuned) + 1, 1)
263
+ X_test_tuned['Trend'] = np.arange(len(X_train_tuned) + 1, len(X_train_tuned) + len(X_test_tuned) + 1, 1)
264
+ new_features = new_features + ['Trend']
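+ # Trend sketch (hypothetical 3-row slice): a training slice with 3 rows gets
+ # Trend = [1, 2, 3] and its test rows continue from 4 onwards (per panel in the
+ # grouped case), so the linear time index runs unbroken across the train/test split.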
265
+
266
+
267
+ if week_number:
268
+ # Sprint3 - create weeknumber from date column in xtrain tuned. add week num to new feature set
269
+ if is_panel:
270
+ X_train_tuned[date_col] = pd.to_datetime(X_train_tuned[date_col])
271
+ X_train_tuned['Week_number'] = X_train_tuned[date_col].dt.day_of_week
272
+ if X_train_tuned['Week_number'].nunique() == 1:
273
+ st.write("All dates in the data are of the same week day. Hence Week number can't be used.")
274
+ else:
275
+ X_test_tuned[date_col] = pd.to_datetime(X_test_tuned[date_col])
276
+ X_test_tuned['Week_number'] = X_test_tuned[date_col].dt.day_of_week
277
+ new_features = new_features + ['Week_number']
278
+
279
+ else:
280
+ date = pd.to_datetime(date.values)
281
+ X_train_tuned['Week_number'] = pd.to_datetime(X_train[date_col]).dt.day_of_week
282
+ X_test_tuned['Week_number'] = pd.to_datetime(X_test[date_col]).dt.day_of_week
283
+ new_features = new_features + ['Week_number']
284
+
285
+ if sine_cosine:
286
+ # Sprint3 - create panel wise sine cosine waves in xtrain tuned. add to new feature set
287
+ if is_panel:
288
+ new_features = new_features + ['sine_wave', 'cosine_wave']
289
+ newdata = pd.DataFrame()
290
+ newdata_test = pd.DataFrame()
291
+ groups = X_train_tuned.groupby(panel_col)
292
+ frequency = 2 * np.pi / 365 # Adjust the frequency as needed
293
+
294
+ train_panel_wise_end_point = {}
295
+ for panel, groupdf in groups:
296
+ num_samples = len(groupdf)
297
+ train_panel_wise_end_point[panel] = num_samples
298
+ days_since_start = np.arange(num_samples)
299
+ sine_wave = np.sin(frequency * days_since_start)
300
+ cosine_wave = np.cos(frequency * days_since_start)
301
+ sine_cosine_df = pd.DataFrame({'sine_wave': sine_wave, 'cosine_wave': cosine_wave})
302
+ assert len(sine_cosine_df) == len(groupdf)
303
+ # groupdf = pd.concat([groupdf, sine_cosine_df], axis=1)
304
+ groupdf['sine_wave'] = sine_wave
305
+ groupdf['cosine_wave'] = cosine_wave
306
+ newdata = pd.concat([newdata, groupdf])
307
+
308
+ X_train_tuned = newdata.copy()
309
+
310
+ test_groups = X_test_tuned.groupby(panel_col)
311
+ for panel, test_groupdf in test_groups:
312
+ num_samples = len(test_groupdf)
313
+ start = train_panel_wise_end_point[panel]
314
+ days_since_start = np.arange(start, start + num_samples, 1)
315
+ # print("##", panel, num_samples, start, len(np.arange(start, start+num_samples, 1)))
316
+ sine_wave = np.sin(frequency * days_since_start)
317
+ cosine_wave = np.cos(frequency * days_since_start)
318
+ sine_cosine_df = pd.DataFrame({'sine_wave': sine_wave, 'cosine_wave': cosine_wave})
319
+ assert len(sine_cosine_df) == len(test_groupdf)
320
+ # groupdf = pd.concat([groupdf, sine_cosine_df], axis=1)
321
+ test_groupdf['sine_wave'] = sine_wave
322
+ test_groupdf['cosine_wave'] = cosine_wave
323
+ newdata_test = pd.concat([newdata_test, test_groupdf])
324
+
325
+ X_test_tuned = newdata_test.copy()
326
+
327
+
328
+ else:
329
+ new_features = new_features + ['sine_wave', 'cosine_wave']
330
+
331
+ num_samples = len(X_train_tuned)
332
+ frequency = 2 * np.pi / 365 # Adjust the frequency as needed
333
+ days_since_start = np.arange(num_samples)
334
+ sine_wave = np.sin(frequency * days_since_start)
335
+ cosine_wave = np.cos(frequency * days_since_start)
336
+ sine_cosine_df = pd.DataFrame({'sine_wave': sine_wave, 'cosine_wave': cosine_wave})
337
+ # Concatenate the sine and cosine waves with the scaled X DataFrame
338
+ X_train_tuned = pd.concat([X_train_tuned, sine_cosine_df], axis=1)
339
+
340
+ test_num_samples = len(X_test_tuned)
341
+ start = num_samples
342
+ days_since_start = np.arange(start, start + test_num_samples, 1)
343
+ sine_wave = np.sin(frequency * days_since_start)
344
+ cosine_wave = np.cos(frequency * days_since_start)
345
+ sine_cosine_df = pd.DataFrame({'sine_wave': sine_wave, 'cosine_wave': cosine_wave})
346
+ # Concatenate the sine and cosine waves with the scaled X DataFrame
347
+ X_test_tuned = pd.concat([X_test_tuned, sine_cosine_df], axis=1)
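+ # Seasonality sketch: frequency = 2*pi/365 encodes one full cycle per 365 rows, so
+ # days_since_start = 182 gives sine ~ 0 and cosine ~ -1 (roughly half-way through
+ # the cycle); for weekly data the frequency would need adjusting, as noted above.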
348
+
349
+ # model
350
+ if selected_options:
351
+ new_features = new_features + selected_options
352
+ if is_panel:
353
+ inp_vars_str = " + ".join(new_features)
354
+ new_features=list(set(new_features))
355
+ # X_train_tuned.to_csv("Test/X_train_tuned.csv",index=False)
356
+ # st.write(X_train_tuned[['total_approved_accounts_revenue'] + new_features].dtypes)
357
+ # st.write(X_train_tuned[['total_approved_accounts_revenue', panel_col] + new_features].isna().sum())
358
+ md_str = target_col + " ~ " + inp_vars_str
359
+ md_tuned = smf.mixedlm(md_str,
360
+ data=X_train_tuned[[target_col] + new_features],
361
+ groups=X_train_tuned[panel_col])
362
+ model_tuned = md_tuned.fit()
363
+
364
+ # plot act v pred for original model and tuned model
365
+ metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(X_train[date_col], y_train,
366
+ model.fittedvalues, model,
367
+ target_column=sel_target_col,
368
+ is_panel=True)
369
+ metrics_table_tuned, line, actual_vs_predicted_plot_tuned = plot_actual_vs_predicted(X_train_tuned[date_col],
370
+ X_train_tuned[target_col],
371
+ model_tuned.fittedvalues,
372
+ model_tuned,
373
+ target_column=sel_target_col,
374
+ is_panel=True)
375
+
376
+ else:
377
+ new_features=list(set(new_features))
378
+ # st.write(new_features)
379
+ model_tuned = sm.OLS(y_train, X_train_tuned[new_features]).fit()
380
+ # st.write(X_train_tuned.columns)
381
+ metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(date[:130], y_train,
382
+ model.predict(X_train[features_set]), model,
383
+ target_column=sel_target_col)
384
+ metrics_table_tuned, line, actual_vs_predicted_plot_tuned = plot_actual_vs_predicted(date[:130], y_train,
385
+ model_tuned.predict(
386
+ X_train_tuned),
387
+ model_tuned,
388
+ target_column=sel_target_col)
389
+
390
+ # st.write(metrics_table_tuned)
391
+ mape = np.round(metrics_table.iloc[0, 1], 2)
392
+ r2 = np.round(metrics_table.iloc[1, 1], 2)
393
+ adjr2 = np.round(metrics_table.iloc[2, 1], 2)
394
+
395
+ mape_tuned = np.round(metrics_table_tuned.iloc[0, 1], 2)
396
+ r2_tuned = np.round(metrics_table_tuned.iloc[1, 1], 2)
397
+ adjr2_tuned = np.round(metrics_table_tuned.iloc[2, 1], 2)
398
+
399
+ parameters_ = st.columns(3)
400
+ with parameters_[0]:
401
+ st.metric('R2', r2_tuned, np.round(r2_tuned - r2, 2))
402
+ with parameters_[1]:
403
+ st.metric('Adjusted R2', adjr2_tuned, np.round(adjr2_tuned - adjr2, 2))
404
+ with parameters_[2]:
405
+ st.metric('MAPE', mape_tuned, np.round(mape_tuned - mape, 2), 'inverse')
406
+ st.write(model_tuned.summary())
407
+
408
+ X_train_tuned[date_col] = X_train[date_col]
409
+ X_test_tuned[date_col] = X_test[date_col]
410
+ X_train_tuned[target_col] = y_train
411
+ X_test_tuned[target_col] = y_test
412
+
413
+ st.header('2.2 Actual vs. Predicted Plot')
414
+ # if is_panel:
415
+ # metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(date, y_train, model.predict(X_train),
416
+ # model, target_column='Revenue',is_panel=True)
417
+ # else:
418
+ # metrics_table,line,actual_vs_predicted_plot=plot_actual_vs_predicted(date, y_train, model.predict(X_train), model,target_column='Revenue')
419
+ if is_panel :
420
+ metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(X_train_tuned[date_col],
421
+ X_train_tuned[target_col],
422
+ model_tuned.fittedvalues, model_tuned,
423
+ target_column=sel_target_col,
424
+ is_panel=True)
425
+ else :
426
+ metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(X_train_tuned[date_col],
427
+ X_train_tuned[target_col],
428
+ model_tuned.predict(X_train_tuned[new_features]),
429
+ model_tuned,
430
+ target_column=sel_target_col,
431
+ is_panel=False)
432
+ # plot_actual_vs_predicted(X_train[date_col], y_train,
433
+ # model.fittedvalues, model,
434
+ # target_column='Revenue',
435
+ # is_panel=is_panel)
436
+
437
+ st.plotly_chart(actual_vs_predicted_plot, use_container_width=True)
438
+
439
+ st.markdown('## 2.3 Residual Analysis')
440
+ if is_panel :
441
+ columns = st.columns(2)
442
+ with columns[0]:
443
+ fig = plot_residual_predicted(y_train, model_tuned.fittedvalues, X_train_tuned)
444
+ st.plotly_chart(fig)
445
+
446
+ with columns[1]:
447
+ st.empty()
448
+ fig = qqplot(y_train, model_tuned.fittedvalues)
449
+ st.plotly_chart(fig)
450
+
451
+ with columns[0]:
452
+ fig = residual_distribution(y_train, model_tuned.fittedvalues)
453
+ st.pyplot(fig)
454
+ else:
455
+ columns = st.columns(2)
456
+ with columns[0]:
457
+ fig = plot_residual_predicted(y_train, model_tuned.predict(X_train_tuned[new_features]), X_train)
458
+ st.plotly_chart(fig)
459
+
460
+ with columns[1]:
461
+ st.empty()
462
+ fig = qqplot(y_train, model_tuned.predict(X_train_tuned[new_features]))
463
+ st.plotly_chart(fig)
464
+
465
+ with columns[0]:
466
+ fig = residual_distribution(y_train, model_tuned.predict(X_train_tuned[new_features]))
467
+ st.pyplot(fig)
468
+
469
+ st.session_state['is_tuned_model'][target_col] = True
470
+ # Sprint4 - saved tuned model in a dict
471
+ st.session_state['Model_Tuned'][sel_model + "__" + target_col] = {
472
+ "Model_object": model_tuned,
473
+ 'feature_set': new_features,
474
+ 'X_train_tuned': X_train_tuned,
475
+ 'X_test_tuned': X_test_tuned
476
+ }
477
+
478
+ # Pending
479
+ # if st.session_state['build_tuned_model']==True:
480
+ if st.session_state['Model_Tuned'] is not None :
481
+ if st.checkbox('Use this model to build response curves', key='save_model'):
482
+ # save_model = st.button('Use this model to build response curves', key='saved_tuned_model')
483
+ # if save_model:
484
+ st.session_state["is_tuned_model"][target_col]=True
485
+ with open("tuned_model.pkl", "wb") as f:
486
+ # pickle.dump(st.session_state['tuned_model'], f)
487
+ pickle.dump(st.session_state['Model_Tuned'], f) # Sprint4
488
+
489
+ # X_test_tuned.to_csv("Test/X_test_tuned_final.csv", index=False)
490
+ # X_train_tuned.to_csv("Test/X_train_tuned.csv", index=False)
491
+ st.success(sel_model + "__" + target_col + ' tuned model saved!')
492
+
493
+
494
+ # if is_panel:
495
+ # # st.session_state["tuned_model_features"] = new_features
496
+ # with open("tuned_model.pkl", "wb") as f:
497
+ # # pickle.dump(st.session_state['tuned_model'], f)
498
+ # pickle.dump(st.session_state['Model_Tuned'], f) # Sprint4
499
+ # st.success(sel_model + "__" + target_col + ' Tuned saved!')
500
+
501
+ # raw_data=df[features_set]
502
+ # columns_raw=[re.split(r"(_lag|_adst)",col)[0] for col in raw_data.columns]
503
+ # raw_data.columns=columns_raw
504
+ # columns_media=[col for col in columns_raw if Categorised_data[col]['BB']=='Media']
505
+ # raw_data=raw_data[columns_media]
506
+
507
+ # raw_data['Date']=list(df.index)
508
+
509
+ # spends_var=[col for col in df.columns if "spends" in col.lower() and 'adst' not in col.lower() and 'lag' not in col.lower()]
510
+ # spends_df=df[spends_var]
511
+ # spends_df['Week']=list(df.index)
512
+
513
+
514
+ # j=0
515
+ # X1=X.copy()
516
+ # col=X1.columns
517
+ # for i in model.params.values:
518
+ # X1[col[j]]=X1.iloc[:,j]*i
519
+ # j+=1
520
+ # contribution_df=X1
521
+ # contribution_df['Date']=list(df.index)
522
+ # excel_file='Overview_data.xlsx'
523
+
524
+ # with pd.ExcelWriter(excel_file,engine='xlsxwriter') as writer:
525
+ # raw_data.to_excel(writer,sheet_name='RAW DATA MMM',index=False)
526
+ # spends_df.to_excel(writer,sheet_name='SPEND INPUT',index=False)
527
+ # contribution_df.to_excel(writer,sheet_name='CONTRIBUTION MMM')
pages/6_Model_Result_Overview.py ADDED
@@ -0,0 +1,348 @@
1
+ '''
2
+ MMO Build Sprint 3
3
+ additions : contributions calculated using tuned Mixed LM model
4
+ pending : contribution calculations using - 1. untuned Mixed LM model, 2. tuned OLS model, 3. untuned OLS model
5
+
6
+ MMO Build Sprint 4
7
+ additions : response metrics selection
8
+ pending : contribution calculations using - 1. untuned Mixed LM model, 2. tuned OLS model, 3. untuned OLS model
9
+ '''
10
+
11
+ import streamlit as st
12
+ import pandas as pd
13
+ from sklearn.preprocessing import MinMaxScaler
14
+ import pickle
15
+
16
+
17
+
18
+ from utilities_with_panel import (set_header,
19
+ overview_test_data_prep_panel,
20
+ overview_test_data_prep_nonpanel,
21
+ initialize_data,
22
+ load_local_css,
23
+ create_channel_summary,
24
+ create_contribution_pie,
25
+ create_contribuion_stacked_plot,
26
+ create_channel_spends_sales_plot,
27
+ format_numbers,
28
+ channel_name_formating)
29
+
30
+ import plotly.graph_objects as go
31
+ import streamlit_authenticator as stauth
32
+ import yaml
33
+ from yaml import SafeLoader
34
+ import time
35
+
36
+ st.set_page_config(layout='wide')
37
+ load_local_css('styles.css')
38
+ set_header()
39
+
40
+
41
+ def get_random_effects(media_data, panel_col, mdf):
42
+ random_eff_df = pd.DataFrame(columns=[panel_col, "random_effect"])
43
+
44
+ for i, market in enumerate(media_data[panel_col].unique()):
45
+ print(i, end='\r')
46
+ intercept = mdf.random_effects[market].values[0]
47
+ random_eff_df.loc[i, 'random_effect'] = intercept
48
+ random_eff_df.loc[i, panel_col] = market
49
+
50
+ return random_eff_df
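+ # Sketch of what this relies on: for a fitted MixedLM, mdf.random_effects is a dict
+ # keyed by group (here each market/panel) whose values hold that group's estimated
+ # random intercept, so the loop simply flattens it into one row per panel.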
51
+
52
+
53
+ def process_train_and_test(train, test, features, panel_col, target_col):
54
+ X1 = train[features]
55
+
56
+ ss = MinMaxScaler()
57
+ X1 = pd.DataFrame(ss.fit_transform(X1), columns=X1.columns)
58
+
59
+ X1[panel_col] = train[panel_col]
60
+ X1[target_col] = train[target_col]
61
+
62
+ if test is not None:
63
+ X2 = test[features]
64
+ X2 = pd.DataFrame(ss.transform(X2), columns=X2.columns)
65
+ X2[panel_col] = test[panel_col]
66
+ X2[target_col] = test[target_col]
67
+ return X1, X2
68
+ return X1
69
+
70
+ def mdf_predict(X_df, mdf, random_eff_df) :
71
+ X=X_df.copy()
72
+ X=pd.merge(X, random_eff_df[[panel_col,'random_effect']], on=panel_col, how='left')
73
+ X['pred_fixed_effect'] = mdf.predict(X)
74
+
75
+ X['pred'] = X['pred_fixed_effect'] + X['random_effect']
76
+ X.to_csv('Test/merged_df_contri.csv',index=False)
77
+ X.drop(columns=['pred_fixed_effect', 'random_effect'], inplace=True)
78
+
79
+ return X
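+ # Why the random effect is added back: MixedLMResults.predict() returns only the
+ # fixed-effects part of the prediction, so the per-panel intercept from
+ # random_eff_df is merged in and added to obtain the full prediction in 'pred'.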
80
+
81
+
82
+ target='Revenue'
83
+
84
+ # is_panel=False
85
+ # is_panel = st.session_state['is_panel']
86
+ panel_col = [col.lower().replace('.','_').replace('@','_').replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_") for col in st.session_state['bin_dict']['Panel Level 1'] ] [0]# set the panel column
87
+ date_col = 'date'
88
+
89
+ #st.write(media_data)
90
+
91
+ is_panel = True if len(panel_col)>0 else False
92
+
93
+ # panel_col='markets'
94
+ date_col = 'date'
95
+
96
+ # Sprint4 - if used_response_metrics is not blank, then select one of the used_response_metrics, else target is revenue by default
97
+ if "used_response_metrics" in st.session_state and st.session_state['used_response_metrics']!=[]:
98
+ sel_target_col = st.selectbox("Select the response metric", st.session_state['used_response_metrics'])
99
+ target_col = sel_target_col.lower().replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_")
100
+ else :
101
+ sel_target_col = 'Total Approved Accounts - Revenue'
102
+ target_col = 'total_approved_accounts_revenue'
103
+
104
+ # Sprint4 - Look through all saved tuned models, only show saved models of the sel resp metric (target_col)
105
+ # saved_models = st.session_state['saved_model_names']
106
+ # Sprint4 - get the model obj of the selected model
107
+ # st.write(sel_model_dict)
108
+
109
+ # Sprint3 - Contribution
110
+ if is_panel:
111
+ # read tuned mixedLM model
112
+ # if st.session_state["tuned_model"] is not None :
113
+
114
+ if st.session_state["is_tuned_model"][target_col]==True: #Sprint4
115
+ with open("tuned_model.pkl", 'rb') as file:
116
+ model_dict = pickle.load(file)
117
+ saved_models = list(model_dict.keys())
118
+ required_saved_models = [m.split("__")[0] for m in saved_models if m.split("__")[1] == target_col]
119
+ sel_model = st.selectbox("Select the model to review", required_saved_models)
120
+ sel_model_dict = model_dict[sel_model + "__" + target_col]
121
+
122
+ # model=st.session_state["tuned_model"]
123
+ # X_train=st.session_state["X_train_tuned"]
124
+ # X_test=st.session_state["X_test_tuned"]
125
+ # best_feature_set=st.session_state["tuned_model_features"]
126
+ model=sel_model_dict["Model_object"]
127
+ X_train=sel_model_dict["X_train_tuned"]
128
+ X_test=sel_model_dict["X_test_tuned"]
129
+ best_feature_set=sel_model_dict["feature_set"]
130
+
131
+ # st.write("features", best_feature_set)
132
+ # st.write(X_test.columns)
133
+
134
+ else : # if non tuned model to be used # Pending
135
+ with open("best_models.pkl", 'rb') as file:
136
+ model_dict = pickle.load(file)
137
+ saved_models = list(model_dict.keys())
138
+ required_saved_models = [m.split("__")[0] for m in saved_models if m.split("__")[1] == target_col]
139
+ sel_model = st.selectbox("Select the model to review", required_saved_models)
140
+ sel_model_dict = model_dict[sel_model + "__" + target_col]
141
+ model=st.session_state["base_model"]
142
+ X_train = st.session_state['X_train']
143
+ X_test = st.session_state['X_test']
144
+ # y_train = st.session_state['y_train']
145
+ # y_test = st.session_state['y_test']
146
+ best_feature_set = st.session_state['base_model_feature_set']
147
+ # st.write(best_feature_set)
148
+ # st.write(X_test.columns)
149
+
150
+ # Calculate contributions
151
+
152
+ with open("data_import.pkl", "rb") as f:
153
+ data = pickle.load(f)
154
+
155
+ # Accessing the loaded objects
156
+ st.session_state['orig_media_data'] = data["final_df"]
157
+
158
+ st.session_state['orig_media_data'].columns=[col.lower().replace('.','_').replace('@','_').replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_") for col in st.session_state['orig_media_data'].columns]
159
+
160
+ media_data = st.session_state["media_data"]
161
+
162
+
163
+ # st.session_state['orig_media_data']=st.session_state["media_data"]
164
+
165
+ #st.write(media_data)
166
+
167
+ contri_df = pd.DataFrame()
168
+
169
+ y = []
170
+ y_pred = []
171
+
172
+ random_eff_df = get_random_effects(media_data, panel_col, model)
173
+ random_eff_df['fixed_effect'] = model.fe_params['Intercept']
174
+ random_eff_df['panel_effect'] = random_eff_df['random_effect'] + random_eff_df['fixed_effect']
175
+ # random_eff_df.to_csv("Test/random_eff_df_contri.csv", index=False)
176
+
177
+ coef_df = pd.DataFrame(model.fe_params)
178
+ coef_df.columns = ['coef']
179
+
180
+ # coef_df.reset_index().to_csv("Test/coef_df_contri1.csv",index=False)
181
+ # print(model.fe_params)
182
+
183
+ x_train_contribution = X_train.copy()
184
+ x_test_contribution = X_test.copy()
185
+
186
+ # preprocessing not needed since X_train is already preprocessed
187
+ # X1, X2 = process_train_and_test(x_train_contribution, x_test_contribution, best_feature_set, panel_col, target_col)
188
+ # x_train_contribution[best_feature_set] = X1[best_feature_set]
189
+ # x_test_contribution[best_feature_set] = X2[best_feature_set]
190
+
191
+ x_train_contribution = mdf_predict(x_train_contribution, model, random_eff_df)
192
+ x_test_contribution = mdf_predict(x_test_contribution, model, random_eff_df)
193
+
194
+ x_train_contribution = pd.merge(x_train_contribution, random_eff_df[[panel_col, 'panel_effect']], on=panel_col,
195
+ how='left')
196
+ x_test_contribution = pd.merge(x_test_contribution, random_eff_df[[panel_col, 'panel_effect']], on=panel_col,
197
+ how='left')
198
+
199
+ inp_coef = coef_df['coef'][1:].tolist() # 0th index is intercept
200
+
201
+ for i in range(len(inp_coef)):
202
+ x_train_contribution[str(best_feature_set[i]) + "_contr"] = inp_coef[i] * x_train_contribution[best_feature_set[i]]
203
+ x_test_contribution[str(best_feature_set[i]) + "_contr"] = inp_coef[i] * x_test_contribution[best_feature_set[i]]
204
+
205
+ x_train_contribution['sum_contributions'] = x_train_contribution.filter(regex="contr").sum(axis=1)
206
+ x_train_contribution['sum_contributions'] = x_train_contribution['sum_contributions'] + x_train_contribution['panel_effect']
207
+
208
+ x_test_contribution['sum_contributions'] = x_test_contribution.filter(regex="contr").sum(axis=1)
209
+ x_test_contribution['sum_contributions'] = x_test_contribution['sum_contributions'] + x_test_contribution['panel_effect']
210
+
211
+ # # test
212
+ x_train_contribution.to_csv("Test/x_train_contribution.csv",index=False)
213
+ x_test_contribution.to_csv("Test/x_test_contribution.csv",index=False)
214
+ #
215
+ # st.session_state['orig_media_data'].to_csv("Test/transformed_data.csv",index=False)
216
+ # st.session_state['X_test_spends'].to_csv("Test/test_spends.csv",index=False)
217
+ # # st.write(st.session_state['orig_media_data'].columns)
218
+
219
+ st.write(date_col,panel_col)
220
+ # st.write(x_test_contribution)
221
+
222
+ overview_test_data_prep_panel(x_test_contribution, st.session_state['orig_media_data'], st.session_state['X_test_spends'],
223
+ date_col, panel_col, target_col)
224
+
225
+ else : # NON PANEL
226
+ if st.session_state["is_tuned_model"][target_col]==True: #Sprint4
227
+ with open("tuned_model.pkl", 'rb') as file:
228
+ model_dict = pickle.load(file)
229
+ saved_models = list(model_dict.keys())
230
+ required_saved_models = [m.split("__")[0] for m in saved_models if m.split("__")[1] == target_col]
231
+ sel_model = st.selectbox("Select the model to review", required_saved_models)
232
+ sel_model_dict = model_dict[sel_model + "__" + target_col]
233
+
234
+ model=sel_model_dict["Model_object"]
235
+ X_train=sel_model_dict["X_train_tuned"]
236
+ X_test=sel_model_dict["X_test_tuned"]
237
+ best_feature_set=sel_model_dict["feature_set"]
238
+
239
+ else : #Sprint4
240
+ with open("best_models.pkl", 'rb') as file:
241
+ model_dict = pickle.load(file)
242
+ saved_models = list(model_dict.keys())
243
+ required_saved_models = [m.split("__")[0] for m in saved_models if m.split("__")[1] == target_col]
244
+ sel_model = st.selectbox("Select the model to review", required_saved_models)
245
+ sel_model_dict = model_dict[sel_model + "__" + target_col]
246
+
247
+ model=sel_model_dict["Model_object"]
248
+ X_train=sel_model_dict["X_train"]
249
+ X_test=sel_model_dict["X_test"]
250
+ best_feature_set=sel_model_dict["feature_set"]
251
+
252
+ x_train_contribution = X_train.copy()
253
+ x_test_contribution = X_test.copy()
254
+
255
+ x_train_contribution['pred'] = model.predict(x_train_contribution[best_feature_set])
256
+ x_test_contribution['pred'] = model.predict(x_test_contribution[best_feature_set])
257
+
258
+ for num,i in enumerate(model.params.values):
259
+ col=best_feature_set[num]
260
+ x_train_contribution[col + "_contr"] = X_train[col] * i
261
+ x_test_contribution[col + "_contr"] = X_test[col] * i
262
+
263
+ x_test_contribution.to_csv("Test/x_test_contribution_non_panel.csv",index=False)
264
+ overview_test_data_prep_nonpanel(x_test_contribution, st.session_state['orig_media_data'].copy(), st.session_state['X_test_spends'].copy(), date_col, target_col)
265
+ # for k, v in st.session_sta
266
+ # te.items():
267
+
268
+ # if k not in ['logout', 'login','config'] and not k.startswith('FormSubmitter'):
269
+ # st.session_state[k] = v
270
+
271
+ # authenticator = st.session_state.get('authenticator')
272
+
273
+ # if authenticator is None:
274
+ # authenticator = load_authenticator()
275
+
276
+ # name, authentication_status, username = authenticator.login('Login', 'main')
277
+ # auth_status = st.session_state['authentication_status']
278
+
279
+ # if auth_status:
280
+ # authenticator.logout('Logout', 'main')
281
+
282
+ # is_state_initiaized = st.session_state.get('initialized',False)
283
+ # if not is_state_initiaized:
284
+
285
+ initialize_data(target_col)
286
+ scenario = st.session_state['scenario']
287
+ raw_df = st.session_state['raw_df']
288
+ st.header('Overview of previous spends')
289
+
290
+ # st.write(scenario.actual_total_spends)
291
+ # st.write(scenario.actual_total_sales)
292
+ columns = st.columns((1,1,3))
293
+
294
+ with columns[0]:
295
+ st.metric(label='Spends', value=format_numbers(float(scenario.actual_total_spends)))
296
+ ###print(f"##################### {scenario.actual_total_sales} ##################")
297
+ with columns[1]:
298
+ st.metric(label=target, value=format_numbers(float(scenario.actual_total_sales),include_indicator=False))
299
+
300
+
301
+ actual_summary_df = create_channel_summary(scenario)
302
+ actual_summary_df['Channel'] = actual_summary_df['Channel'].apply(channel_name_formating)
303
+
304
+ columns = st.columns((2,1))
305
+ with columns[0]:
306
+ with st.expander('Channel wise overview'):
307
+ st.markdown(actual_summary_df.style.set_table_styles(
308
+ [{
309
+ 'selector': 'th',
310
+ 'props': [('background-color', '#11B6BD')]
311
+ },
312
+ {
313
+ 'selector' : 'tr:nth-child(even)',
314
+ 'props' : [('background-color', '#11B6BD')]
315
+ }]).to_html(), unsafe_allow_html=True)
316
+
317
+ st.markdown("<hr>",unsafe_allow_html=True)
318
+ ##############################
319
+
320
+ st.plotly_chart(create_contribution_pie(scenario),use_container_width=True)
321
+ st.markdown("<hr>",unsafe_allow_html=True)
322
+
323
+
324
+ ################################3
325
+ st.plotly_chart(create_contribuion_stacked_plot(scenario),use_container_width=True)
326
+ st.markdown("<hr>",unsafe_allow_html=True)
327
+ #######################################
328
+
329
+ selected_channel_name = st.selectbox('Channel', st.session_state['channels_list'] + ['non media'], format_func=channel_name_formating)
330
+ selected_channel = scenario.channels.get(selected_channel_name,None)
331
+
332
+ st.plotly_chart(create_channel_spends_sales_plot(selected_channel), use_container_width=True)
333
+
334
+ st.markdown("<hr>",unsafe_allow_html=True)
335
+
336
+ # elif auth_status == False:
337
+ # st.error('Username/Password is incorrect')
338
+
339
+ # if auth_status != True:
340
+ # try:
341
+ # username_forgot_pw, email_forgot_password, random_password = authenticator.forgot_password('Forgot password')
342
+ # if username_forgot_pw:
343
+ # st.success('New password sent securely')
344
+ # # Random password to be transferred to user securely
345
+ # elif username_forgot_pw == False:
346
+ # st.error('Username not found')
347
+ # except Exception as e:
348
+ # st.error(e)
pages/7_Build_Response_Curves.py ADDED
@@ -0,0 +1,185 @@
1
+ import streamlit as st
2
+ import plotly.express as px
3
+ import numpy as np
4
+ import plotly.graph_objects as go
5
+ from utilities_with_panel import channel_name_formating, load_authenticator, initialize_data
6
+ from sklearn.metrics import r2_score
7
+ from collections import OrderedDict
8
+ from classes import class_from_dict,class_to_dict
9
+ import pickle
10
+ import json
11
+ from utilities import (
12
+ load_local_css,
13
+ set_header,
14
+ channel_name_formating,
15
+ )
16
+
17
+ for k, v in st.session_state.items():
18
+ if k not in ['logout', 'login','config'] and not k.startswith('FormSubmitter'):
19
+ st.session_state[k] = v
20
+
21
+ def s_curve(x,K,b,a,x0):
22
+ return K / (1 + b*np.exp(-a*(x-x0)))
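+ # Informal reading of the parameters: K is the saturation ceiling, x0 shifts the
+ # inflection point, a sets the steepness and b rescales the early part of the
+ # curve; e.g. s_curve(x0, K, b, a, x0) == K / (1 + b), so b = 1 puts the curve at
+ # K/2 when spends equal x0.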
23
+
24
+ def save_scenario(scenario_name):
25
+ """
26
+ Save the current scenario with the mentioned name in the session state
27
+
28
+ Parameters
29
+ ----------
30
+ scenario_name
31
+ Name of the scenario to be saved
32
+ """
33
+ if 'saved_scenarios' not in st.session_state:
34
+ st.session_state['saved_scenarios'] = OrderedDict()
35
+
36
+ #st.session_state['saved_scenarios'][scenario_name] = st.session_state['scenario'].save()
37
+ st.session_state['saved_scenarios'][scenario_name] = class_to_dict(st.session_state['scenario'])
38
+ st.session_state['scenario_input'] = ""
39
+ print(type(st.session_state['saved_scenarios']))
40
+ with open('../saved_scenarios.pkl', 'wb') as f:
41
+ pickle.dump(st.session_state['saved_scenarios'],f)
42
+
43
+
44
+ def reset_curve_parameters():
45
+ del st.session_state['K']
46
+ del st.session_state['b']
47
+ del st.session_state['a']
48
+ del st.session_state['x0']
49
+
50
+ def update_response_curve():
51
+ # st.session_state['rcs'][selected_channel_name]['K'] = st.session_state['K']
52
+ # st.session_state['rcs'][selected_channel_name]['b'] = st.session_state['b']
53
+ # st.session_state['rcs'][selected_channel_name]['a'] = st.session_state['a']
54
+ # st.session_state['rcs'][selected_channel_name]['x0'] = st.session_state['x0']
55
+ # rcs = st.session_state['rcs']
56
+ _channel_class = st.session_state['scenario'].channels[selected_channel_name]
57
+ _channel_class.update_response_curves({
58
+ 'K' : st.session_state['K'],
59
+ 'b' : st.session_state['b'],
60
+ 'a' : st.session_state['a'],
61
+ 'x0' : st.session_state['x0']})
62
+
63
+
64
+ # authenticator = st.session_state.get('authenticator')
65
+ # if authenticator is None:
66
+ # authenticator = load_authenticator()
67
+
68
+ # name, authentication_status, username = authenticator.login('Login', 'main')
69
+ # auth_status = st.session_state.get('authentication_status')
70
+
71
+ # if auth_status == True:
72
+ # is_state_initiaized = st.session_state.get('initialized',False)
73
+ # if not is_state_initiaized:
74
+ # print("Scenario page state reloaded")
75
+
76
+ # Sprint4 - if used_response_metrics is not blank, then select one of the used_response_metrics, else target is revenue by default
77
+ st.set_page_config(layout='wide')
78
+ load_local_css('styles.css')
79
+ set_header()
80
+
81
+ if "used_response_metrics" in st.session_state and st.session_state['used_response_metrics']!=[]:
82
+ sel_target_col = st.selectbox("Select the response metric", st.session_state['used_response_metrics'])
83
+ target_col = sel_target_col.lower().replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_")
84
+ else :
85
+ sel_target_col = 'Total Approved Accounts - Revenue'
86
+ target_col = 'total_approved_accounts_revenue'
87
+
88
+ initialize_data(target_col)
89
+
90
+ st.subheader("Build response curves")
91
+
92
+ channels_list = st.session_state['channels_list']
93
+ selected_channel_name = st.selectbox('Channel', st.session_state['channels_list'] + ['Others'], format_func=channel_name_formating,on_change=reset_curve_parameters)
94
+
95
+ rcs = {}
96
+ for channel_name in channels_list:
97
+ rcs[channel_name] = st.session_state['scenario'].channels[channel_name].response_curve_params
98
+ # rcs = st.session_state['rcs']
99
+
100
+
101
+ if 'K' not in st.session_state:
102
+ st.session_state['K'] = rcs[selected_channel_name]['K']
103
+ if 'b' not in st.session_state:
104
+ st.session_state['b'] = rcs[selected_channel_name]['b']
105
+ if 'a' not in st.session_state:
106
+ st.session_state['a'] = rcs[selected_channel_name]['a']
107
+ if 'x0' not in st.session_state:
108
+ st.session_state['x0'] = rcs[selected_channel_name]['x0']
109
+
110
+ x = st.session_state['actual_input_df'][selected_channel_name].values
111
+ y = st.session_state['actual_contribution_df'][selected_channel_name].values
112
+
113
+ power = (np.ceil(np.log(x.max()) / np.log(10) )- 3)
114
+
115
+ # fig = px.scatter(x, s_curve(x/10**power,
116
+ # st.session_state['K'],
117
+ # st.session_state['b'],
118
+ # st.session_state['a'],
119
+ # st.session_state['x0']))
120
+
121
+ fig = px.scatter(x=x, y=y)
122
+ fig.add_trace(go.Scatter(x=sorted(x), y=s_curve(sorted(x)/10**power,st.session_state['K'],
123
+ st.session_state['b'],
124
+ st.session_state['a'],
125
+ st.session_state['x0']),
126
+ line=dict(color='red')))
127
+
128
+ fig.update_layout(title_text="Response Curve",showlegend=False)
129
+ fig.update_annotations(font_size=10)
130
+ fig.update_xaxes(title='Spends')
131
+ fig.update_yaxes(title=sel_target_col)
132
+
133
+ st.plotly_chart(fig,use_container_width=True)
134
+
135
+ r2 = r2_score(y, s_curve(x / 10**power,
136
+ st.session_state['K'],
137
+ st.session_state['b'],
138
+ st.session_state['a'],
139
+ st.session_state['x0']))
140
+
141
+ st.metric('R2',round(r2,2))
142
+ columns = st.columns(4)
143
+
144
+ with columns[0]:
145
+ st.number_input('K',key='K',format="%0.5f")
146
+ with columns[1]:
147
+ st.number_input('b',key='b',format="%0.5f")
148
+ with columns[2]:
149
+ st.number_input('a',key='a',step=0.0001,format="%0.5f")
150
+ with columns[3]:
151
+ st.number_input('x0',key='x0',format="%0.5f")
152
+
153
+
154
+ st.button('Update parameters',on_click=update_response_curve)
155
+ st.button('Reset parameters',on_click=reset_curve_parameters)
156
+ scenario_name = st.text_input('Scenario name', key='scenario_input',placeholder='Scenario name',label_visibility='collapsed')
157
+ st.button('Save', on_click=lambda : save_scenario(scenario_name),disabled=len(st.session_state['scenario_input']) == 0)
158
+
159
+ file_name = st.text_input('rcs download file name', key='file_name_input',placeholder='file name',label_visibility='collapsed')
160
+ st.download_button(
161
+ label="Download response curves",
162
+ data=json.dumps(rcs),
163
+ file_name=f"{file_name}.json",
164
+ mime="application/json",
165
+ disabled= len(file_name) == 0,
166
+ )
167
+
168
+
169
+ def s_curve_derivative(x, K, b, a, x0):
170
+ # Derivative of the S-curve function
171
+ return a * b * K * np.exp(-a * (x - x0)) / ((1 + b * np.exp(-a * (x - x0))) ** 2)
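+ # Quick check: at x = x0 the derivative reduces to a*b*K / (1 + b)**2, so with b = 1
+ # the slope at the inflection point is a*K/4.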
172
+
173
+ # Parameters of the S-curve
174
+ K = st.session_state['K']
175
+ b = st.session_state['b']
176
+ a = st.session_state['a']
177
+ x0 = st.session_state['x0']
178
+
179
+ # Optimized spend value obtained from the tool
180
+ optimized_spend = st.number_input('value of x') # Replace this with your optimized spend value
181
+
182
+ # Calculate the slope at the optimized spend value
183
+ slope_at_optimized_spend = s_curve_derivative(optimized_spend, K, b, a, x0)
184
+
185
+ st.write("Slope ", slope_at_optimized_spend)
pages/8_Scenario_Planner.py CHANGED
@@ -23,34 +23,28 @@ import re
23
  import pandas as pd
24
  import plotly.express as px
25
 
26
- target = "Revenue"
27
  st.set_page_config(layout="wide")
28
  load_local_css("styles.css")
29
  set_header()
30
 
31
  for k, v in st.session_state.items():
32
- if k not in ["logout", "login", "config"] and not k.startswith(
33
- "FormSubmitter"
34
- ):
35
  st.session_state[k] = v
36
  # ======================================================== #
37
  # ======================= Functions ====================== #
38
  # ======================================================== #
39
 
40
 
41
- def optimize(key):
42
  """
43
  Optimize the spends for the sales
44
  """
45
 
46
  channel_list = [
47
- key
48
- for key, value in st.session_state["optimization_channels"].items()
49
- if value
50
  ]
51
- # print('channel_list')
52
- # print(channel_list)
53
- # print('@@@@@@@@')
54
  if len(channel_list) > 0:
55
  scenario = st.session_state["scenario"]
56
  if key.lower() == "media spends":
@@ -59,7 +53,8 @@ def optimize(key):
59
  result = st.session_state["scenario"].optimize(
60
  st.session_state["total_spends_change"], channel_list
61
  )
62
- elif key.lower() == "revenue":
 
63
  with status_placeholder:
64
  with st.spinner("Optimizing"):
65
 
@@ -69,14 +64,11 @@ def optimize(key):
69
  for channel_name, modified_spends in result:
70
 
71
  st.session_state[channel_name] = numerize(
72
- modified_spends
73
- * scenario.channels[channel_name].conversion_rate,
74
  1,
75
  )
76
  prev_spends = (
77
- st.session_state["scenario"]
78
- .channels[channel_name]
79
- .actual_total_spends
80
  )
81
  st.session_state[f"{channel_name}_change"] = round(
82
  100 * (modified_spends - prev_spends) / prev_spends, 2
@@ -105,15 +97,46 @@ def save_scenario(scenario_name):
105
  pickle.dump(st.session_state["saved_scenarios"], f)
106
 
107
 
 
 
 
108
  def update_sales_abs():
 
 
 
109
  actual_sales = _scenario.actual_total_sales
110
- if validate_input(st.session_state["total_sales_change_abs"]):
 
 
 
111
  modified_sales = extract_number_for_string(
112
  st.session_state["total_sales_change_abs"]
113
  )
114
  st.session_state["total_sales_change"] = round(
115
  ((modified_sales / actual_sales) - 1) * 100
116
  )
 
117
 
118
 
119
  def update_sales():
@@ -122,32 +145,95 @@ def update_sales():
122
  * _scenario.actual_total_sales,
123
  1,
124
  )
 
 
 
 
125
 
126
 
127
  def update_all_spends_abs():
 
 
 
128
  actual_spends = _scenario.actual_total_spends
129
- if validate_input(st.session_state["total_spends_change_abs"]):
 
 
 
130
  modified_spends = extract_number_for_string(
131
  st.session_state["total_spends_change_abs"]
132
  )
133
- print(modified_spends)
134
- print(actual_spends)
135
-
136
  st.session_state["total_spends_change"] = (
137
  (modified_spends / actual_spends) - 1
138
  ) * 100
 
 
 
139
 
140
  update_all_spends()
141
 
142
 
 
 
143
  def update_all_spends():
144
  """
145
  Updates spends for all the channels with the given overall spends change
146
  """
147
  percent_change = st.session_state["total_spends_change"]
148
- st.session_state["total_spends_change_abs"] = numerize(
149
- (1 + percent_change / 100) * _scenario.actual_total_spends, 1
150
- )
151
  for channel_name in st.session_state["channels_list"]:
152
  channel = st.session_state["scenario"].channels[channel_name]
153
  current_spends = channel.actual_total_spends
@@ -199,16 +285,10 @@ def update_data(channel_name):
199
  """
200
 
201
  if validate_input(st.session_state[channel_name]):
202
- modified_spends = extract_number_for_string(
203
- st.session_state[channel_name]
204
- )
205
  prev_spends = (
206
- st.session_state["scenario"]
207
- .channels[channel_name]
208
- .actual_total_spends
209
- * st.session_state["scenario"]
210
- .channels[channel_name]
211
- .conversion_rate
212
  )
213
  st.session_state[f"{channel_name}_change"] = round(
214
  100 * (modified_spends - prev_spends) / prev_spends, 2
@@ -216,9 +296,7 @@ def update_data(channel_name):
216
  st.session_state["scenario"].update(
217
  channel_name,
218
  modified_spends
219
- / st.session_state["scenario"]
220
- .channels[channel_name]
221
- .conversion_rate,
222
  )
223
  # st.session_state['scenario'].update(channel_name, modified_spends)
224
  # else:
@@ -249,31 +327,55 @@ def select_all_channels_for_optimization():
249
  st.session_state[f"{channel_name}_selected"] = st.session_state[
250
  "optimze_all_channels"
251
  ]
252
- st.session_state["optimization_channels"][channel_name] = (
253
- st.session_state["optimze_all_channels"]
254
- )
255
 
256
 
257
  def update_penalty():
258
  """
259
  Updates the penalty flag for sales calculation
260
  """
261
- st.session_state["scenario"].update_penalty(
262
- st.session_state["apply_penalty"]
263
- )
264
 
265
 
266
- def reset_scenario():
267
  # #print(st.session_state['default_scenario_dict'])
268
  # st.session_state['scenario'] = class_from_dict(st.session_state['default_scenario_dict'])
269
  # for channel in st.session_state['scenario'].channels.values():
270
  # st.session_state[channel.name] = float(channel.actual_total_spends * channel.conversion_rate)
271
- initialize_data()
 
 
 
272
  for channel_name in st.session_state["channels_list"]:
273
  st.session_state[f"{channel_name}_selected"] = False
274
  st.session_state[f"{channel_name}_change"] = 0
275
  st.session_state["optimze_all_channels"] = False
276
 
 
 
 
277
 
278
  def format_number(num):
279
  if num >= 1_000_000:
@@ -305,9 +407,7 @@ def summary_plot(data, x, y, title, text_column):
305
  hovertemplate="%{x:.2s}",
306
  )
307
 
308
- fig.update_layout(
309
- xaxis_title=x, yaxis_title="Channel Name", showlegend=False
310
- )
311
  return fig
312
 
313
 
@@ -342,27 +442,21 @@ def calculate_rgba(
342
  relative_position = (current_channel_spends - start_value) / (
343
  left_value - start_value
344
  )
345
- alpha = 0.8 - (
346
- 0.6 * relative_position
347
- ) # Alpha decreases from start to end
348
 
349
  elif left_value < current_channel_spends <= right_value:
350
  color = "green"
351
  relative_position = (current_channel_spends - left_value) / (
352
  right_value - left_value
353
  )
354
- alpha = 0.8 - (
355
- 0.6 * relative_position
356
- ) # Alpha decreases from start to end
357
 
358
  elif right_value < current_channel_spends <= end_value:
359
  color = "red"
360
  relative_position = (current_channel_spends - right_value) / (
361
  end_value - right_value
362
  )
363
- alpha = 0.2 + (
364
- 0.6 * relative_position
365
- ) # Alpha increases from start to end
366
 
367
  else:
368
  # Default case, if the spends are outside the defined ranges
@@ -432,9 +526,7 @@ def plot_response_curves():
432
 
433
  for index in range(len(x_plot)):
434
  marginal_roi.append(
435
- a
436
- * y[index]
437
- * (1 - y[index] / np.maximum(K, np.finfo(float).eps))
438
  )
439
 
440
  x = (
@@ -466,9 +558,7 @@ def plot_response_curves():
466
  st.session_state["scenario"].channels[col].modified_total_spends
467
  * st.session_state["scenario"].channels[col].conversion_rate
468
  )
469
- y_optimal = (
470
- st.session_state["scenario"].channels[col].modified_total_sales
471
- )
472
 
473
  # if col == "Paid_social_others":
474
  # debug_temp(x_optimal * x_actual / x_actual.sum(), power, K, b, a, x0)
@@ -576,7 +666,7 @@ def plot_response_curves():
576
  fig.update_layout(
577
  # height=1000,
578
  # width=1000,
579
- title_text="Response Curves (X: Spends Vs Y: Revenue)",
580
  showlegend=False,
581
  shapes=shapes,
582
  )
@@ -718,12 +808,144 @@ authenticator = stauth.Authenticate(
718
  st.session_state["authenticator"] = authenticator
719
  name, authentication_status, username = authenticator.login("Login", "main")
720
  auth_status = st.session_state.get("authentication_status")
 
 
 
721
  if auth_status == True:
722
  authenticator.logout("Logout", "main")
 
 
 
 
 
723
  is_state_initiaized = st.session_state.get("initialized", False)
724
- if not is_state_initiaized:
725
- initialize_data()
 
 
 
726
 
 
727
  channels_list = st.session_state["channels_list"]
728
 
729
  # ======================================================== #
@@ -731,12 +953,16 @@ if auth_status == True:
731
  # ======================================================== #
732
 
733
  # print(list(st.session_state.keys()))
734
-
735
- st.header("Simulation")
736
  main_header = st.columns((2, 2))
737
  sub_header = st.columns((1, 1, 1, 1))
738
  _scenario = st.session_state["scenario"]
739
 
 
 
 
 
 
 
740
  if "total_spends_change_abs" not in st.session_state:
741
  st.session_state["total_spends_change_abs"] = numerize(
742
  _scenario.actual_total_spends, 1
@@ -747,6 +973,16 @@ if auth_status == True:
747
  _scenario.actual_total_sales, 1
748
  )
749
 
 
 
 
750
  with main_header[0]:
751
  st.subheader("Actual")
752
 
@@ -754,9 +990,7 @@ if auth_status == True:
754
  st.subheader("Simulated")
755
 
756
  with sub_header[0]:
757
- st.metric(
758
- label="Spends", value=format_numbers(_scenario.actual_total_spends)
759
- )
760
 
761
  with sub_header[1]:
762
  st.metric(
@@ -782,33 +1016,49 @@ if auth_status == True:
782
  delta=numerize(_scenario.delta_sales, 1),
783
  )
784
 
785
- with st.expander("Channel Spends Simulator"):
786
  _columns1 = st.columns((2, 2, 1, 1))
787
  with _columns1[0]:
788
-
789
  optimization_selection = st.selectbox(
790
- "Optimize", options=["Media Spends", "Revenue"], key="optimization_key"
791
  )
 
792
  with _columns1[1]:
793
  st.markdown("#")
 
 
 
794
  st.checkbox(
795
  label="Optimize all Channels",
796
- key=f"optimze_all_channels",
797
  value=False,
798
  on_change=select_all_channels_for_optimization,
799
  )
800
 
801
  with _columns1[2]:
802
  st.markdown("#")
803
- st.button(
804
- "Optimize",
805
- on_click=optimize,
806
- args=(st.session_state["optimization_key"],),
807
- )
 
 
 
808
 
809
  with _columns1[3]:
810
  st.markdown("#")
811
- st.button("Reset", on_click=reset_scenario)
 
 
 
 
 
812
 
813
  _columns2 = st.columns((2, 2, 2))
814
  if st.session_state["optimization_key"] == "Media Spends":
@@ -819,37 +1069,90 @@ if auth_status == True:
819
  # label_visibility="collapsed",
820
  on_change=update_all_spends_abs,
821
  )
822
- with _columns2[1]:
823
 
 
824
  st.number_input(
825
- "Percent",
826
- key=f"total_spends_change",
 
 
827
  step=1,
828
- on_change=update_all_spends,
 
 
 
 
829
  )
830
- elif st.session_state["optimization_key"] == "Revenue":
831
- with _columns2[0]:
832
 
 
 
833
  sales_input = st.text_input(
834
  "Absolute",
835
  key="total_sales_change_abs",
836
  on_change=update_sales_abs,
837
  )
 
838
  with _columns2[1]:
839
  st.number_input(
840
- "Percent change",
841
- key=f"total_sales_change",
 
 
842
  step=1,
843
  on_change=update_sales,
844
  )
 
 
 
 
845
 
846
- with _columns2[2]:
847
- st.markdown("#")
848
- status_placeholder = st.empty()
849
-
850
- st.markdown(
851
- """<hr class="spends-heading-seperator">""", unsafe_allow_html=True
 
 
 
 
 
 
852
  )
 
 
853
  _columns = st.columns((2.5, 2, 1.5, 1.5, 1))
854
  with _columns[0]:
855
  generate_spending_header("Channel")
@@ -862,9 +1165,7 @@ if auth_status == True:
862
  with _columns[4]:
863
  generate_spending_header("Optimize")
864
 
865
- st.markdown(
866
- """<hr class="spends-heading-seperator">""", unsafe_allow_html=True
867
- )
868
 
869
  if "acutual_predicted" not in st.session_state:
870
  st.session_state["acutual_predicted"] = {
@@ -874,9 +1175,7 @@ if auth_status == True:
874
  "Delta": [],
875
  }
876
  for i, channel_name in enumerate(channels_list):
877
- _channel_class = st.session_state["scenario"].channels[
878
- channel_name
879
- ]
880
  _columns = st.columns((2.5, 1.5, 1.5, 1.5, 1))
881
  with _columns[0]:
882
  st.write(channel_name_formating(channel_name))
@@ -885,12 +1184,8 @@ if auth_status == True:
885
  with _columns[1]:
886
  channel_bounds = _channel_class.bounds
887
  channel_spends = float(_channel_class.actual_total_spends)
888
- min_value = float(
889
- (1 + channel_bounds[0] / 100) * channel_spends
890
- )
891
- max_value = float(
892
- (1 + channel_bounds[1] / 100) * channel_spends
893
- )
894
  ##print(st.session_state[channel_name])
895
  spend_input = st.text_input(
896
  channel_name,
@@ -901,9 +1196,11 @@ if auth_status == True:
901
  if not validate_input(spend_input):
902
  st.error("Invalid input")
903
 
 
 
904
  st.number_input(
905
- "Percent change",
906
- key=f"{channel_name}_change",
907
  step=1,
908
  on_change=partial(update_data_by_percent, channel_name),
909
  )
@@ -915,12 +1212,10 @@ if auth_status == True:
915
  * _channel_class.conversion_rate
916
  )
917
  actual_channel_spends = float(
918
- _channel_class.actual_total_spends
919
- * _channel_class.conversion_rate
920
  )
921
  spends_delta = float(
922
- _channel_class.delta_spends
923
- * _channel_class.conversion_rate
924
  )
925
  st.session_state["acutual_predicted"]["Channel_name"].append(
926
  channel_name
@@ -928,12 +1223,10 @@ if auth_status == True:
928
  st.session_state["acutual_predicted"]["Actual_spend"].append(
929
  actual_channel_spends
930
  )
931
- st.session_state["acutual_predicted"][
932
- "Optimized_spend"
933
- ].append(current_channel_spends)
934
- st.session_state["acutual_predicted"]["Delta"].append(
935
- spends_delta
936
  )
 
937
  ## REMOVE
938
  st.metric(
939
  "Spends",
@@ -944,29 +1237,32 @@ if auth_status == True:
944
 
945
  with _columns[3]:
946
  # sales
947
- current_channel_sales = float(
948
- _channel_class.modified_total_sales
949
- )
950
  actual_channel_sales = float(_channel_class.actual_total_sales)
951
  sales_delta = float(_channel_class.delta_sales)
952
  st.metric(
953
  target,
954
- format_numbers(
955
- current_channel_sales, include_indicator=False
956
- ),
957
  delta=numerize(sales_delta, 1),
958
  label_visibility="collapsed",
959
  )
960
 
961
  with _columns[4]:
962
 
 
 
 
 
 
 
 
 
 
963
  st.checkbox(
964
  label="select for optimization",
965
  key=f"{channel_name}_selected",
966
  value=False,
967
- on_change=partial(
968
- select_channel_for_optimization, channel_name
969
- ),
970
  label_visibility="collapsed",
971
  )
972
 
@@ -978,20 +1274,29 @@ if auth_status == True:
978
  # Bins
979
  col = channels_list[i]
980
  x_actual = st.session_state["scenario"].channels[col].actual_spends
981
- x_modified = (
982
- st.session_state["scenario"].channels[col].modified_spends
983
- )
984
 
985
  x_total = x_modified.sum()
986
  power = np.ceil(np.log(x_actual.max()) / np.log(10)) - 3
987
 
988
- K = st.session_state["rcs"][col]["K"]
989
- b = st.session_state["rcs"][col]["b"]
990
- a = st.session_state["rcs"][col]["a"]
991
- x0 = st.session_state["rcs"][col]["x0"]
 
 
 
 
 
 
 
 
992
 
993
  x_plot = np.linspace(0, 5 * x_actual.sum(), 200)
994
 
 
 
 
995
  x, y, marginal_roi = [], [], []
996
  for x_p in x_plot:
997
  x.append(x_p * x_actual / x_actual.sum())
@@ -1001,9 +1306,7 @@ if auth_status == True:
1001
 
1002
  for index in range(len(x_plot)):
1003
  marginal_roi.append(
1004
- a
1005
- * y[index]
1006
- * (1 - y[index] / np.maximum(K, np.finfo(float).eps))
1007
  )
1008
 
1009
  x = (
@@ -1018,12 +1321,18 @@ if auth_status == True:
1018
 
1019
  roi = y / np.maximum(x, np.finfo(float).eps)
1020
 
1021
- start_value, end_value, left_value, right_value = (
1022
- find_segment_value(
1023
- x,
1024
- roi,
1025
- marginal_roi,
1026
- )
 
 
 
 
 
 
1027
  )
1028
 
1029
  rgba = calculate_rgba(
@@ -1034,16 +1343,6 @@ if auth_status == True:
1034
  current_channel_spends,
1035
  )
1036
 
1037
- # Protecting division by zero by adding a small epsilon to denominators
1038
- roi_current = current_channel_sales / np.maximum(
1039
- current_channel_spends, np.finfo(float).eps
1040
- )
1041
- marginal_roi_current = (
1042
- st.session_state["scenario"]
1043
- .channels[col]
1044
- .get_marginal_roi("modified")
1045
- )
1046
-
1047
  with bin_placeholder:
1048
  st.markdown(
1049
  f"""
@@ -1061,7 +1360,7 @@ if auth_status == True:
1061
  unsafe_allow_html=True,
1062
  )
1063
 
1064
- with st.expander("See Response Curves"):
1065
  fig = plot_response_curves()
1066
  st.plotly_chart(fig, use_container_width=True)
1067
 
@@ -1081,19 +1380,11 @@ if auth_status == True:
1081
  )
1082
 
1083
  summary_df = pd.DataFrame(st.session_state["acutual_predicted"])
1084
- summary_df.drop_duplicates(
1085
- subset="Channel_name", keep="last", inplace=True
1086
- )
1087
 
1088
  summary_df_sorted = summary_df.sort_values(by="Delta", ascending=False)
1089
  summary_df_sorted["Delta_percent"] = np.round(
1090
- (
1091
- (
1092
- summary_df_sorted["Optimized_spend"]
1093
- / summary_df_sorted["Actual_spend"]
1094
- )
1095
- - 1
1096
- )
1097
  * 100,
1098
  2,
1099
  )
@@ -1121,9 +1412,9 @@ if auth_status != True:
1121
  authenticator.forgot_password("Forgot password")
1122
  )
1123
  if username_forgot_pw:
1124
- st.session_state["config"]["credentials"]["usernames"][
1125
- username_forgot_pw
1126
- ]["password"] = stauth.Hasher([random_password]).generate()[0]
1127
  send_email(email_forgot_password, random_password)
1128
  st.success("New password sent securely")
1129
  # Random password to be transferred to user securely
 
23
  import pandas as pd
24
  import plotly.express as px
25
 
26
+
27
  st.set_page_config(layout="wide")
28
  load_local_css("styles.css")
29
  set_header()
30
 
31
  for k, v in st.session_state.items():
32
+ if k not in ["logout", "login", "config"] and not k.startswith("FormSubmitter"):
 
 
33
  st.session_state[k] = v
34
  # ======================================================== #
35
  # ======================= Functions ====================== #
36
  # ======================================================== #
37
 
38
 
39
+ def optimize(key, status_placeholder):
40
  """
41
  Optimize the spends for the sales
42
  """
43
 
44
  channel_list = [
45
+ key for key, value in st.session_state["optimization_channels"].items() if value
 
 
46
  ]
47
+
 
 
48
  if len(channel_list) > 0:
49
  scenario = st.session_state["scenario"]
50
  if key.lower() == "media spends":
 
53
  result = st.session_state["scenario"].optimize(
54
  st.session_state["total_spends_change"], channel_list
55
  )
56
+ # elif key.lower() == "revenue":
57
+ else:
58
  with status_placeholder:
59
  with st.spinner("Optimizing"):
60
 
 
64
  for channel_name, modified_spends in result:
65
 
66
  st.session_state[channel_name] = numerize(
67
+ modified_spends * scenario.channels[channel_name].conversion_rate,
 
68
  1,
69
  )
70
  prev_spends = (
71
+ st.session_state["scenario"].channels[channel_name].actual_total_spends
 
 
72
  )
73
  st.session_state[f"{channel_name}_change"] = round(
74
  100 * (modified_spends - prev_spends) / prev_spends, 2
 
97
  pickle.dump(st.session_state["saved_scenarios"], f)
98
 
99
 
100
+ if "allow_spends_update" not in st.session_state:
101
+ st.session_state["allow_spends_update"] = True
102
+
103
+ if "allow_sales_update" not in st.session_state:
104
+ st.session_state["allow_sales_update"] = True
105
+
106
+
107
+ def update_sales_abs_slider():
108
+ actual_sales = _scenario.actual_total_sales
109
+ if validate_input(st.session_state["total_sales_change_abs_slider"]):
110
+ modified_sales = extract_number_for_string(
111
+ st.session_state["total_sales_change_abs_slider"]
112
+ )
113
+ st.session_state["total_sales_change"] = round(
114
+ ((modified_sales / actual_sales) - 1) * 100
115
+ )
116
+ st.session_state["total_sales_change_abs"] = numerize(modified_sales, 1)
117
+
118
+
119
  def update_sales_abs():
120
+ if (
121
+ st.session_state["total_sales_change_abs"]
122
+ in st.session_state["total_sales_change_abs_slider_options"]
123
+ ):
124
+ st.session_state["allow_sales_update"] = True
125
+ else:
126
+ st.session_state["allow_sales_update"] = False
127
+
128
  actual_sales = _scenario.actual_total_sales
129
+ if (
130
+ validate_input(st.session_state["total_sales_change_abs"])
131
+ and st.session_state["allow_sales_update"]
132
+ ):
133
  modified_sales = extract_number_for_string(
134
  st.session_state["total_sales_change_abs"]
135
  )
136
  st.session_state["total_sales_change"] = round(
137
  ((modified_sales / actual_sales) - 1) * 100
138
  )
139
+ st.session_state["total_sales_change_abs_slider"] = numerize(modified_sales, 1)
140
 
141
 
142
  def update_sales():
 
145
  * _scenario.actual_total_sales,
146
  1,
147
  )
148
+ st.session_state["total_sales_change_abs_slider"] = numerize(
149
+ (1 + st.session_state["total_sales_change"] / 100)
150
+ * _scenario.actual_total_sales,
151
+ 1,
152
+ )
153
+
154
+
155
+ def update_all_spends_abs_slider():
156
+ actual_spends = _scenario.actual_total_spends
157
+ if validate_input(st.session_state["total_spends_change_abs_slider"]):
158
+ modified_spends = extract_number_for_string(
159
+ st.session_state["total_spends_change_abs_slider"]
160
+ )
161
+ st.session_state["total_spends_change"] = round(
162
+ ((modified_spends / actual_spends) - 1) * 100
163
+ )
164
+ st.session_state["total_spends_change_abs"] = numerize(modified_spends, 1)
165
+
166
+ update_all_spends()
167
+
168
+
169
+ # def update_all_spends_abs_slider():
170
+ # actual_spends = _scenario.actual_total_spends
171
+ # if validate_input(st.session_state["total_spends_change_abs_slider"]):
172
+ # print("#" * 100)
173
+ # print(st.session_state["total_spends_change_abs_slider"])
174
+ # print("#" * 100)
175
+
176
+ # modified_spends = extract_number_for_string(
177
+ # st.session_state["total_spends_change_abs_slider"]
178
+ # )
179
+ # st.session_state["total_spends_change"] = (
180
+ # (modified_spends / actual_spends) - 1
181
+ # ) * 100
182
+ # st.session_state["total_spends_change_abs"] = st.session_state[
183
+ # "total_spends_change_abs_slider"
184
+ # ]
185
+
186
+ # update_all_spends()
187
 
188
 
189
  def update_all_spends_abs():
190
+ if (
191
+ st.session_state["total_spends_change_abs"]
192
+ in st.session_state["total_spends_change_abs_slider_options"]
193
+ ):
194
+ st.session_state["allow_spends_update"] = True
195
+ else:
196
+ st.session_state["allow_spends_update"] = False
197
+
198
  actual_spends = _scenario.actual_total_spends
199
+ if (
200
+ validate_input(st.session_state["total_spends_change_abs"])
201
+ and st.session_state["allow_spends_update"]
202
+ ):
203
  modified_spends = extract_number_for_string(
204
  st.session_state["total_spends_change_abs"]
205
  )
 
 
 
206
  st.session_state["total_spends_change"] = (
207
  (modified_spends / actual_spends) - 1
208
  ) * 100
209
+ st.session_state["total_spends_change_abs_slider"] = st.session_state[
210
+ "total_spends_change_abs"
211
+ ]
212
 
213
  update_all_spends()
214
 
215
 
216
+ def update_spends():
217
+ st.session_state["total_spends_change_abs"] = numerize(
218
+ (1 + st.session_state["total_spends_change"] / 100)
219
+ * _scenario.actual_total_spends,
220
+ 1,
221
+ )
222
+ st.session_state["total_spends_change_abs_slider"] = numerize(
223
+ (1 + st.session_state["total_spends_change"] / 100)
224
+ * _scenario.actual_total_spends,
225
+ 1,
226
+ )
227
+
228
+ update_all_spends()
229
+
230
+
231
  def update_all_spends():
232
  """
233
  Updates spends for all the channels with the given overall spends change
234
  """
235
  percent_change = st.session_state["total_spends_change"]
236
+
 
 
237
  for channel_name in st.session_state["channels_list"]:
238
  channel = st.session_state["scenario"].channels[channel_name]
239
  current_spends = channel.actual_total_spends
 
285
  """
286
 
287
  if validate_input(st.session_state[channel_name]):
288
+ modified_spends = extract_number_for_string(st.session_state[channel_name])
 
 
289
  prev_spends = (
290
+ st.session_state["scenario"].channels[channel_name].actual_total_spends
291
+ * st.session_state["scenario"].channels[channel_name].conversion_rate
 
 
 
 
292
  )
293
  st.session_state[f"{channel_name}_change"] = round(
294
  100 * (modified_spends - prev_spends) / prev_spends, 2
 
296
  st.session_state["scenario"].update(
297
  channel_name,
298
  modified_spends
299
+ / st.session_state["scenario"].channels[channel_name].conversion_rate,
 
 
300
  )
301
  # st.session_state['scenario'].update(channel_name, modified_spends)
302
  # else:
 
327
  st.session_state[f"{channel_name}_selected"] = st.session_state[
328
  "optimze_all_channels"
329
  ]
330
+ st.session_state["optimization_channels"][channel_name] = st.session_state[
331
+ "optimze_all_channels"
332
+ ]
333
 
334
 
335
  def update_penalty():
336
  """
337
  Updates the penalty flag for sales calculation
338
  """
339
+ st.session_state["scenario"].update_penalty(st.session_state["apply_penalty"])
 
 
340
 
341
 
342
+ def reset_scenario(panel_selected, file_selected, updated_rcs):
343
  # #print(st.session_state['default_scenario_dict'])
344
  # st.session_state['scenario'] = class_from_dict(st.session_state['default_scenario_dict'])
345
  # for channel in st.session_state['scenario'].channels.values():
346
  # st.session_state[channel.name] = float(channel.actual_total_spends * channel.conversion_rate)
347
+ # initialize_data()
348
+
349
+ if panel_selected == "Aggregated":
350
+ initialize_data(
351
+ panel=panel_selected,
352
+ target_file=file_selected,
353
+ updated_rcs=updated_rcs,
354
+ metrics=metrics_selected,
355
+ )
356
+ panel = None
357
+ else:
358
+ initialize_data(
359
+ panel=panel_selected,
360
+ target_file=file_selected,
361
+ updated_rcs=updated_rcs,
362
+ metrics=metrics_selected,
363
+ )
364
+
365
  for channel_name in st.session_state["channels_list"]:
366
  st.session_state[f"{channel_name}_selected"] = False
367
  st.session_state[f"{channel_name}_change"] = 0
368
  st.session_state["optimze_all_channels"] = False
369
 
370
+ st.session_state["total_sales_change"] = 0
371
+
372
+ update_spends()
373
+ update_sales()
374
+
375
+ reset_inputs()
376
+
377
+ # st.rerun()
378
+
379
 
380
  def format_number(num):
381
  if num >= 1_000_000:
 
407
  hovertemplate="%{x:.2s}",
408
  )
409
 
410
+ fig.update_layout(xaxis_title=x, yaxis_title="Channel Name", showlegend=False)
 
 
411
  return fig
412
 
413
 
 
442
  relative_position = (current_channel_spends - start_value) / (
443
  left_value - start_value
444
  )
445
+ alpha = 0.8 - (0.6 * relative_position) # Alpha decreases from start to end
 
 
446
 
447
  elif left_value < current_channel_spends <= right_value:
448
  color = "green"
449
  relative_position = (current_channel_spends - left_value) / (
450
  right_value - left_value
451
  )
452
+ alpha = 0.8 - (0.6 * relative_position) # Alpha decreases from start to end
 
 
453
 
454
  elif right_value < current_channel_spends <= end_value:
455
  color = "red"
456
  relative_position = (current_channel_spends - right_value) / (
457
  end_value - right_value
458
  )
459
+ alpha = 0.2 + (0.6 * relative_position) # Alpha increases from start to end
 
 
460
 
461
  else:
462
  # Default case, if the spends are outside the defined ranges
 
526
 
527
  for index in range(len(x_plot)):
528
  marginal_roi.append(
529
+ a * y[index] * (1 - y[index] / np.maximum(K, np.finfo(float).eps))
 
 
530
  )
531
 
532
  x = (
 
558
  st.session_state["scenario"].channels[col].modified_total_spends
559
  * st.session_state["scenario"].channels[col].conversion_rate
560
  )
561
+ y_optimal = st.session_state["scenario"].channels[col].modified_total_sales
 
 
562
 
563
  # if col == "Paid_social_others":
564
  # debug_temp(x_optimal * x_actual / x_actual.sum(), power, K, b, a, x0)
 
666
  fig.update_layout(
667
  # height=1000,
668
  # width=1000,
669
+ title_text=f"Response Curves (X: Spends Vs Y: {target})",
670
  showlegend=False,
671
  shapes=shapes,
672
  )
 
808
  st.session_state["authenticator"] = authenticator
809
  name, authentication_status, username = authenticator.login("Login", "main")
810
  auth_status = st.session_state.get("authentication_status")
811
+
812
+ import os
813
+ import glob
814
+
815
+
816
+ def get_excel_names(directory):
817
+ # Create a list to hold the final parts of the filenames
818
+ last_portions = []
819
+
820
+ # Patterns to match Excel files (.xlsx and .xls) that contain @#
821
+ patterns = [
822
+ os.path.join(directory, "*@#*.xlsx"),
823
+ os.path.join(directory, "*@#*.xls"),
824
+ ]
825
+
826
+ # Process each pattern
827
+ for pattern in patterns:
828
+ files = glob.glob(pattern)
829
+
830
+ # Extracting the last portion after @# for each file
831
+ for file in files:
832
+ base_name = os.path.basename(file)
833
+ last_portion = base_name.split("@#")[-1]
834
+ last_portion = last_portion.replace(".xlsx", "").replace(
835
+ ".xls", ""
836
+ ) # Removing extensions
837
+ last_portions.append(last_portion)
838
+
839
+ return last_portions
840
+
841
+
842
+ def name_formating(channel_name):
843
+ # Replace underscores with spaces
844
+ name_mod = channel_name.replace("_", " ")
845
+
846
+ # Capitalize the first letter of each word
847
+ name_mod = name_mod.title()
848
+
849
+ return name_mod
850
+
851
+
852
+ @st.cache_resource(show_spinner=False)
853
+ def panel_fetch(file_selected):
854
+ raw_data_mmm_df = pd.read_excel(file_selected, sheet_name="RAW DATA MMM")
855
+
856
+ if "Panel" in raw_data_mmm_df.columns:
857
+ panel = list(set(raw_data_mmm_df["Panel"]))
858
+ else:
859
+ raw_data_mmm_df = None
860
+ panel = None
861
+
862
+ return panel
863
+
864
+
865
+ def reset_inputs():
866
+ if "total_spends_change_abs" in st.session_state:
867
+ del st.session_state.total_spends_change_abs
868
+ if "total_spends_change" in st.session_state:
869
+ del st.session_state.total_spends_change
870
+ if "total_spends_change_abs_slider" in st.session_state:
871
+ del st.session_state.total_spends_change_abs_slider
872
+
873
+ if "total_sales_change_abs" in st.session_state:
874
+ del st.session_state.total_sales_change_abs
875
+ if "total_sales_change" in st.session_state:
876
+ del st.session_state.total_sales_change
877
+ if "total_sales_change_abs_slider" in st.session_state:
878
+ del st.session_state.total_sales_change_abs_slider
879
+
880
+ st.session_state["initialized"] = False
881
+
882
+
883
  if auth_status == True:
884
  authenticator.logout("Logout", "main")
885
+
886
+ st.header("Simulation")
887
+ col1, col2 = st.columns([1, 1])
888
+
889
+ # Response Metrics
890
+ directory = "metrics_level_data"
891
+ metrics_list = get_excel_names(directory)
892
+ metrics_selected = col1.selectbox(
893
+ "Response Metrics",
894
+ metrics_list,
895
+ format_func=name_formating,
896
+ index=0,
897
+ on_change=reset_inputs,
898
+ )
899
+
900
+ # Target
901
+ target = name_formating(metrics_selected)
902
+
903
+ file_selected = (
904
+ f".\metrics_level_data\Overview_data_test_panel@#{metrics_selected}.xlsx"
905
+ )
906
+
907
+ # Panel List
908
+ panel_list = panel_fetch(file_selected)
909
+
910
+ # Panel Selected
911
+ panel_selected = col2.selectbox(
912
+ "Panel",
913
+ ["Aggregated"] + panel_list,
914
+ index=0,
915
+ on_change=reset_inputs,
916
+ )
917
+
918
+ if "update_rcs" in st.session_state:
919
+ updated_rcs = st.session_state["update_rcs"]
920
+ else:
921
+ updated_rcs = None
922
+
923
+ if "first_time" not in st.session_state:
924
+ st.session_state["first_time"] = True
925
+
926
+ # Check if state is initiaized
927
  is_state_initiaized = st.session_state.get("initialized", False)
928
+ if not is_state_initiaized or st.session_state["first_time"]:
929
+ # initialize_data()
930
+ if panel_selected == "Aggregated":
931
+ initialize_data(
932
+ panel=panel_selected,
933
+ target_file=file_selected,
934
+ updated_rcs=updated_rcs,
935
+ metrics=metrics_selected,
936
+ )
937
+ panel = None
938
+ else:
939
+ initialize_data(
940
+ panel=panel_selected,
941
+ target_file=file_selected,
942
+ updated_rcs=updated_rcs,
943
+ metrics=metrics_selected,
944
+ )
945
+ st.session_state["initialized"] = True
946
+ st.session_state["first_time"] = False
947
 
948
+ # Channels List
949
  channels_list = st.session_state["channels_list"]
950
 
951
  # ======================================================== #
 
953
  # ======================================================== #
954
 
955
  # print(list(st.session_state.keys()))
 
 
956
  main_header = st.columns((2, 2))
957
  sub_header = st.columns((1, 1, 1, 1))
958
  _scenario = st.session_state["scenario"]
959
 
960
+ if "total_spends_change" not in st.session_state:
961
+ st.session_state.total_spends_change = 0
962
+
963
+ if "total_sales_change" not in st.session_state:
964
+ st.session_state.total_sales_change = 0
965
+
966
  if "total_spends_change_abs" not in st.session_state:
967
  st.session_state["total_spends_change_abs"] = numerize(
968
  _scenario.actual_total_spends, 1
 
973
  _scenario.actual_total_sales, 1
974
  )
975
 
976
+ if "total_spends_change_abs_slider" not in st.session_state:
977
+ st.session_state.total_spends_change_abs_slider = numerize(
978
+ _scenario.actual_total_spends, 1
979
+ )
980
+
981
+ if "total_sales_change_abs_slider" not in st.session_state:
982
+ st.session_state.total_sales_change_abs_slider = numerize(
983
+ _scenario.actual_total_sales, 1
984
+ )
985
+
986
  with main_header[0]:
987
  st.subheader("Actual")
988
 
 
990
  st.subheader("Simulated")
991
 
992
  with sub_header[0]:
993
+ st.metric(label="Spends", value=format_numbers(_scenario.actual_total_spends))
 
 
994
 
995
  with sub_header[1]:
996
  st.metric(
 
1016
  delta=numerize(_scenario.delta_sales, 1),
1017
  )
1018
 
1019
+ with st.expander("Channel Spends Simulator", expanded=True):
1020
  _columns1 = st.columns((2, 2, 1, 1))
1021
  with _columns1[0]:
 
1022
  optimization_selection = st.selectbox(
1023
+ "Optimize", options=["Media Spends", target], key="optimization_key"
1024
  )
1025
+
1026
  with _columns1[1]:
1027
  st.markdown("#")
1028
+ # if st.checkbox(
1029
+ # label="Optimize all Channels",
1030
+ # key="optimze_all_channels",
1031
+ # value=False,
1032
+ # # on_change=select_all_channels_for_optimization,
1033
+ # ):
1034
+ # select_all_channels_for_optimization()
1035
+
1036
  st.checkbox(
1037
  label="Optimize all Channels",
1038
+ key="optimze_all_channels",
1039
  value=False,
1040
  on_change=select_all_channels_for_optimization,
1041
  )
1042
 
1043
  with _columns1[2]:
1044
  st.markdown("#")
1045
+ # st.button(
1046
+ # "Optimize",
1047
+ # on_click=optimize,
1048
+ # args=(st.session_state["optimization_key"]),
1049
+ # use_container_width=True,
1050
+ # )
1051
+
1052
+ optimize_placeholder = st.empty()
1053
 
1054
  with _columns1[3]:
1055
  st.markdown("#")
1056
+ st.button(
1057
+ "Reset",
1058
+ on_click=reset_scenario,
1059
+ args=(panel_selected, file_selected, updated_rcs),
1060
+ use_container_width=True,
1061
+ )
1062
 
1063
  _columns2 = st.columns((2, 2, 2))
1064
  if st.session_state["optimization_key"] == "Media Spends":
 
1069
  # label_visibility="collapsed",
1070
  on_change=update_all_spends_abs,
1071
  )
 
1072
 
1073
+ with _columns2[1]:
1074
  st.number_input(
1075
+ "Percent Change",
1076
+ key="total_spends_change",
1077
+ min_value=-50,
1078
+ max_value=50,
1079
  step=1,
1080
+ on_change=update_spends,
1081
+ )
1082
+
1083
+ with _columns2[2]:
1084
+ min_value = round(_scenario.actual_total_spends * 0.5)
1085
+ max_value = round(_scenario.actual_total_spends * 1.5)
1086
+ st.session_state["total_spends_change_abs_slider_options"] = [
1087
+ numerize(value, 1)
1088
+ for value in range(min_value, max_value + 1, int(1e4))
1089
+ ]
1090
+
1091
+ st.select_slider(
1092
+ "Absolute Slider",
1093
+ options=st.session_state["total_spends_change_abs_slider_options"],
1094
+ key="total_spends_change_abs_slider",
1095
+ on_change=update_all_spends_abs_slider,
1096
  )
 
 
1097
 
1098
+ elif st.session_state["optimization_key"] == target:
1099
+ with _columns2[0]:
1100
  sales_input = st.text_input(
1101
  "Absolute",
1102
  key="total_sales_change_abs",
1103
  on_change=update_sales_abs,
1104
  )
1105
+
1106
  with _columns2[1]:
1107
  st.number_input(
1108
+ "Percent Change",
1109
+ key="total_sales_change",
1110
+ min_value=-50,
1111
+ max_value=50,
1112
  step=1,
1113
  on_change=update_sales,
1114
  )
1115
+ with _columns2[2]:
1116
+ min_value = round(_scenario.actual_total_sales * 0.5)
1117
+ max_value = round(_scenario.actual_total_sales * 1.5)
1118
+ st.session_state["total_sales_change_abs_slider_options"] = [
1119
+ numerize(value, 1)
1120
+ for value in range(min_value, max_value + 1, int(1e5))
1121
+ ]
1122
+
1123
+ st.select_slider(
1124
+ "Absolute Slider",
1125
+ options=st.session_state["total_sales_change_abs_slider_options"],
1126
+ key="total_sales_change_abs_slider",
1127
+ on_change=update_sales_abs_slider,
1128
+ )
1129
 
1130
+ if (
1131
+ not st.session_state["allow_sales_update"]
1132
+ and optimization_selection == target
1133
+ ):
1134
+ st.warning("Invalid Input")
1135
+
1136
+ if (
1137
+ not st.session_state["allow_spends_update"]
1138
+ and optimization_selection == "Media Spends"
1139
+ ):
1140
+ st.warning("Invalid Input")
1141
+
1142
+ status_placeholder = st.empty()
1143
+
1144
+ # if optimize_placeholder.button("Optimize", use_container_width=True):
1145
+ # optimize(st.session_state["optimization_key"], status_placeholder)
1146
+ # st.rerun()
1147
+
1148
+ optimize_placeholder.button(
1149
+ "Optimize",
1150
+ on_click=optimize,
1151
+ args=(st.session_state["optimization_key"], status_placeholder),
1152
+ use_container_width=True,
1153
  )
1154
+
1155
+ st.markdown("""<hr class="spends-heading-seperator">""", unsafe_allow_html=True)
1156
  _columns = st.columns((2.5, 2, 1.5, 1.5, 1))
1157
  with _columns[0]:
1158
  generate_spending_header("Channel")
 
1165
  with _columns[4]:
1166
  generate_spending_header("Optimize")
1167
 
1168
+ st.markdown("""<hr class="spends-heading-seperator">""", unsafe_allow_html=True)
 
 
1169
 
1170
  if "acutual_predicted" not in st.session_state:
1171
  st.session_state["acutual_predicted"] = {
 
1175
  "Delta": [],
1176
  }
1177
  for i, channel_name in enumerate(channels_list):
1178
+ _channel_class = st.session_state["scenario"].channels[channel_name]
 
 
1179
  _columns = st.columns((2.5, 1.5, 1.5, 1.5, 1))
1180
  with _columns[0]:
1181
  st.write(channel_name_formating(channel_name))
 
1184
  with _columns[1]:
1185
  channel_bounds = _channel_class.bounds
1186
  channel_spends = float(_channel_class.actual_total_spends)
1187
+ min_value = float((1 + channel_bounds[0] / 100) * channel_spends)
1188
+ max_value = float((1 + channel_bounds[1] / 100) * channel_spends)
 
 
 
 
1189
  ##print(st.session_state[channel_name])
1190
  spend_input = st.text_input(
1191
  channel_name,
 
1196
  if not validate_input(spend_input):
1197
  st.error("Invalid input")
1198
 
1199
+ channel_name_current = f"{channel_name}_change"
1200
+
1201
  st.number_input(
1202
+ "Percent Change",
1203
+ key=channel_name_current,
1204
  step=1,
1205
  on_change=partial(update_data_by_percent, channel_name),
1206
  )
 
1212
  * _channel_class.conversion_rate
1213
  )
1214
  actual_channel_spends = float(
1215
+ _channel_class.actual_total_spends * _channel_class.conversion_rate
 
1216
  )
1217
  spends_delta = float(
1218
+ _channel_class.delta_spends * _channel_class.conversion_rate
 
1219
  )
1220
  st.session_state["acutual_predicted"]["Channel_name"].append(
1221
  channel_name
 
1223
  st.session_state["acutual_predicted"]["Actual_spend"].append(
1224
  actual_channel_spends
1225
  )
1226
+ st.session_state["acutual_predicted"]["Optimized_spend"].append(
1227
+ current_channel_spends
 
 
 
1228
  )
1229
+ st.session_state["acutual_predicted"]["Delta"].append(spends_delta)
1230
  ## REMOVE
1231
  st.metric(
1232
  "Spends",
 
1237
 
1238
  with _columns[3]:
1239
  # sales
1240
+ current_channel_sales = float(_channel_class.modified_total_sales)
 
 
1241
  actual_channel_sales = float(_channel_class.actual_total_sales)
1242
  sales_delta = float(_channel_class.delta_sales)
1243
  st.metric(
1244
  target,
1245
+ format_numbers(current_channel_sales, include_indicator=False),
 
 
1246
  delta=numerize(sales_delta, 1),
1247
  label_visibility="collapsed",
1248
  )
1249
 
1250
  with _columns[4]:
1251
 
1252
+ # if st.checkbox(
1253
+ # label="select for optimization",
1254
+ # key=f"{channel_name}_selected",
1255
+ # value=False,
1256
+ # # on_change=partial(select_channel_for_optimization, channel_name),
1257
+ # label_visibility="collapsed",
1258
+ # ):
1259
+ # select_channel_for_optimization(channel_name)
1260
+
1261
  st.checkbox(
1262
  label="select for optimization",
1263
  key=f"{channel_name}_selected",
1264
  value=False,
1265
+ on_change=partial(select_channel_for_optimization, channel_name),
 
 
1266
  label_visibility="collapsed",
1267
  )
1268
 
 
1274
  # Bins
1275
  col = channels_list[i]
1276
  x_actual = st.session_state["scenario"].channels[col].actual_spends
1277
+ x_modified = st.session_state["scenario"].channels[col].modified_spends
 
 
1278
 
1279
  x_total = x_modified.sum()
1280
  power = np.ceil(np.log(x_actual.max()) / np.log(10)) - 3
1281
 
1282
+ updated_rcs_key = f"{metrics_selected}#@{panel_selected}#@{channel_name}"
1283
+
1284
+ if updated_rcs and updated_rcs_key in list(updated_rcs.keys()):
1285
+ K = updated_rcs[updated_rcs_key]["K"]
1286
+ b = updated_rcs[updated_rcs_key]["b"]
1287
+ a = updated_rcs[updated_rcs_key]["a"]
1288
+ x0 = updated_rcs[updated_rcs_key]["x0"]
1289
+ else:
1290
+ K = st.session_state["rcs"][col]["K"]
1291
+ b = st.session_state["rcs"][col]["b"]
1292
+ a = st.session_state["rcs"][col]["a"]
1293
+ x0 = st.session_state["rcs"][col]["x0"]
1294
 
1295
  x_plot = np.linspace(0, 5 * x_actual.sum(), 200)
1296
 
1297
+ # Append current_channel_spends to the end of x_plot
1298
+ x_plot = np.append(x_plot, current_channel_spends)
1299
+
1300
  x, y, marginal_roi = [], [], []
1301
  for x_p in x_plot:
1302
  x.append(x_p * x_actual / x_actual.sum())
 
1306
 
1307
  for index in range(len(x_plot)):
1308
  marginal_roi.append(
1309
+ a * y[index] * (1 - y[index] / np.maximum(K, np.finfo(float).eps))
 
 
1310
  )
1311
 
1312
  x = (
 
1321
 
1322
  roi = y / np.maximum(x, np.finfo(float).eps)
1323
 
1324
+ roi_current, marginal_roi_current = roi[-1], marginal_roi[-1]
1325
+ x, y, roi, marginal_roi = (
1326
+ x[:-1],
1327
+ y[:-1],
1328
+ roi[:-1],
1329
+ marginal_roi[:-1],
1330
+ ) # Drop data for current spends
1331
+
1332
+ start_value, end_value, left_value, right_value = find_segment_value(
1333
+ x,
1334
+ roi,
1335
+ marginal_roi,
1336
  )
1337
 
1338
  rgba = calculate_rgba(
 
1343
  current_channel_spends,
1344
  )
1345
 
 
 
 
1346
  with bin_placeholder:
1347
  st.markdown(
1348
  f"""
 
1360
  unsafe_allow_html=True,
1361
  )
1362
 
1363
+ with st.expander("See Response Curves", expanded=True):
1364
  fig = plot_response_curves()
1365
  st.plotly_chart(fig, use_container_width=True)
1366
 
 
1380
  )
1381
 
1382
  summary_df = pd.DataFrame(st.session_state["acutual_predicted"])
1383
+ summary_df.drop_duplicates(subset="Channel_name", keep="last", inplace=True)
 
 
1384
 
1385
  summary_df_sorted = summary_df.sort_values(by="Delta", ascending=False)
1386
  summary_df_sorted["Delta_percent"] = np.round(
1387
+ ((summary_df_sorted["Optimized_spend"] / summary_df_sorted["Actual_spend"]) - 1)
 
 
 
 
 
 
1388
  * 100,
1389
  2,
1390
  )
 
1412
  authenticator.forgot_password("Forgot password")
1413
  )
1414
  if username_forgot_pw:
1415
+ st.session_state["config"]["credentials"]["usernames"][username_forgot_pw][
1416
+ "password"
1417
+ ] = stauth.Hasher([random_password]).generate()[0]
1418
  send_email(email_forgot_password, random_password)
1419
  st.success("New password sent securely")
1420
  # Random password to be transferred to user securely
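The spends and sales callbacks in this page (update_all_spends_abs, update_spends, update_sales_abs, update_sales, and their slider variants) keep the absolute text input, the percent number input, and the select-slider consistent through the same two conversions. A minimal sketch of that round-trip follows; the total is an illustrative stand-in for _scenario.actual_total_spends, and plain floats replace the numerize / extract_number_for_string formatting helpers used by the app.

actual_total_spends = 1_250_000.0  # illustrative stand-in, not a value from the app

def percent_from_absolute(modified_abs: float) -> float:
    # update_all_spends_abs / update_sales_abs: absolute value -> percent change
    return ((modified_abs / actual_total_spends) - 1) * 100

def absolute_from_percent(percent_change: float) -> float:
    # update_spends / update_sales: percent change -> absolute value
    return (1 + percent_change / 100) * actual_total_spends

# Round-tripping a +12% change should land back on +12%
assert abs(percent_from_absolute(absolute_from_percent(12.0)) - 12.0) < 1e-9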
requirements.txt CHANGED
@@ -1,102 +1,94 @@
1
- altair==5.2.0
2
- annotated-types==0.6.0
3
- attrs==23.2.0
4
- bcrypt==4.1.2
5
- blinker==1.7.0
6
- cachetools==5.3.2
7
- certifi==2024.2.2
8
- charset-normalizer==3.3.2
9
- click==8.1.7
10
- colorama==0.4.6
11
- contourpy==1.2.0
12
- cycler==0.12.1
13
- dacite==1.8.1
14
- et-xmlfile==1.1.0
15
- extra-streamlit-components==0.1.56
16
- fonttools==4.49.0
17
- gitdb==4.0.11
18
- GitPython==3.1.42
19
- htmlmin==0.1.12
20
- idna==3.6
21
- ImageHash==4.3.1
22
- importlib-metadata==7.0.1
23
- importlib-resources==6.1.1
24
- Jinja2==3.1.3
25
- joblib==1.3.2
26
- jsonschema==4.21.1
27
- jsonschema-specifications==2023.12.1
28
- kiwisolver==1.4.5
29
- llvmlite==0.41.1
30
- markdown-it-py==3.0.0
31
- MarkupSafe==2.1.5
32
- matplotlib==3.7.0
33
- matplotlib-inline==0.1.6
34
- mdurl==0.1.2
35
- multimethod==1.11.2
36
- networkx==3.2.1
37
- numba==0.58.1
38
- numerize==0.12
39
- numpy==1.23.5
40
- openpyxl==3.0.10
41
- packaging==23.2
42
- pandas==1.5.2
43
- pandas-profiling==3.6.6
44
- patsy==0.5.6
45
- phik==0.12.4
46
- pillow==10.2.0
47
- pip==24.0
48
- plotly==5.11.0
49
- plotly-express==0.4.1
50
- protobuf==4.25.3
51
- pyarrow==15.0.0
52
- pydantic==2.6.3
53
- pydantic-core==2.16.3
54
- pydantic-settings==2.2.1
55
- pydeck==0.8.1b0
56
- Pygments==2.17.2
57
- PyJWT==2.8.0
58
- pyparsing==3.1.1
59
- python-dateutil==2.8.2
60
- python-decouple==3.8
61
- python-dotenv==1.0.1
62
- pytz==2024.1
63
- PyWavelets==1.5.0
64
- PyYAML==6.0.1
65
- referencing==0.33.0
66
- requests==2.31.0
67
- rich==13.7.0
68
- rpds-py==0.18.0
69
- scikit-learn==1.1.3
70
- scipy==1.11.4
71
- seaborn==0.12.2
72
- setuptools==69.1.0
73
- six==1.16.0
74
- smmap==5.0.1
75
- statsmodels==0.14.0
76
- streamlit==1.31.0
77
- streamlit-aggrid==0.3.4.post3
78
- streamlit-authenticator==0.2.1
79
- streamlit-chat==0.1.1
80
- streamlit-pandas-profiling==0.1.3
81
- sweetviz==2.2.1
82
- tangled-up-in-unicode==0.2.0
83
- tenacity==8.2.3
84
- threadpoolctl==3.3.0
85
- toml==0.10.2
86
- toolz==0.12.1
87
- tornado==6.4
88
- tqdm==4.66.2
89
- traitlets==5.14.1
90
- typeguard==4.1.5
91
- typing-extensions==4.9.0
92
- tzdata==2024.1
93
- tzlocal==5.2
94
- urllib3==2.2.1
95
- validators==0.22.0
96
- visions==0.7.5
97
- watchdog==4.0.0
98
- wheel==0.42.0
99
- wordcloud==1.9.3
100
- ydata-profiling==4.6.5
101
- zipp==3.17.0
102
-
 
1
+ altair == 4.2.0
2
+ attrs == 23.1.0
3
+ bcrypt == 4.0.1
4
+ blinker == 1.6.2
5
+ cachetools == 5.3.1
6
+ certifi == 2023.7.22
7
+ charset-normalizer == 3.2.0
8
+ click == 8.1.7
9
+ colorama == 0.4.6
10
+ contourpy == 1.1.1
11
+ cycler == 0.11.0
12
+ dacite == 1.8.1
13
+ entrypoints == 0.4
14
+ et-xmlfile == 1.1.0
15
+ extra-streamlit-components == 0.1.56
16
+ fonttools == 4.42.1
17
+ gitdb == 4.0.10
18
+ GitPython == 3.1.35
19
+ htmlmin == 0.1.12
20
+ idna == 3.4
21
+ ImageHash == 4.3.1
22
+ importlib-metadata == 6.8.0
23
+ importlib-resources == 6.1.0
24
+ Jinja2 == 3.1.2
25
+ joblib == 1.3.2
26
+ jsonschema == 4.19.0
27
+ jsonschema-specifications== 2023.7.1
28
+ kaleido == 0.2.1
29
+ kiwisolver == 1.4.5
30
+ markdown-it-py == 3.0.0
31
+ MarkupSafe == 2.1.3
32
+ matplotlib == 3.7.0
33
+ mdurl == 0.1.2
34
+ networkx == 3.1
35
+ numerize == 0.12
36
+ numpy == 1.23.5
37
+ openpyxl>=3.1.0
38
+ packaging == 23.1
39
+ pandas == 1.5.2
40
+ pandas-profiling == 3.6.6
41
+ patsy == 0.5.3
42
+ phik == 0.12.3
43
+ Pillow == 10.0.0
44
+ pip == 23.2.1
45
+ plotly == 5.11.0
46
+ protobuf == 3.20.3
47
+ pyarrow == 13.0.0
48
+ pydantic == 1.10.13
49
+ pydeck == 0.8.1b0
50
+ Pygments == 2.16.1
51
+ PyJWT == 2.8.0
52
+ Pympler == 1.0.1
53
+ pyparsing == 3.1.1
54
+ python-dateutil == 2.8.2
55
+ python-decouple == 3.8
56
+ pytz == 2023.3.post1
57
+ PyWavelets == 1.4.1
58
+ PyYAML == 6.0.1
59
+ referencing == 0.30.2
60
+ requests == 2.31.0
61
+ rich == 13.5.2
62
+ rpds-py == 0.10.2
63
+ scikit-learn == 1.1.3
64
+ scipy == 1.9.3
65
+ seaborn == 0.12.2
66
+ semver == 3.0.1
67
+ setuptools == 68.1.2
68
+ six == 1.16.0
69
+ smmap == 5.0.0
70
+ statsmodels == 0.14.0
71
+ streamlit == 1.16.0
72
+ streamlit-aggrid == 0.3.4.post3
73
+ streamlit-authenticator == 0.2.1
74
+ streamlit-pandas-profiling== 0.1.3
75
+ sweetviz == 2.2.1
76
+ tangled-up-in-unicode == 0.2.0
77
+ tenacity == 8.2.3
78
+ threadpoolctl == 3.2.0
79
+ toml == 0.10.2
80
+ toolz == 0.12.0
81
+ tornado == 6.3.3
82
+ tqdm == 4.66.1
83
+ typeguard == 2.13.3
84
+ typing_extensions == 4.7.1
85
+ tzdata == 2023.3
86
+ tzlocal == 5.0.1
87
+ urllib3 == 2.0.4
88
+ validators == 0.22.0
89
+ visions == 0.7.5
90
+ watchdog == 3.0.0
91
+ wheel == 0.41.2
92
+ wordcloud == 1.9.2
93
+ ydata-profiling == 4.5.1
94
+ zipp == 3.16.2
 
 
 
 
 
 
 
 
summary_df.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0250c8a092d14c32845f27f6cddb2ac8131f8c280d38489294da847adf61c4e7
3
  size 1482
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f2aa1b3c4f759d4179abf2dbed90751ec0849b3750a1019827173d2152954ac
3
  size 1482
tuned_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9871c17d7d10846b84c31343a1b9fc3ad87c1a67fa8bf8b10b2199032a1581be
3
+ size 4287842
upf_data_converted_old.csv ADDED
The diff for this file is too large to render. See raw diff
 
upf_data_converted_old.xlsx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92716069afa2c16a8afb6494da6d5f93878558de0215b1b9334ffeb997fdc6b6
3
+ size 1561111
upf_data_converted_randomized_resp_metrics.csv ADDED
The diff for this file is too large to render. See raw diff
 
upf_data_converted_randomized_resp_metrics.xlsx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf24972737d4c10d274ce6e3165551442e662992623754dbef11155f4b177531
3
+ size 1893805
utilities.py CHANGED
@@ -12,7 +12,6 @@ import io
12
  import plotly
13
  from pathlib import Path
14
  import pickle
15
- import streamlit_authenticator as stauth
16
  import yaml
17
  from yaml import SafeLoader
18
  from streamlit.components.v1 import html
@@ -24,27 +23,59 @@ import os
24
  import base64
25
 
26
 
 
 
 
27
 
28
 
29
- color_palette = ['#F3F3F0', '#5E7D7E', '#2FA1FF', '#00EDED', '#00EAE4', '#304550', '#EDEBEB', '#7FBEFD', '#003059', '#A2F3F3', '#E1D6E2', '#B6B6B6']
30
 
 
31
 
32
- CURRENCY_INDICATOR = '$'
33
 
34
  def load_authenticator():
35
- with open('config.yaml') as file:
36
  config = yaml.load(file, Loader=SafeLoader)
37
- st.session_state['config'] = config
38
  authenticator = stauth.Authenticate(
39
- config['credentials'],
40
- config['cookie']['name'],
41
- config['cookie']['key'],
42
- config['cookie']['expiry_days'],
43
- config['preauthorized']
44
  )
45
- st.session_state['authenticator'] = authenticator
46
  return authenticator
47
 
 
 
 
 
 
48
  def nav_page(page_name, timeout_secs=3):
49
  nav_script = """
50
  <script type="text/javascript">
@@ -67,7 +98,10 @@ def nav_page(page_name, timeout_secs=3):
67
  attempt_nav_page("%s", new Date(), %d);
68
  });
69
  </script>
70
- """ % (page_name, timeout_secs)
 
 
 
71
  html(nav_script)
72
 
73
 
@@ -92,23 +126,18 @@ data_url = base64.b64encode(contents).decode("utf-8")
92
 
93
  file_.close()
94
 
95
-
96
 
97
- DATA_PATH = './data'
98
 
99
- IMAGES_PATH = './data/images_224_224'
100
 
101
-
102
 
103
  def load_local_css(file_name):
104
 
105
  with open(file_name) as f:
106
 
107
- st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)
108
 
109
-
110
-
111
-
112
 
113
  # def set_header():
114
 
@@ -129,24 +158,24 @@ data_url1 = base64.b64encode(contents1).decode("utf-8")
129
 
130
  file_1.close()
131
 
132
-
133
-
134
- DATA_PATH1 = './data'
135
-
136
- IMAGES_PATH1 = './data/images_224_224'
137
-
138
 
 
139
 
 
140
 
141
 
142
  def set_header():
143
- return st.markdown(f"""<div class='main-header'>
 
144
  <!-- <h1></h1> -->
145
  <div >
146
  <img class='blend-logo' src="data:image;base64,{data_url1}", alt="Logo">
147
  </div>
148
  <img class='blend-logo' src="data:image;base64,{data_url}", alt="Logo">
149
- </div>""", unsafe_allow_html=True)
 
 
 
150
 
151
  # def set_header():
152
  # logo_path = "./path/to/your/local/LIME_logo.png" # Replace with the actual file path
@@ -157,51 +186,87 @@ def set_header():
157
  # </div>""", unsafe_allow_html=True)
158
 
159
 
160
- def s_curve(x,K,b,a,x0):
161
- return K / (1 + b * np.exp(-a*(x-x0)))
 
 
 
 
 
 
 
 
 
162
 
163
- def initialize_data():
 
 
 
 
 
 
 
 
 
 
 
164
  # uopx_conv_rates = {'streaming_impressions' : 0.007,'digital_impressions' : 0.007,'search_clicks' : 0.00719,'tv_impressions' : 0.000173,
165
  # "digital_clicks":0.005,"streaming_clicks":0.004,'streaming_spends':1,"tv_spends":1,"search_spends":1,
166
  # "digital_spends":1}
167
- #print('State initialized')
168
- excel = pd.read_excel("Overview_data_test.xlsx",sheet_name=None)
169
- raw_df = excel['RAW DATA MMM']
170
-
171
- spend_df = excel['SPEND INPUT']
172
- contri_df = excel['CONTRIBUTION MMM']
173
- #Revenue_df = excel['Revenue']
174
-
175
- ## remove sesonalities, indices etc ...
176
- exclude_columns = ['Date',
177
- 'Region',
178
- 'Controls_Grammarly_Index_SeasonalAVG',
179
- 'Controls_Quillbot_Index',
180
- 'Daily_Positive_Outliers',
181
- 'External_RemoteClass_Index',
182
- 'Intervals ON 20190520-20190805 | 20200518-20200803 | 20210517-20210802',
183
- 'Intervals ON 20190826-20191209 | 20200824-20201207 | 20210823-20211206',
184
- 'Intervals ON 20201005-20201019',
185
- 'Promotion_PercentOff',
186
- 'Promotion_TimeBased',
187
- 'Seasonality_Indicator_Chirstmas',
188
- 'Seasonality_Indicator_NewYears_Days',
189
- 'Seasonality_Indicator_Thanksgiving',
190
- 'Trend 20200302 / 20200803',
191
- ]
192
- raw_df['Date']=pd.to_datetime(raw_df['Date'])
193
- contri_df['Date']=pd.to_datetime(contri_df['Date'])
194
- input_df = raw_df.sort_values(by='Date')
195
- output_df = contri_df.sort_values(by='Date')
196
- spend_df['Week'] = pd.to_datetime(spend_df['Week'], format='%Y-%m-%d', errors='coerce')
197
- spend_df.sort_values(by='Week', inplace=True)
 
 
 
 
198
 
199
  # spend_df['Week'] = pd.to_datetime(spend_df['Week'], errors='coerce')
200
  # spend_df = spend_df.sort_values(by='Week')
201
-
202
 
203
  channel_list = [col for col in input_df.columns if col not in exclude_columns]
204
-
 
205
  response_curves = {}
206
  mapes = {}
207
  rmses = {}
@@ -215,14 +280,14 @@ def initialize_data():
215
  dates = input_df.Date.values
216
  actual_output_dic = {}
217
  actual_input_dic = {}
218
-
219
  for inp_col in channel_list:
220
- #st.write(inp_col)
221
  spends = input_df[inp_col].values
222
  x = spends.copy()
223
- # upper limit for penalty
224
- upper_limits[inp_col] = 2*x.max()
225
-
226
  # contribution
227
  out_col = [_col for _col in output_df.columns if _col.startswith(inp_col)][0]
228
  y = output_df[out_col].values.copy()
@@ -230,96 +295,141 @@ def initialize_data():
230
  actual_input_dic[inp_col] = x.copy()
231
  ##output cols aggregation
232
  output_cols.append(out_col)
233
-
234
  ## scale the input
235
- power = (np.ceil(np.log(x.max()) / np.log(10) )- 3)
236
- if power >= 0 :
237
  x = x / 10**power
238
-
239
-
240
- x = x.astype('float64')
241
- y = y.astype('float64')
242
- #print('#printing yyyyyyyyy')
243
- #print(inp_col)
244
- #print(x.max())
245
- #print(y.max())
246
- bounds = ((0, 0, 0, 0), (3*y.max(), 1000, 1, x.max()))
247
-
248
- #bounds = ((y.max(), 3*y.max()),(0,1000),(0,1),(0,x.max()))
249
- params,_ = curve_fit(s_curve,x,y,p0=(2*y.max(),0.01,1e-5,x.max()),
250
- bounds=bounds,
251
- maxfev=int(1e5))
 
 
 
 
252
  mape = (100 * abs(1 - s_curve(x, *params) / y.clip(min=1))).mean()
253
- rmse = np.sqrt(((y - s_curve(x,*params))**2).mean())
254
- r2_ = r2_score(y, s_curve(x,*params))
 
 
 
 
 
 
 
 
 
 
 
255
 
256
- response_curves[inp_col] = {'K' : params[0], 'b' : params[1], 'a' : params[2], 'x0' : params[3]}
257
  mapes[inp_col] = mape
258
  rmses[inp_col] = rmse
259
  r2[inp_col] = r2_
260
  powers[inp_col] = power
261
-
262
-
263
  ## conversion rates
264
- spend_col = [_col for _col in spend_df.columns if _col.startswith(inp_col.rsplit('_',1)[0])][0]
265
-
266
- #print('#printing spendssss')
267
- #print(spend_col)
268
- conv = (spend_df.set_index('Week')[spend_col] / input_df.set_index('Date')[inp_col].clip(lower=1)).reset_index()
269
- conv.rename(columns={'index':'Week'},inplace=True)
270
- conv['year'] = conv.Week.dt.year
271
- conv_rates[inp_col] = list(conv.drop('Week',axis=1).mean().to_dict().values())[0]
 
 
 
 
 
 
 
 
 
272
  ##print('Before',conv_rates[inp_col])
273
  # conv_rates[inp_col] = uopx_conv_rates[inp_col]
274
  ##print('After',(conv_rates[inp_col]))
275
-
276
-
277
- channel = Channel(name=inp_col,dates=dates,
278
- spends=spends,
279
- # conversion_rate = np.mean(list(conv_rates[inp_col].values())),
280
- conversion_rate = conv_rates[inp_col],
281
- response_curve_type='s-curve',
282
- response_curve_params={'K' : params[0], 'b' : params[1], 'a' : params[2], 'x0' : params[3]},
283
- bounds=np.array([-10,10]))
 
 
 
 
 
 
 
284
  channels[inp_col] = channel
285
  if sales is None:
286
  sales = channel.actual_sales
287
  else:
288
  sales += channel.actual_sales
289
- other_contributions = output_df.drop([*output_cols], axis=1).sum(axis=1, numeric_only = True).values
290
- correction = output_df.drop('Date',axis=1).sum(axis=1).values - (sales + other_contributions)
291
- scenario = Scenario(name='default', channels=channels, constant=other_contributions, correction = correction)
292
  ## setting session variables
293
- st.session_state['initialized'] = True
294
- st.session_state['actual_df'] = input_df
295
- st.session_state['raw_df'] = raw_df
296
- st.session_state['contri_df'] = output_df
297
  default_scenario_dict = class_to_dict(scenario)
298
- st.session_state['default_scenario_dict'] = default_scenario_dict
299
- st.session_state['scenario'] = scenario
300
- st.session_state['channels_list'] = channel_list
301
- st.session_state['optimization_channels'] = {channel_name : False for channel_name in channel_list}
302
- st.session_state['rcs'] = response_curves
303
- st.session_state['powers'] = powers
304
- st.session_state['actual_contribution_df'] = pd.DataFrame(actual_output_dic)
305
- st.session_state['actual_input_df'] = pd.DataFrame(actual_input_dic)
306
-
 
  for channel in channels.values():
308
- st.session_state[channel.name] = numerize(channel.actual_total_spends * channel.conversion_rate,1)
309
-
310
- st.session_state['xlsx_buffer'] = io.BytesIO()
311
-
312
-
313
- if Path('../saved_scenarios.pkl').exists():
314
- with open('../saved_scenarios.pkl','rb') as f:
315
- st.session_state['saved_scenarios'] = pickle.load(f)
 
316
  else:
317
- st.session_state['saved_scenarios'] = OrderedDict()
318
-
319
- st.session_state['total_spends_change'] = 0
320
- st.session_state['optimization_channels'] = {channel_name : False for channel_name in channel_list}
321
- st.session_state['disable_download_button'] = True
322
-
323
  # def initialize_data():
324
  # # fetch data from excel
325
  # output = pd.read_excel('data.xlsx',sheet_name=None)
@@ -335,17 +445,17 @@ def initialize_data():
335
  # channel_list.append(col)
336
  # else:
337
  # pass
338
-
339
  # ## NOTE : Considered only Desktop spends for all calculations
340
  # acutal_df = raw_df[raw_df.Region == 'Desktop'].copy()
341
  # ## NOTE : Considered one year of data
342
  # acutal_df = acutal_df[acutal_df.Date>'2020-12-31']
343
  # actual_df = acutal_df.drop('Region',axis=1).sort_values(by='Date')[[*channel_list,'Date']]
344
-
345
  # ##load response curves
346
  # with open('./grammarly_response_curves.json','r') as f:
347
  # response_curves = json.load(f)
348
-
349
  # ## create channel dict for scenario creation
350
  # dates = actual_df.Date.values
351
  # channels = {}
@@ -363,15 +473,15 @@ def initialize_data():
363
  # response_curve_type=response_curve_type,
364
  # response_curve_params=response_curve_params,
365
  # bounds=np.array([-30,30]))
366
-
367
  # channels[name] = channel
368
  # else:
369
  # constant = info_dict.get('value',0.) * len(dates)
370
-
371
  # ## create scenario
372
  # scenario = Scenario(name='default', channels=channels, constant=constant)
373
  # default_scenario_dict = class_to_dict(scenario)
374
-
375
 
376
  # ## setting session variables
377
  # st.session_state['initialized'] = True
@@ -385,7 +495,7 @@ def initialize_data():
385
  # for channel in channels.values():
386
  # if channel.name not in st.session_state:
387
  # st.session_state[channel.name] = float(channel.actual_total_spends)
388
-
389
  # if 'xlsx_buffer' not in st.session_state:
390
  # st.session_state['xlsx_buffer'] = io.BytesIO()
391
 
@@ -394,51 +504,121 @@ def initialize_data():
394
  # if Path('../saved_scenarios.pkl').exists():
395
  # with open('../saved_scenarios.pkl','rb') as f:
396
  # st.session_state['saved_scenarios'] = pickle.load(f)
397
-
398
  # else:
399
  # st.session_state['saved_scenarios'] = OrderedDict()
400
 
401
  # if 'total_spends_change' not in st.session_state:
402
  # st.session_state['total_spends_change'] = 0
403
-
404
  # if 'optimization_channels' not in st.session_state:
405
  # st.session_state['optimization_channels'] = {channel_name : False for channel_name in channel_list}
406
-
407
  # if 'disable_download_button' not in st.session_state:
408
  # st.session_state['disable_download_button'] = True
409
-
410
-
411
  def create_channel_summary(scenario):
412
 
413
  # Provided data
414
  data = {
415
- 'Channel': ['Paid Search', 'Ga will cid baixo risco', 'Digital tactic others', 'Fb la tier 1', 'Fb la tier 2', 'Paid social others', 'Programmatic', 'Kwai', 'Indicacao', 'Infleux', 'Influencer'],
416
- 'Spends': ['$ 11.3K', '$ 155.2K', '$ 50.7K', '$ 125.4K', '$ 125.2K', '$ 105K', '$ 3.3M', '$ 47.5K', '$ 55.9K', '$ 632.3K', '$ 48.3K'],
417
- 'Revenue': ['558.0K', '3.5M', '5.2M', '3.1M', '3.1M', '2.1M', '20.8M', '1.6M', '728.4K', '22.9M', '4.8M']
 
418
  }
419
 
420
  # Create DataFrame
421
  df = pd.DataFrame(data)
422
 
423
  # Convert currency strings to numeric values
424
- df['Spends'] = df['Spends'].replace({'\$': '', 'K': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(int)
425
- df['Revenue'] = df['Revenue'].replace({'\$': '', 'K': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(int)
 
426
 
427
  # Calculate ROI
428
- df['ROI'] = ((df['Revenue'] - df['Spends']) / df['Spends'])
429
 
430
  # Format columns
431
  format_currency = lambda x: f"${x:,.1f}"
432
  format_roi = lambda x: f"{x:.1f}"
433
 
434
- df['Spends'] = ['$ 11.3K', '$ 155.2K', '$ 50.7K', '$ 125.4K', '$ 125.2K', '$ 105K', '$ 3.3M', '$ 47.5K', '$ 55.9K', '$ 632.3K', '$ 48.3K']
435
- df['Revenue'] = ['$ 536.3K', '$ 3.4M', '$ 5M', '$ 3M', '$ 3M', '$ 2M', '$ 20M', '$ 1.5M', '$ 7.1M', '$ 22M', '$ 4.6M']
436
- df['ROI'] = df['ROI'].apply(format_roi)
437
-
438
  return df
439
 
440
 
441
- #@st.cache(allow_output_mutation=True)
442
  # def create_contribution_pie(scenario):
443
  # #c1f7dc
444
  # colors_map = {col:color for col,color in zip(st.session_state['channels_list'],plotly.colors.n_colors(plotly.colors.hex_to_rgb('#BE6468'), plotly.colors.hex_to_rgb('#E7B8B7'),23))}
@@ -470,23 +650,23 @@ def create_channel_summary(scenario):
470
  # weekly_spends_data = []
471
  # weekly_sales_data = []
472
  # for channel_name in st.session_state['channels_list']:
473
- # weekly_spends_data.append((go.Bar(x=x,
474
  # y=scenario.channels[channel_name].actual_spends * scenario.channels[channel_name].conversion_rate,
475
- # name=channel_name_formating(channel_name),
476
  # hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}",
477
  # legendgroup=channel_name)))
478
- # weekly_sales_data.append((go.Bar(x=x,
479
  # y=scenario.channels[channel_name].actual_sales,
480
- # name=channel_name_formating(channel_name),
481
  # hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
482
  # legendgroup=channel_name, showlegend=False)))
483
  # for _d in weekly_spends_data:
484
  # weekly_contribution_fig.add_trace(_d, row=1, col=1)
485
  # for _d in weekly_sales_data:
486
  # weekly_contribution_fig.add_trace(_d, row=1, col=2)
487
- # weekly_contribution_fig.add_trace(go.Bar(x=x,
488
  # y=scenario.constant + scenario.correction,
489
- # name='Non Media',
490
  # hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}"), row=1, col=2)
491
  # weekly_contribution_fig.update_layout(barmode='stack', title='Channel contribuion by week', xaxis_title='Date')
492
  # weekly_contribution_fig.update_xaxes(showgrid=False)
@@ -524,14 +704,50 @@ def create_channel_summary(scenario):
524
 
525
 
526
  def create_contribution_pie():
527
- color_palette = ['#F3F3F0', '#5E7D7E', '#2FA1FF', '#00EDED', '#00EAE4', '#304550', '#EDEBEB', '#7FBEFD', '#003059', '#A2F3F3', '#E1D6E2', '#B6B6B6']
528
- total_contribution_fig = make_subplots(rows=1, cols=2, subplot_titles=['Spends', 'Revenue'], specs=[[{"type": "pie"}, {"type": "pie"}]])
529
 
530
- channels_list = ['Paid Search', 'Ga will cid baixo risco', 'Digital tactic others', 'Fb la tier 1', 'Fb la tier 2', 'Paid social others', 'Programmatic', 'Kwai', 'Indicacao', 'Infleux', 'Influencer', 'Non Media']
531
 
532
  # Assign colors from the limited palette to channels
533
- colors_map = {col: color_palette[i % len(color_palette)] for i, col in enumerate(channels_list)}
534
- colors_map['Non Media'] = color_palette[5] # Assign fixed green color for 'Non Media'
535
 
536
  # Hardcoded values for Spends and Revenue
537
  spends_values = [0.5, 3.36, 1.1, 2.7, 2.7, 2.27, 70.6, 1, 1, 13.7, 1, 0]
@@ -542,10 +758,13 @@ def create_contribution_pie():
542
  go.Pie(
543
  labels=[channel_name for channel_name in channels_list],
544
  values=spends_values,
545
- marker=dict(colors=[colors_map[channel_name] for channel_name in channels_list]),
546
- hole=0.3
 
 
547
  ),
548
- row=1, col=1
 
549
  )
550
 
551
  # Add trace for Revenue pie chart
@@ -553,144 +772,196 @@ def create_contribution_pie():
553
  go.Pie(
554
  labels=[channel_name for channel_name in channels_list],
555
  values=revenue_values,
556
- marker=dict(colors=[colors_map[channel_name] for channel_name in channels_list]),
557
- hole=0.3
 
 
558
  ),
559
- row=1, col=2
560
  )
561
-
562
- total_contribution_fig.update_traces(textposition='inside', texttemplate='%{percent:.1%}')
563
- total_contribution_fig.update_layout(uniformtext_minsize=12, title='Channel contribution', uniformtext_mode='hide')
564
  return total_contribution_fig
565
 
 
566
  def create_contribuion_stacked_plot(scenario):
567
- weekly_contribution_fig = make_subplots(rows=1, cols=2, subplot_titles=['Spends', 'Revenue'], specs=[[{"type": "bar"}, {"type": "bar"}]])
568
- raw_df = st.session_state['raw_df']
569
- df = raw_df.sort_values(by='Date')
570
  x = df.Date
571
  weekly_spends_data = []
572
  weekly_sales_data = []
573
-
574
- for i, channel_name in enumerate(st.session_state['channels_list']):
575
  color = color_palette[i % len(color_palette)]
576
-
577
- weekly_spends_data.append(go.Bar(
578
- x=x,
579
- y=scenario.channels[channel_name].actual_spends * scenario.channels[channel_name].conversion_rate,
580
- name=channel_name_formating(channel_name),
581
- hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}",
582
- legendgroup=channel_name,
583
- marker_color=color,
584
- ))
585
-
586
- weekly_sales_data.append(go.Bar(
587
- x=x,
588
- y=scenario.channels[channel_name].actual_sales,
589
- name=channel_name_formating(channel_name),
590
- hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
591
- legendgroup=channel_name,
592
- showlegend=False,
593
- marker_color=color,
594
- ))
595
-
596
  for _d in weekly_spends_data:
597
  weekly_contribution_fig.add_trace(_d, row=1, col=1)
598
  for _d in weekly_sales_data:
599
  weekly_contribution_fig.add_trace(_d, row=1, col=2)
600
-
601
- weekly_contribution_fig.add_trace(go.Bar(
602
- x=x,
603
- y=scenario.constant + scenario.correction,
604
- name='Non Media',
605
- hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
606
- marker_color=color_palette[-1],
607
- ), row=1, col=2)
608
-
609
- weekly_contribution_fig.update_layout(barmode='stack', title='Channel contribution by week', xaxis_title='Date')
 
610
  weekly_contribution_fig.update_xaxes(showgrid=False)
611
  weekly_contribution_fig.update_yaxes(showgrid=False)
612
  return weekly_contribution_fig
613
 
 
614
  def create_channel_spends_sales_plot(channel):
615
  if channel is not None:
616
  x = channel.dates
617
  _spends = channel.actual_spends * channel.conversion_rate
618
  _sales = channel.actual_sales
619
  channel_sales_spends_fig = make_subplots(specs=[[{"secondary_y": True}]])
620
- channel_sales_spends_fig.add_trace(go.Bar(
621
- x=x,
622
- y=_sales,
623
- marker_color=color_palette[3], # You can choose a color from the palette
624
- name='Revenue',
625
- hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
626
- ), secondary_y=False)
627
-
628
- channel_sales_spends_fig.add_trace(go.Scatter(
629
- x=x,
630
- y=_spends,
631
- line=dict(color=color_palette[2]), # You can choose another color from the palette
632
- name='Spends',
633
- hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}",
634
- ), secondary_y=True)
635
-
636
- channel_sales_spends_fig.update_layout(xaxis_title='Date', yaxis_title='Revenue', yaxis2_title='Spends ($)', title='Channel spends and Revenue week-wise')
637
  channel_sales_spends_fig.update_xaxes(showgrid=False)
638
  channel_sales_spends_fig.update_yaxes(showgrid=False)
639
  else:
640
- raw_df = st.session_state['raw_df']
641
- df = raw_df.sort_values(by='Date')
642
  x = df.Date
643
- scenario = class_from_dict(st.session_state['default_scenario_dict'])
644
  _sales = scenario.constant + scenario.correction
645
  channel_sales_spends_fig = make_subplots(specs=[[{"secondary_y": True}]])
646
- channel_sales_spends_fig.add_trace(go.Bar(
647
- x=x,
648
- y=_sales,
649
- marker_color=color_palette[0], # You can choose a color from the palette
650
- name='Revenue',
651
- hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
652
- ), secondary_y=False)
653
-
654
- channel_sales_spends_fig.update_layout(xaxis_title='Date', yaxis_title='Revenue', yaxis2_title='Spends ($)', title='Channel spends and Revenue week-wise')
 
655
  channel_sales_spends_fig.update_xaxes(showgrid=False)
656
  channel_sales_spends_fig.update_yaxes(showgrid=False)
657
-
658
  return channel_sales_spends_fig
659
 
660
- def format_numbers(value, n_decimals=1,include_indicator = True):
 
661
  if include_indicator:
662
- return f'{CURRENCY_INDICATOR} {numerize(value,n_decimals)}'
663
  else:
664
- return f'{numerize(value,n_decimals)}'
665
 
666
 
667
- def decimal_formater(num_string,n_decimals=1):
668
- parts = num_string.split('.')
669
  if len(parts) == 1:
670
- return num_string+'.' + '0'*n_decimals
671
  else:
672
  to_be_padded = n_decimals - len(parts[-1])
673
- if to_be_padded > 0 :
674
- return num_string+'0'*to_be_padded
675
  else:
676
  return num_string
677
-
678
-
679
  def channel_name_formating(channel_name):
680
- name_mod = channel_name.replace('_', ' ')
681
- if name_mod.lower().endswith(' imp'):
682
- name_mod = name_mod.replace('Imp','Spend')
683
- elif name_mod.lower().endswith(' clicks'):
684
- name_mod = name_mod.replace('Clicks','Spend')
685
  return name_mod
686
 
687
 
688
- def send_email(email,message):
689
- s = smtplib.SMTP('smtp.gmail.com', 587)
690
  s.starttls()
691
  s.login("geethu4444@gmail.com", "jgydhpfusuremcol")
692
  s.sendmail("geethu4444@gmail.com", email, message)
693
  s.quit()
694
 
 
695
  if __name__ == "__main__":
696
  initialize_data()
 
12
  import plotly
13
  from pathlib import Path
14
  import pickle
 
15
  import yaml
16
  from yaml import SafeLoader
17
  from streamlit.components.v1 import html
 
23
  import base64
24
 
25
 
26
+ color_palette = [
27
+ "#F3F3F0",
28
+ "#5E7D7E",
29
+ "#2FA1FF",
30
+ "#00EDED",
31
+ "#00EAE4",
32
+ "#304550",
33
+ "#EDEBEB",
34
+ "#7FBEFD",
35
+ "#003059",
36
+ "#A2F3F3",
37
+ "#E1D6E2",
38
+ "#B6B6B6",
39
+ ]
40
 
41
 
42
+ CURRENCY_INDICATOR = "$"
43
 
44
+ import streamlit_authenticator as stauth
45
 
 
46
 
47
  def load_authenticator():
48
+ with open("config.yaml") as file:
49
  config = yaml.load(file, Loader=SafeLoader)
50
+ st.session_state["config"] = config
51
  authenticator = stauth.Authenticate(
52
+ credentials=config["credentials"],
53
+ cookie_name=config["cookie"]["name"],
54
+ key=config["cookie"]["key"],
55
+ cookie_expiry_days=config["cookie"]["expiry_days"],
56
+ preauthorized=config["preauthorized"],
57
  )
58
+ st.session_state["authenticator"] = authenticator
59
  return authenticator
60
 
61
+
62
+ # Authentication
63
+ def authentication():
64
+ with open("config.yaml") as file:
65
+ config = yaml.load(file, Loader=SafeLoader)
66
+
67
+ authenticator = stauth.Authenticate(
68
+ config["credentials"],
69
+ config["cookie"]["name"],
70
+ config["cookie"]["key"],
71
+ config["cookie"]["expiry_days"],
72
+ config["preauthorized"],
73
+ )
74
+
75
+ name, authentication_status, username = authenticator.login("Login", "main")
76
+ return authenticator, name, authentication_status, username
77
+
78
+
79
  def nav_page(page_name, timeout_secs=3):
80
  nav_script = """
81
  <script type="text/javascript">
 
98
  attempt_nav_page("%s", new Date(), %d);
99
  });
100
  </script>
101
+ """ % (
102
+ page_name,
103
+ timeout_secs,
104
+ )
105
  html(nav_script)
106
 
107
 
 
126
 
127
  file_.close()
128
 
 
129
 
130
+ DATA_PATH = "./data"
131
 
132
+ IMAGES_PATH = "./data/images_224_224"
133
 
 
134
 
135
  def load_local_css(file_name):
136
 
137
  with open(file_name) as f:
138
 
139
+ st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
140
 
 
 
 
141
 
142
  # def set_header():
143
 
 
158
 
159
  file_1.close()
160
 
 
161
 
162
+ DATA_PATH1 = "./data"
163
 
164
+ IMAGES_PATH1 = "./data/images_224_224"
165
 
166
 
167
  def set_header():
168
+ return st.markdown(
169
+ f"""<div class='main-header'>
170
  <!-- <h1></h1> -->
171
  <div >
172
  <img class='blend-logo' src="data:image;base64,{data_url1}", alt="Logo">
173
  </div>
174
  <img class='blend-logo' src="data:image;base64,{data_url}", alt="Logo">
175
+ </div>""",
176
+ unsafe_allow_html=True,
177
+ )
178
+
179
 
180
  # def set_header():
181
  # logo_path = "./path/to/your/local/LIME_logo.png" # Replace with the actual file path
 
186
  # </div>""", unsafe_allow_html=True)
187
 
188
 
189
+ def s_curve(x, K, b, a, x0):
190
+ return K / (1 + b * np.exp(-a * (x - x0)))
191
+
192
+
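For reference, a minimal standalone sketch of how the s-curve response function above behaves: K is the saturation ceiling, x0 the inflection point, a the growth rate, and b scales how far below K the curve sits at x0 (the value there is K/(1+b)). The parameter values below are illustrative assumptions, not fitted outputs.

import numpy as np

def s_curve(x, K, b, a, x0):
    return K / (1 + b * np.exp(-a * (x - x0)))

spends = np.linspace(0, 200, 5)  # hypothetical scaled spend values
print(np.round(s_curve(spends, K=1000, b=0.5, a=0.05, x0=100), 1))
# output rises from near zero toward the ceiling K as spend grows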
193
+ def panel_level(input_df, date_column="Date"):
194
+ # Ensure 'Date' is set as the index
195
+ if date_column not in input_df.index.names:
196
+ input_df = input_df.set_index(date_column)
197
+
198
+ # Select numeric columns only (excluding 'Date' since it's now the index)
199
+ numeric_columns_df = input_df.select_dtypes(include="number")
200
 
201
+ # Group by 'Date' (which is the index) and sum the numeric columns
202
+ aggregated_df = numeric_columns_df.groupby(input_df.index).sum()
203
+
204
+ # Reset index if you want 'Date' back as a column
205
+ aggregated_df = aggregated_df.reset_index()
206
+
207
+ return aggregated_df
208
+
209
+
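A small usage example for panel_level, assuming the function defined above is in scope; the toy frame and its column names are hypothetical.

import pandas as pd

toy = pd.DataFrame({
    "Date": ["2021-01-04", "2021-01-04", "2021-01-11", "2021-01-11"],
    "Panel": ["A", "B", "A", "B"],
    "paid_search_clicks": [10, 5, 20, 8],
})
print(panel_level(toy, date_column="Date"))
# one row per Date; numeric columns are summed across panels (15 and 28),
# while the non-numeric 'Panel' column is dropped by select_dtypes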
210
+ def initialize_data(
211
+ panel=None, target_file="Overview_data_test.xlsx", updated_rcs=None, metrics=None
212
+ ):
213
  # uopx_conv_rates = {'streaming_impressions' : 0.007,'digital_impressions' : 0.007,'search_clicks' : 0.00719,'tv_impressions' : 0.000173,
214
  # "digital_clicks":0.005,"streaming_clicks":0.004,'streaming_spends':1,"tv_spends":1,"search_spends":1,
215
  # "digital_spends":1}
216
+ # print('State initialized')
217
+
218
+ excel = pd.read_excel(target_file, sheet_name=None)
219
+
220
+ # Extract dataframes for raw data, spend input, and contribution MMM
221
+ raw_df = excel["RAW DATA MMM"]
222
+ spend_df = excel["SPEND INPUT"]
223
+ contri_df = excel["CONTRIBUTION MMM"]
224
+
225
+ # Check if the panel is not None
226
+ if panel is not None and panel != "Aggregated":
227
+ raw_df = raw_df[raw_df["Panel"] == panel].drop(columns=["Panel"])
228
+ spend_df = spend_df[spend_df["Panel"] == panel].drop(columns=["Panel"])
229
+ contri_df = contri_df[contri_df["Panel"] == panel].drop(columns=["Panel"])
230
+ elif panel == "Aggregated":
231
+ raw_df = panel_level(raw_df, date_column="Date")
232
+ spend_df = panel_level(spend_df, date_column="Week")
233
+ contri_df = panel_level(contri_df, date_column="Date")
234
+
235
+ # Revenue_df = excel['Revenue']
236
+
237
+ ## remove sesonalities, indices etc ...
238
+ exclude_columns = [
239
+ "Date",
240
+ "Region",
241
+ "Controls_Grammarly_Index_SeasonalAVG",
242
+ "Controls_Quillbot_Index",
243
+ "Daily_Positive_Outliers",
244
+ "External_RemoteClass_Index",
245
+ "Intervals ON 20190520-20190805 | 20200518-20200803 | 20210517-20210802",
246
+ "Intervals ON 20190826-20191209 | 20200824-20201207 | 20210823-20211206",
247
+ "Intervals ON 20201005-20201019",
248
+ "Promotion_PercentOff",
249
+ "Promotion_TimeBased",
250
+ "Seasonality_Indicator_Chirstmas",
251
+ "Seasonality_Indicator_NewYears_Days",
252
+ "Seasonality_Indicator_Thanksgiving",
253
+ "Trend 20200302 / 20200803",
254
+ ]
255
+ raw_df["Date"] = pd.to_datetime(raw_df["Date"])
256
+ contri_df["Date"] = pd.to_datetime(contri_df["Date"])
257
+ input_df = raw_df.sort_values(by="Date")
258
+ output_df = contri_df.sort_values(by="Date")
259
+ spend_df["Week"] = pd.to_datetime(
260
+ spend_df["Week"], format="%Y-%m-%d", errors="coerce"
261
+ )
262
+ spend_df.sort_values(by="Week", inplace=True)
263
 
264
  # spend_df['Week'] = pd.to_datetime(spend_df['Week'], errors='coerce')
265
  # spend_df = spend_df.sort_values(by='Week')
 
266
 
267
  channel_list = [col for col in input_df.columns if col not in exclude_columns]
268
+ channel_list = list(set(channel_list) - set(["fb_level_achieved_tier_1", "ga_app"]))
269
+
270
  response_curves = {}
271
  mapes = {}
272
  rmses = {}
 
280
  dates = input_df.Date.values
281
  actual_output_dic = {}
282
  actual_input_dic = {}
283
+
284
  for inp_col in channel_list:
285
+ # st.write(inp_col)
286
  spends = input_df[inp_col].values
287
  x = spends.copy()
288
+ # upper limit for penalty
289
+ upper_limits[inp_col] = 2 * x.max()
290
+
291
  # contribution
292
  out_col = [_col for _col in output_df.columns if _col.startswith(inp_col)][0]
293
  y = output_df[out_col].values.copy()
 
295
  actual_input_dic[inp_col] = x.copy()
296
  ##output cols aggregation
297
  output_cols.append(out_col)
298
+
299
  ## scale the input
300
+ power = np.ceil(np.log(x.max()) / np.log(10)) - 3
301
+ if power >= 0:
302
  x = x / 10**power
303
+
304
+ x = x.astype("float64")
305
+ y = y.astype("float64")
306
+ # print('#printing yyyyyyyyy')
307
+ # print(inp_col)
308
+ # print(x.max())
309
+ # print(y.max())
310
+ bounds = ((0, 0, 0, 0), (3 * y.max(), 1000, 1, x.max()))
311
+
312
+ # bounds = ((y.max(), 3*y.max()),(0,1000),(0,1),(0,x.max()))
313
+ params, _ = curve_fit(
314
+ s_curve,
315
+ x,
316
+ y,
317
+ p0=(2 * y.max(), 0.01, 1e-5, x.max()),
318
+ bounds=bounds,
319
+ maxfev=int(1e5),
320
+ )
321
  mape = (100 * abs(1 - s_curve(x, *params) / y.clip(min=1))).mean()
322
+ rmse = np.sqrt(((y - s_curve(x, *params)) ** 2).mean())
323
+ r2_ = r2_score(y, s_curve(x, *params))
324
+
325
+ response_curves[inp_col] = {
326
+ "K": params[0],
327
+ "b": params[1],
328
+ "a": params[2],
329
+ "x0": params[3],
330
+ }
331
+
332
+ updated_rcs_key = f"{metrics}#@{panel}#@{inp_col}"
333
+ if updated_rcs is not None and updated_rcs_key in list(updated_rcs.keys()):
334
+ response_curves[inp_col] = updated_rcs[updated_rcs_key]
335
 
 
336
  mapes[inp_col] = mape
337
  rmses[inp_col] = rmse
338
  r2[inp_col] = r2_
339
  powers[inp_col] = power
340
+
 
341
  ## conversion rates
342
+ spend_col = [
343
+ _col
344
+ for _col in spend_df.columns
345
+ if _col.startswith(inp_col.rsplit("_", 1)[0])
346
+ ][0]
347
+
348
+ # print('#printing spendssss')
349
+ # print(spend_col)
350
+ conv = (
351
+ spend_df.set_index("Week")[spend_col]
352
+ / input_df.set_index("Date")[inp_col].clip(lower=1)
353
+ ).reset_index()
354
+ conv.rename(columns={"index": "Week"}, inplace=True)
355
+ conv["year"] = conv.Week.dt.year
356
+ conv_rates[inp_col] = list(conv.drop("Week", axis=1).mean().to_dict().values())[
357
+ 0
358
+ ]
359
  ##print('Before',conv_rates[inp_col])
360
  # conv_rates[inp_col] = uopx_conv_rates[inp_col]
361
  ##print('After',(conv_rates[inp_col]))
362
+
363
+ channel = Channel(
364
+ name=inp_col,
365
+ dates=dates,
366
+ spends=spends,
367
+ # conversion_rate = np.mean(list(conv_rates[inp_col].values())),
368
+ conversion_rate=conv_rates[inp_col],
369
+ response_curve_type="s-curve",
370
+ response_curve_params={
371
+ "K": params[0],
372
+ "b": params[1],
373
+ "a": params[2],
374
+ "x0": params[3],
375
+ },
376
+ bounds=np.array([-10, 10]),
377
+ )
378
  channels[inp_col] = channel
379
  if sales is None:
380
  sales = channel.actual_sales
381
  else:
382
  sales += channel.actual_sales
383
+ other_contributions = (
384
+ output_df.drop([*output_cols], axis=1).sum(axis=1, numeric_only=True).values
385
+ )
386
+ correction = output_df.drop("Date", axis=1).sum(axis=1).values - (
387
+ sales + other_contributions
388
+ )
389
+ scenario = Scenario(
390
+ name="default",
391
+ channels=channels,
392
+ constant=other_contributions,
393
+ correction=correction,
394
+ )
395
  ## setting session variables
396
+ st.session_state["initialized"] = True
397
+ st.session_state["actual_df"] = input_df
398
+ st.session_state["raw_df"] = raw_df
399
+ st.session_state["contri_df"] = output_df
400
  default_scenario_dict = class_to_dict(scenario)
401
+ st.session_state["default_scenario_dict"] = default_scenario_dict
402
+ st.session_state["scenario"] = scenario
403
+ st.session_state["channels_list"] = channel_list
404
+ st.session_state["optimization_channels"] = {
405
+ channel_name: False for channel_name in channel_list
406
+ }
407
+ st.session_state["rcs"] = response_curves
408
+
409
+ st.session_state["powers"] = powers
410
+ st.session_state["actual_contribution_df"] = pd.DataFrame(actual_output_dic)
411
+ st.session_state["actual_input_df"] = pd.DataFrame(actual_input_dic)
412
+
413
  for channel in channels.values():
414
+ st.session_state[channel.name] = numerize(
415
+ channel.actual_total_spends * channel.conversion_rate, 1
416
+ )
417
+
418
+ st.session_state["xlsx_buffer"] = io.BytesIO()
419
+
420
+ if Path("../saved_scenarios.pkl").exists():
421
+ with open("../saved_scenarios.pkl", "rb") as f:
422
+ st.session_state["saved_scenarios"] = pickle.load(f)
423
  else:
424
+ st.session_state["saved_scenarios"] = OrderedDict()
425
+
426
+ # st.session_state["total_spends_change"] = 0
427
+ st.session_state["optimization_channels"] = {
428
+ channel_name: False for channel_name in channel_list
429
+ }
430
+ st.session_state["disable_download_button"] = True
431
+
432
+
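A hedged, self-contained sketch of the per-channel fitting performed inside initialize_data above: spends are scaled down by a power of ten, the s-curve is fitted with scipy's curve_fit under the same bounds and initial guess, and the fit is scored with MAPE, RMSE and R². The data here is synthetic, so the fitted numbers are illustrative only.

import numpy as np
from scipy.optimize import curve_fit
from sklearn.metrics import r2_score

def s_curve(x, K, b, a, x0):
    return K / (1 + b * np.exp(-a * (x - x0)))

rng = np.random.default_rng(0)
x = np.linspace(1e3, 5e5, 60)                        # synthetic weekly spends
y = s_curve(x / 1e2, 9000, 5, 0.002, 2500) + rng.normal(0, 150, x.size)

power = np.ceil(np.log(x.max()) / np.log(10)) - 3    # scale so max spend has ~3 digits
if power >= 0:
    x = x / 10**power

bounds = ((0, 0, 0, 0), (3 * y.max(), 1000, 1, x.max()))
params, _ = curve_fit(s_curve, x, y,
                      p0=(2 * y.max(), 0.01, 1e-5, x.max()),
                      bounds=bounds, maxfev=int(1e5))

fit = s_curve(x, *params)
mape = (100 * abs(1 - fit / y.clip(min=1))).mean()
rmse = np.sqrt(((y - fit) ** 2).mean())
print({"K": params[0], "b": params[1], "a": params[2], "x0": params[3]})
print(round(mape, 2), round(rmse, 2), round(r2_score(y, fit), 3))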
433
  # def initialize_data():
434
  # # fetch data from excel
435
  # output = pd.read_excel('data.xlsx',sheet_name=None)
 
445
  # channel_list.append(col)
446
  # else:
447
  # pass
448
+
449
  # ## NOTE : Considered only Desktop spends for all calculations
450
  # acutal_df = raw_df[raw_df.Region == 'Desktop'].copy()
451
  # ## NOTE : Considered one year of data
452
  # acutal_df = acutal_df[acutal_df.Date>'2020-12-31']
453
  # actual_df = acutal_df.drop('Region',axis=1).sort_values(by='Date')[[*channel_list,'Date']]
454
+
455
  # ##load response curves
456
  # with open('./grammarly_response_curves.json','r') as f:
457
  # response_curves = json.load(f)
458
+
459
  # ## create channel dict for scenario creation
460
  # dates = actual_df.Date.values
461
  # channels = {}
 
473
  # response_curve_type=response_curve_type,
474
  # response_curve_params=response_curve_params,
475
  # bounds=np.array([-30,30]))
476
+
477
  # channels[name] = channel
478
  # else:
479
  # constant = info_dict.get('value',0.) * len(dates)
480
+
481
  # ## create scenario
482
  # scenario = Scenario(name='default', channels=channels, constant=constant)
483
  # default_scenario_dict = class_to_dict(scenario)
484
+
485
 
486
  # ## setting session variables
487
  # st.session_state['initialized'] = True
 
495
  # for channel in channels.values():
496
  # if channel.name not in st.session_state:
497
  # st.session_state[channel.name] = float(channel.actual_total_spends)
498
+
499
  # if 'xlsx_buffer' not in st.session_state:
500
  # st.session_state['xlsx_buffer'] = io.BytesIO()
501
 
 
504
  # if Path('../saved_scenarios.pkl').exists():
505
  # with open('../saved_scenarios.pkl','rb') as f:
506
  # st.session_state['saved_scenarios'] = pickle.load(f)
507
+
508
  # else:
509
  # st.session_state['saved_scenarios'] = OrderedDict()
510
 
511
  # if 'total_spends_change' not in st.session_state:
512
  # st.session_state['total_spends_change'] = 0
513
+
514
  # if 'optimization_channels' not in st.session_state:
515
  # st.session_state['optimization_channels'] = {channel_name : False for channel_name in channel_list}
516
+
517
  # if 'disable_download_button' not in st.session_state:
518
  # st.session_state['disable_download_button'] = True
519
+
520
+
521
  def create_channel_summary(scenario):
522
 
523
  # Provided data
524
  data = {
525
+ "Channel": [
526
+ "Paid Search",
527
+ "Ga will cid baixo risco",
528
+ "Digital tactic others",
529
+ "Fb la tier 1",
530
+ "Fb la tier 2",
531
+ "Paid social others",
532
+ "Programmatic",
533
+ "Kwai",
534
+ "Indicacao",
535
+ "Infleux",
536
+ "Influencer",
537
+ ],
538
+ "Spends": [
539
+ "$ 11.3K",
540
+ "$ 155.2K",
541
+ "$ 50.7K",
542
+ "$ 125.4K",
543
+ "$ 125.2K",
544
+ "$ 105K",
545
+ "$ 3.3M",
546
+ "$ 47.5K",
547
+ "$ 55.9K",
548
+ "$ 632.3K",
549
+ "$ 48.3K",
550
+ ],
551
+ "Revenue": [
552
+ "558.0K",
553
+ "3.5M",
554
+ "5.2M",
555
+ "3.1M",
556
+ "3.1M",
557
+ "2.1M",
558
+ "20.8M",
559
+ "1.6M",
560
+ "728.4K",
561
+ "22.9M",
562
+ "4.8M",
563
+ ],
564
  }
565
 
566
  # Create DataFrame
567
  df = pd.DataFrame(data)
568
 
569
  # Convert currency strings to numeric values
570
+ df["Spends"] = (
571
+ df["Spends"]
572
+ .replace({"\$": "", "K": "*1e3", "M": "*1e6"}, regex=True)
573
+ .map(pd.eval)
574
+ .astype(int)
575
+ )
576
+ df["Revenue"] = (
577
+ df["Revenue"]
578
+ .replace({"\$": "", "K": "*1e3", "M": "*1e6"}, regex=True)
579
+ .map(pd.eval)
580
+ .astype(int)
581
+ )
582
 
583
  # Calculate ROI
584
+ df["ROI"] = (df["Revenue"] - df["Spends"]) / df["Spends"]
585
 
586
  # Format columns
587
  format_currency = lambda x: f"${x:,.1f}"
588
  format_roi = lambda x: f"{x:.1f}"
589
 
590
+ df["Spends"] = [
591
+ "$ 11.3K",
592
+ "$ 155.2K",
593
+ "$ 50.7K",
594
+ "$ 125.4K",
595
+ "$ 125.2K",
596
+ "$ 105K",
597
+ "$ 3.3M",
598
+ "$ 47.5K",
599
+ "$ 55.9K",
600
+ "$ 632.3K",
601
+ "$ 48.3K",
602
+ ]
603
+ df["Revenue"] = [
604
+ "$ 536.3K",
605
+ "$ 3.4M",
606
+ "$ 5M",
607
+ "$ 3M",
608
+ "$ 3M",
609
+ "$ 2M",
610
+ "$ 20M",
611
+ "$ 1.5M",
612
+ "$ 7.1M",
613
+ "$ 22M",
614
+ "$ 4.6M",
615
+ ]
616
+ df["ROI"] = df["ROI"].apply(format_roi)
617
+
618
  return df
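The currency parsing used in create_channel_summary works by stripping the currency symbol, rewriting the K/M suffixes as multipliers, and letting pandas evaluate the resulting expression. A minimal sketch with the same style of sample strings:

import pandas as pd

spends = pd.Series(["$ 11.3K", "$ 3.3M", "$ 632.3K"])
numeric = (
    spends.replace({r"\$": "", "K": "*1e3", "M": "*1e6"}, regex=True)
    .map(pd.eval)
    .astype(int)
)
print(numeric.tolist())  # [11300, 3300000, 632300]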
619
 
620
 
621
+ # @st.cache(allow_output_mutation=True)
622
  # def create_contribution_pie(scenario):
623
  # #c1f7dc
624
  # colors_map = {col:color for col,color in zip(st.session_state['channels_list'],plotly.colors.n_colors(plotly.colors.hex_to_rgb('#BE6468'), plotly.colors.hex_to_rgb('#E7B8B7'),23))}
 
650
  # weekly_spends_data = []
651
  # weekly_sales_data = []
652
  # for channel_name in st.session_state['channels_list']:
653
+ # weekly_spends_data.append((go.Bar(x=x,
654
  # y=scenario.channels[channel_name].actual_spends * scenario.channels[channel_name].conversion_rate,
655
+ # name=channel_name_formating(channel_name),
656
  # hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}",
657
  # legendgroup=channel_name)))
658
+ # weekly_sales_data.append((go.Bar(x=x,
659
  # y=scenario.channels[channel_name].actual_sales,
660
+ # name=channel_name_formating(channel_name),
661
  # hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
662
  # legendgroup=channel_name, showlegend=False)))
663
  # for _d in weekly_spends_data:
664
  # weekly_contribution_fig.add_trace(_d, row=1, col=1)
665
  # for _d in weekly_sales_data:
666
  # weekly_contribution_fig.add_trace(_d, row=1, col=2)
667
+ # weekly_contribution_fig.add_trace(go.Bar(x=x,
668
  # y=scenario.constant + scenario.correction,
669
+ # name='Non Media',
670
  # hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}"), row=1, col=2)
671
  # weekly_contribution_fig.update_layout(barmode='stack', title='Channel contribuion by week', xaxis_title='Date')
672
  # weekly_contribution_fig.update_xaxes(showgrid=False)
 
704
 
705
 
706
  def create_contribution_pie():
707
+ color_palette = [
708
+ "#F3F3F0",
709
+ "#5E7D7E",
710
+ "#2FA1FF",
711
+ "#00EDED",
712
+ "#00EAE4",
713
+ "#304550",
714
+ "#EDEBEB",
715
+ "#7FBEFD",
716
+ "#003059",
717
+ "#A2F3F3",
718
+ "#E1D6E2",
719
+ "#B6B6B6",
720
+ ]
721
+ total_contribution_fig = make_subplots(
722
+ rows=1,
723
+ cols=2,
724
+ subplot_titles=["Spends", "Revenue"],
725
+ specs=[[{"type": "pie"}, {"type": "pie"}]],
726
+ )
727
 
728
+ channels_list = [
729
+ "Paid Search",
730
+ "Ga will cid baixo risco",
731
+ "Digital tactic others",
732
+ "Fb la tier 1",
733
+ "Fb la tier 2",
734
+ "Paid social others",
735
+ "Programmatic",
736
+ "Kwai",
737
+ "Indicacao",
738
+ "Infleux",
739
+ "Influencer",
740
+ "Non Media",
741
+ ]
742
 
743
  # Assign colors from the limited palette to channels
744
+ colors_map = {
745
+ col: color_palette[i % len(color_palette)]
746
+ for i, col in enumerate(channels_list)
747
+ }
748
+ colors_map["Non Media"] = color_palette[
749
+ 5
750
+ ] # Assign fixed green color for 'Non Media'
751
 
752
  # Hardcoded values for Spends and Revenue
753
  spends_values = [0.5, 3.36, 1.1, 2.7, 2.7, 2.27, 70.6, 1, 1, 13.7, 1, 0]
 
758
  go.Pie(
759
  labels=[channel_name for channel_name in channels_list],
760
  values=spends_values,
761
+ marker=dict(
762
+ colors=[colors_map[channel_name] for channel_name in channels_list]
763
+ ),
764
+ hole=0.3,
765
  ),
766
+ row=1,
767
+ col=1,
768
  )
769
 
770
  # Add trace for Revenue pie chart
 
772
  go.Pie(
773
  labels=[channel_name for channel_name in channels_list],
774
  values=revenue_values,
775
+ marker=dict(
776
+ colors=[colors_map[channel_name] for channel_name in channels_list]
777
+ ),
778
+ hole=0.3,
779
  ),
780
+ row=1,
781
+ col=2,
782
+ )
783
+
784
+ total_contribution_fig.update_traces(
785
+ textposition="inside", texttemplate="%{percent:.1%}"
786
+ )
787
+ total_contribution_fig.update_layout(
788
+ uniformtext_minsize=12, title="Channel contribution", uniformtext_mode="hide"
789
  )
 
 
 
790
  return total_contribution_fig
791
 
792
+
793
  def create_contribuion_stacked_plot(scenario):
794
+ weekly_contribution_fig = make_subplots(
795
+ rows=1,
796
+ cols=2,
797
+ subplot_titles=["Spends", "Revenue"],
798
+ specs=[[{"type": "bar"}, {"type": "bar"}]],
799
+ )
800
+ raw_df = st.session_state["raw_df"]
801
+ df = raw_df.sort_values(by="Date")
802
  x = df.Date
803
  weekly_spends_data = []
804
  weekly_sales_data = []
805
+
806
+ for i, channel_name in enumerate(st.session_state["channels_list"]):
807
  color = color_palette[i % len(color_palette)]
808
+
809
+ weekly_spends_data.append(
810
+ go.Bar(
811
+ x=x,
812
+ y=scenario.channels[channel_name].actual_spends
813
+ * scenario.channels[channel_name].conversion_rate,
814
+ name=channel_name_formating(channel_name),
815
+ hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}",
816
+ legendgroup=channel_name,
817
+ marker_color=color,
818
+ )
819
+ )
820
+
821
+ weekly_sales_data.append(
822
+ go.Bar(
823
+ x=x,
824
+ y=scenario.channels[channel_name].actual_sales,
825
+ name=channel_name_formating(channel_name),
826
+ hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
827
+ legendgroup=channel_name,
828
+ showlegend=False,
829
+ marker_color=color,
830
+ )
831
+ )
832
+
833
  for _d in weekly_spends_data:
834
  weekly_contribution_fig.add_trace(_d, row=1, col=1)
835
  for _d in weekly_sales_data:
836
  weekly_contribution_fig.add_trace(_d, row=1, col=2)
837
+
838
+ weekly_contribution_fig.add_trace(
839
+ go.Bar(
840
+ x=x,
841
+ y=scenario.constant + scenario.correction,
842
+ name="Non Media",
843
+ hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
844
+ marker_color=color_palette[-1],
845
+ ),
846
+ row=1,
847
+ col=2,
848
+ )
849
+
850
+ weekly_contribution_fig.update_layout(
851
+ barmode="stack", title="Channel contribution by week", xaxis_title="Date"
852
+ )
853
  weekly_contribution_fig.update_xaxes(showgrid=False)
854
  weekly_contribution_fig.update_yaxes(showgrid=False)
855
  return weekly_contribution_fig
856
 
857
+
858
  def create_channel_spends_sales_plot(channel):
859
  if channel is not None:
860
  x = channel.dates
861
  _spends = channel.actual_spends * channel.conversion_rate
862
  _sales = channel.actual_sales
863
  channel_sales_spends_fig = make_subplots(specs=[[{"secondary_y": True}]])
864
+ channel_sales_spends_fig.add_trace(
865
+ go.Bar(
866
+ x=x,
867
+ y=_sales,
868
+ marker_color=color_palette[
869
+ 3
870
+ ], # You can choose a color from the palette
871
+ name="Revenue",
872
+ hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
873
+ ),
874
+ secondary_y=False,
875
+ )
876
+
877
+ channel_sales_spends_fig.add_trace(
878
+ go.Scatter(
879
+ x=x,
880
+ y=_spends,
881
+ line=dict(
882
+ color=color_palette[2]
883
+ ), # You can choose another color from the palette
884
+ name="Spends",
885
+ hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}",
886
+ ),
887
+ secondary_y=True,
888
+ )
889
+
890
+ channel_sales_spends_fig.update_layout(
891
+ xaxis_title="Date",
892
+ yaxis_title="Revenue",
893
+ yaxis2_title="Spends ($)",
894
+ title="Channel spends and Revenue week-wise",
895
+ )
896
  channel_sales_spends_fig.update_xaxes(showgrid=False)
897
  channel_sales_spends_fig.update_yaxes(showgrid=False)
898
  else:
899
+ raw_df = st.session_state["raw_df"]
900
+ df = raw_df.sort_values(by="Date")
901
  x = df.Date
902
+ scenario = class_from_dict(st.session_state["default_scenario_dict"])
903
  _sales = scenario.constant + scenario.correction
904
  channel_sales_spends_fig = make_subplots(specs=[[{"secondary_y": True}]])
905
+ channel_sales_spends_fig.add_trace(
906
+ go.Bar(
907
+ x=x,
908
+ y=_sales,
909
+ marker_color=color_palette[
910
+ 0
911
+ ], # You can choose a color from the palette
912
+ name="Revenue",
913
+ hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
914
+ ),
915
+ secondary_y=False,
916
+ )
917
+
918
+ channel_sales_spends_fig.update_layout(
919
+ xaxis_title="Date",
920
+ yaxis_title="Revenue",
921
+ yaxis2_title="Spends ($)",
922
+ title="Channel spends and Revenue week-wise",
923
+ )
924
  channel_sales_spends_fig.update_xaxes(showgrid=False)
925
  channel_sales_spends_fig.update_yaxes(showgrid=False)
926
+
927
  return channel_sales_spends_fig
928
 
929
+
930
+ def format_numbers(value, n_decimals=1, include_indicator=True):
931
  if include_indicator:
932
+ return f"{CURRENCY_INDICATOR} {numerize(value,n_decimals)}"
933
  else:
934
+ return f"{numerize(value,n_decimals)}"
935
 
936
 
937
+ def decimal_formater(num_string, n_decimals=1):
938
+ parts = num_string.split(".")
939
  if len(parts) == 1:
940
+ return num_string + "." + "0" * n_decimals
941
  else:
942
  to_be_padded = n_decimals - len(parts[-1])
943
+ if to_be_padded > 0:
944
+ return num_string + "0" * to_be_padded
945
  else:
946
  return num_string
947
+
948
+
949
  def channel_name_formating(channel_name):
950
+ name_mod = channel_name.replace("_", " ")
951
+ if name_mod.lower().endswith(" imp"):
952
+ name_mod = name_mod.replace("Imp", "Spend")
953
+ elif name_mod.lower().endswith(" clicks"):
954
+ name_mod = name_mod.replace("Clicks", "Spend")
955
  return name_mod
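Example calls for the formatting helpers above, assuming they are in scope. The exact numerize output depends on the installed numerize package, so the currency strings are approximate.

print(format_numbers(1_234_567))                            # roughly "$ 1.2M"
print(format_numbers(1_234_567, include_indicator=False))   # roughly "1.2M"
print(decimal_formater("3", n_decimals=2))                  # "3.00"
print(decimal_formater("3.1", n_decimals=2))                # "3.10"
# the Clicks -> Spend replace inside channel_name_formating is case-sensitive,
# so a lower-case "clicks" suffix passes through unchanged
print(channel_name_formating("paid_search_clicks"))         # "paid search clicks"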
956
 
957
 
958
+ def send_email(email, message):
959
+ s = smtplib.SMTP("smtp.gmail.com", 587)
960
  s.starttls()
961
  s.login("geethu4444@gmail.com", "jgydhpfusuremcol")
962
  s.sendmail("geethu4444@gmail.com", email, message)
963
  s.quit()
964
 
965
+
966
  if __name__ == "__main__":
967
  initialize_data()
utilities_with_panel.py ADDED
@@ -0,0 +1,1018 @@
1
+ from numerize.numerize import numerize
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import json
5
+ from classes import Channel, Scenario
6
+ import numpy as np
7
+ from plotly.subplots import make_subplots
8
+ import plotly.graph_objects as go
9
+ from classes import class_to_dict
10
+ from collections import OrderedDict
11
+ import io
12
+ import plotly
13
+ from pathlib import Path
14
+ import pickle
15
+ import streamlit_authenticator as stauth
16
+ import yaml
17
+ from yaml import SafeLoader
18
+ from streamlit.components.v1 import html
19
+ import smtplib
20
+ from scipy.optimize import curve_fit
21
+ from sklearn.metrics import r2_score
22
+ from classes import class_from_dict
23
+ import os
24
+ import base64
25
+
26
+
27
+
28
+
29
+ color_palette = ['#F3F3F0', '#5E7D7E', '#2FA1FF', '#00EDED', '#00EAE4', '#304550', '#EDEBEB', '#7FBEFD', '#003059', '#A2F3F3', '#E1D6E2', '#B6B6B6']
30
+
31
+
32
+ CURRENCY_INDICATOR = '$'
33
+
34
+ def load_authenticator():
35
+ with open('config.yaml') as file:
36
+ config = yaml.load(file, Loader=SafeLoader)
37
+ st.session_state['config'] = config
38
+ authenticator = stauth.Authenticate(
39
+ config['credentials'],
40
+ config['cookie']['name'],
41
+ config['cookie']['key'],
42
+ config['cookie']['expiry_days'],
43
+ config['preauthorized']
44
+ )
45
+ st.session_state['authenticator'] = authenticator
46
+ return authenticator
47
+
48
+ def nav_page(page_name, timeout_secs=3):
49
+ nav_script = """
50
+ <script type="text/javascript">
51
+ function attempt_nav_page(page_name, start_time, timeout_secs) {
52
+ var links = window.parent.document.getElementsByTagName("a");
53
+ for (var i = 0; i < links.length; i++) {
54
+ if (links[i].href.toLowerCase().endsWith("/" + page_name.toLowerCase())) {
55
+ links[i].click();
56
+ return;
57
+ }
58
+ }
59
+ var elasped = new Date() - start_time;
60
+ if (elasped < timeout_secs * 1000) {
61
+ setTimeout(attempt_nav_page, 100, page_name, start_time, timeout_secs);
62
+ } else {
63
+ alert("Unable to navigate to page '" + page_name + "' after " + timeout_secs + " second(s).");
64
+ }
65
+ }
66
+ window.addEventListener("load", function() {
67
+ attempt_nav_page("%s", new Date(), %d);
68
+ });
69
+ </script>
70
+ """ % (page_name, timeout_secs)
71
+ html(nav_script)
72
+
73
+
74
+ # def load_local_css(file_name):
75
+ # with open(file_name) as f:
76
+ # st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)
77
+
78
+
79
+ # def set_header():
80
+ # return st.markdown(f"""<div class='main-header'>
81
+ # <h1>MMM LiME</h1>
82
+ # <img src="https://assets-global.website-files.com/64c8fffb0e95cbc525815b79/64df84637f83a891c1473c51_Vector%20(Stroke).svg ">
83
+ # </div>""", unsafe_allow_html=True)
84
+
85
+ path = os.path.dirname(__file__)
86
+
87
+ file_ = open(f"{path}/mastercard_logo.png", "rb")
88
+
89
+ contents = file_.read()
90
+
91
+ data_url = base64.b64encode(contents).decode("utf-8")
92
+
93
+ file_.close()
94
+
95
+
96
+
97
+ DATA_PATH = './data'
98
+
99
+ IMAGES_PATH = './data/images_224_224'
100
+
101
+ # New - Sprint 2
102
+ if 'bin_dict' not in st.session_state:
103
+
104
+ with open("data_import.pkl", "rb") as f:
105
+ data = pickle.load(f)
106
+
107
+ st.session_state['bin_dict'] = data["bin_dict"]
108
+
109
+ panel_col = [col.lower().replace('.','_').replace('@','_').replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_") for col in st.session_state['bin_dict']['Panel Level 1'] ] [0]# set the panel column
110
+
111
+ is_panel = True if len(panel_col)>0 else False
112
+
113
+ date_col='Date'
114
+ #is_panel = False # flag if set to true - do panel level response curves
115
+
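The panel column name pulled from bin_dict is normalised before use; a short sketch of the same cleaning chain applied to hypothetical raw names:

raw_names = ["Panel.Level-1", "DMA @Region", "market__name"]  # hypothetical inputs
cleaned = [
    col.lower().replace(".", "_").replace("@", "_").replace(" ", "_")
       .replace("-", "").replace(":", "").replace("__", "_")
    for col in raw_names
]
print(cleaned)  # ['panel_level1', 'dma_region', 'market_name']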
116
+ def load_local_css(file_name):
117
+
118
+ with open(file_name) as f:
119
+
120
+ st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)
121
+
122
+
123
+
124
+
125
+
126
+ # def set_header():
127
+
128
+ # return st.markdown(f"""<div class='main-header'>
129
+
130
+ # <h1>H & M Recommendations</h1>
131
+
132
+ # <img src="data:image;base64,{data_url}", alt="Logo">
133
+
134
+ # </div>""", unsafe_allow_html=True)
135
+ path1 = os.path.dirname(__file__)
136
+
137
+ file_1 = open(f"{path}/willbank.png", "rb")
138
+
139
+ contents1 = file_1.read()
140
+
141
+ data_url1 = base64.b64encode(contents1).decode("utf-8")
142
+
143
+ file_1.close()
144
+
145
+
146
+
147
+ DATA_PATH1 = './data'
148
+
149
+ IMAGES_PATH1 = './data/images_224_224'
150
+
151
+
152
+
153
+
154
+
155
+ def set_header():
156
+ return st.markdown(f"""<div class='main-header'>
157
+ <!-- <h1></h1> -->
158
+ <div >
159
+ <img class='blend-logo' src="data:image;base64,{data_url1}", alt="Logo">
160
+ </div>
161
+ <img class='blend-logo' src="data:image;base64,{data_url}", alt="Logo">
162
+ </div>""", unsafe_allow_html=True)
163
+
164
+ # def set_header():
165
+ # logo_path = "./path/to/your/local/LIME_logo.png" # Replace with the actual file path
166
+ # text = "LiME"
167
+ # return st.markdown(f"""<div class='main-header'>
168
+ # <img src="data:image/png;base64,{data_url}" alt="Logo" style="float: left; margin-right: 10px; width: 100px; height: auto;">
169
+ # <h1>{text}</h1>
170
+ # </div>""", unsafe_allow_html=True)
171
+
172
+
173
+ def s_curve(x,K,b,a,x0):
174
+ return K / (1 + b * np.exp(-a*(x-x0)))
175
+
176
+
177
+ def overview_test_data_prep_panel(X, df, spends_X, date_col, panel_col, target_col):
178
+ '''
179
+ builds the raw, contribution and spend data (as an Excel workbook) consumed by initialize_data()
180
+ X : X test with contributions
181
+ df : originally uploaded data (media data) which has raw vars
182
+ spends_X : spends of dates in X test
183
+ '''
184
+
185
+ # define channels
186
+ channels = {'paid_search': ['paid_search_impressions', 'paid_search_clicks'],
187
+
188
+ 'fb_level_achieved_tier_1': ['fb_level_achieved_tier_1_impressions'], #, 'fb:_level_achieved_-_tier_1_clicks'],
189
+
190
+ 'fb_level_achieved_tier_2': ['fb:_level_achieved_tier_2_impressions',
191
+ 'fb_level_achieved_tier_2_clicks'],
192
+
193
+ 'paid_social_others' : ['paid_social_others_impressions', 'paid_social_others_clicks'],
194
+
195
+ 'ga_app': ['ga_app_impressions', 'ga_app_clicks'],
196
+
197
+ 'digital_tactic_others': ['digital_tactic_others_impressions', 'digital_tactic_others_clicks'],
198
+
199
+ 'kwai': ['kwai_impressions', 'kwai_clicks'],
200
+
201
+ 'programmatic': ['programmatic_impressions', 'programmatic_clicks'],
202
+
203
+ # 'affiliates':['affiliates_clicks'],
204
+ #
205
+ # "indicacao":['indicacao_clicks'],
206
+ #
207
+ # "infleux":['infleux_clicks'],
208
+ #
209
+ # "influencer":['influencer_clicks']
210
+ }
211
+
212
+ channel_list = list(channels.keys())
213
+
214
+ # map transformed variable to raw variable name & channel name
215
+ # mapping eg : paid_search_clicks_lag_2 (transformed var) --> paid_search_clicks (raw var) --> paid_search (channel)
216
+ variables = {}
217
+ channel_and_variables = {}
218
+ new_variables = {}
219
+ new_channels_and_variables = {}
220
+
221
+ for transformed_var in [col for col in
222
+ X.drop(columns=[date_col, panel_col, target_col, 'pred', 'panel_effect']).columns if
223
+ "_contr" not in col]:
224
+ if len([col for col in df.columns if col in transformed_var]) == 1:
225
+ raw_var = [col for col in df.columns if col in transformed_var][0]
226
+ variables[transformed_var] = raw_var
227
+ channel_and_variables[raw_var] = [channel for channel, raw_vars in channels.items() if raw_var in raw_vars][
228
+ 0]
229
+ else:
230
+ new_variables[transformed_var] = transformed_var
231
+ new_channels_and_variables[transformed_var] = 'base'
232
+
233
+ # Raw DF
234
+ raw_X = pd.merge(X[[date_col, panel_col]], df[[date_col, panel_col] + list(variables.values())], how='left',
235
+ on=[date_col, panel_col])
236
+ assert len(raw_X) == len(X)
237
+
238
+ raw_X_cols = []
239
+ for i in raw_X.columns:
240
+ if i in channel_and_variables.keys():
241
+ raw_X_cols.append(channel_and_variables[i])
242
+ else:
243
+ raw_X_cols.append(i)
244
+ raw_X.columns = raw_X_cols
245
+
246
+ # Contribution DF
247
+ contr_X = X[[date_col, panel_col, 'panel_effect'] + [col for col in X.columns if
248
+ "_contr" in col and "sum_" not in col]].copy()
249
+ new_variables = [col for col in contr_X.columns if
250
+ "_flag" in col.lower() or "trend" in col.lower() or "sine" in col.lower()]
251
+ if len(new_variables) > 0:
252
+ contr_X['const'] = contr_X[['panel_effect'] + new_variables].sum(axis=1)
253
+ contr_X.drop(columns=['panel_effect'], inplace=True)
254
+ contr_X.drop(columns=new_variables, inplace=True)
255
+ else:
256
+ contr_X.rename(columns={'panel_effect': 'const'}, inplace=True)
257
+
258
+ new_contr_X_cols = []
259
+ for col in contr_X.columns:
260
+ col_clean = col.replace("_contr", "")
261
+ new_contr_X_cols.append(col_clean)
262
+ contr_X.columns = new_contr_X_cols
263
+
264
+ contr_X_cols = []
265
+ for i in contr_X.columns:
266
+ if i in variables.keys():
267
+ contr_X_cols.append(channel_and_variables[variables[i]])
268
+ else:
269
+ contr_X_cols.append(i)
270
+ contr_X.columns = contr_X_cols
271
+
272
+ # Spends DF
273
+ spends_X.columns = [col.replace("_cost", "") for col in spends_X.columns]
274
+
275
+ raw_X.rename(columns={"date": "Date"}, inplace=True)
276
+ contr_X.rename(columns={"date": "Date"}, inplace=True)
277
+ spends_X.rename(columns={'date': 'Week'}, inplace=True)
278
+
279
+ # Create excel
280
+ file_name = "data_test_overview_panel_#" + target_col + ".xlsx"
281
+ with pd.ExcelWriter(file_name) as writer:
282
+ raw_X.to_excel(writer, sheet_name="RAW DATA MMM", index=False)
283
+ contr_X.to_excel(writer, sheet_name="CONTRIBUTION MMM", index=False)
284
+ spends_X.to_excel(writer, sheet_name="SPEND INPUT", index=False)
285
+
286
+
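A minimal sketch of the variable-mapping step used by both overview_test_data_prep_* functions above: a transformed model column is matched back to its raw column by substring, the raw column to its channel, and anything without a raw match is treated as base. The names here are hypothetical.

channels = {
    "paid_search": ["paid_search_impressions", "paid_search_clicks"],
    "kwai": ["kwai_impressions", "kwai_clicks"],
}
raw_columns = [c for cols in channels.values() for c in cols]

transformed_var = "paid_search_clicks_lag_2"
matches = [col for col in raw_columns if col in transformed_var]
if len(matches) == 1:
    raw_var = matches[0]
    channel = [ch for ch, cols in channels.items() if raw_var in cols][0]
    print(raw_var, "->", channel)       # paid_search_clicks -> paid_search
else:
    print(transformed_var, "-> base")   # no raw match: treated as a base variable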
287
+ def overview_test_data_prep_nonpanel(X, df, spends_X, date_col, target_col):
288
+ '''
289
+ builds the raw, contribution and spend data (as an Excel workbook) consumed by initialize_data()
290
+ X : X test with contributions
291
+ df : originally uploaded data (media data) which has raw vars
292
+ spends_X : spends of dates in X test
293
+ '''
294
+ # define channels
295
+ channels = {'paid_search': ['paid_search_impressions', 'paid_search_clicks'],
296
+
297
+ 'fb_level_achieved_tier_1': ['fb_level_achieved_tier_1_impressions', 'fb_level_achieved_tier_1_clicks'],
298
+
299
+ 'fb_level_achieved_tier_2': ['fb_level_achieved_tier_2_impressions',
300
+ 'fb_level_achieved_tier_2_clicks'],
301
+
302
+ 'paid_social_others' : ['paid_social_others_impressions', 'paid_social_others_clicks'],
303
+
304
+ 'ga_app_will_and_cid_pequena_baixo_risco': ['ga_app_will_and_cid_pequena_baixo_risco_impressions', 'ga_app_will_and_cid_pequena_baixo_risco_clicks'],
305
+
306
+ 'digital_tactic_others': ['digital_tactic_others_impressions', 'digital_tactic_others_clicks'],
307
+
308
+ 'kwai': ['kwai_impressions', 'kwai_clicks'],
309
+
310
+ 'programmatic': ['programmatic_impressions', 'programmatic_clicks'],
311
+
312
+ 'affiliates':['affiliates_clicks', 'affiliates_impressions'],
313
+
314
+ "indicacao":['indicacao_clicks', 'indicacao_impressions'],
315
+
316
+ "infleux":['infleux_clicks', 'infleux_impressions'],
317
+
318
+ "influencer":['influencer_clicks', 'influencer_impressions']
319
+ }
320
+
321
+ channel_list = list(channels.keys())
322
+
323
+ # map transformed variable to raw variable name & channel name
324
+ # mapping eg : paid_search_clicks_lag_2 (transformed var) --> paid_search_clicks (raw var) --> paid_search (channel)
325
+ variables = {}
326
+ channel_and_variables = {}
327
+ new_variables = {}
328
+ new_channels_and_variables = {}
329
+
330
+ cols_to_del = list(set([date_col, target_col, 'pred']).intersection((set(X.columns))))
331
+ for transformed_var in [col for col in
332
+ X.drop(columns=cols_to_del).columns if
333
+ "_contr" not in col]: # also has 'const'
334
+ if len([col for col in df.columns if col in transformed_var]) == 1: # col is raw var
335
+ raw_var = [col for col in df.columns if col in transformed_var][0]
336
+ variables[transformed_var] = raw_var
337
+ channel_and_variables[raw_var] = [channel for channel, raw_vars in channels.items() if raw_var in raw_vars][0]
338
+ else: # when no corresponding raw var then base
339
+ new_variables[transformed_var] = transformed_var
340
+ new_channels_and_variables[transformed_var] = 'base'
341
+
342
+ # Raw DF
343
+ raw_X = pd.merge(X[[date_col]], df[[date_col] + list(variables.values())], how='left',
344
+ on=[date_col])
345
+ assert len(raw_X) == len(X)
346
+
347
+ raw_X_cols = []
348
+ for i in raw_X.columns:
349
+ if i in channel_and_variables.keys():
350
+ raw_X_cols.append(channel_and_variables[i])
351
+ else:
352
+ raw_X_cols.append(i)
353
+ raw_X.columns = raw_X_cols
354
+
355
+ # Contribution DF
356
+ contr_X = X[[date_col] + [col for col in X.columns if "_contr" in col and "sum_" not in col]].copy()
357
+ # st.write(contr_X.columns)
358
+ new_variables = [col for col in contr_X.columns if
359
+ "_flag" in col.lower() or "trend" in col.lower() or "sine" in col.lower()]
360
+ if len(new_variables) > 0: # if new vars are available, their contributions should be added to base (called const)
361
+ contr_X['const_contr'] = contr_X[['const_contr'] + new_variables].sum(axis=1)
362
+ contr_X.drop(columns=new_variables, inplace=True)
363
+
364
+
365
+ new_contr_X_cols = []
366
+ for col in contr_X.columns:
367
+ col_clean = col.replace("_contr", "")
368
+ new_contr_X_cols.append(col_clean)
369
+ contr_X.columns = new_contr_X_cols
370
+
371
+ contr_X_cols = []
372
+ for i in contr_X.columns:
373
+ if i in variables.keys():
374
+ contr_X_cols.append(channel_and_variables[variables[i]])
375
+ else:
376
+ contr_X_cols.append(i)
377
+ contr_X.columns = contr_X_cols
378
+
379
+ # Spends DF
380
+ spends_X.columns = [col.replace("_cost", "").replace("_spends", '').replace("_spend", "") for col in spends_X.columns]
381
+
382
+ raw_X.rename(columns={"date": "Date"}, inplace=True)
383
+ contr_X.rename(columns={"date": "Date"}, inplace=True)
384
+ spends_X.rename(columns={'date': 'Week'}, inplace=True)
385
+
386
+ # Create excel
387
+ file_name = "data_test_overview_panel_#" + target_col + ".xlsx"
388
+ with pd.ExcelWriter(file_name) as writer:
389
+ raw_X.to_excel(writer, sheet_name="RAW DATA MMM", index=False)
390
+ contr_X.to_excel(writer, sheet_name="CONTRIBUTION MMM", index=False)
391
+ spends_X.to_excel(writer, sheet_name="SPEND INPUT", index=False)
392
+
393
+
394
+ def initialize_data(target_col):
395
+ # uopx_conv_rates = {'streaming_impressions' : 0.007,'digital_impressions' : 0.007,'search_clicks' : 0.00719,'tv_impressions' : 0.000173,
396
+ # "digital_clicks":0.005,"streaming_clicks":0.004,'streaming_spends':1,"tv_spends":1,"search_spends":1,
397
+ # "digital_spends":1}
398
+ #print('State initialized')
399
+ # excel = pd.read_excel("data_test_overview_panel.xlsx",sheet_name=None)
400
+ excel = pd.read_excel("data_test_overview_panel_#" + target_col + ".xlsx",sheet_name=None)
401
+
402
+ raw_df = excel['RAW DATA MMM']
403
+
404
+ spend_df = excel['SPEND INPUT']
405
+ contri_df = excel['CONTRIBUTION MMM']
406
+ #Revenue_df = excel['Revenue']
407
+
408
+ ## remove sesonalities, indices etc ...
409
+ exclude_columns = ['Date', 'Week',
410
+ 'Region',
411
+ 'Controls_Grammarly_Index_SeasonalAVG',
412
+ 'Controls_Quillbot_Index',
413
+ 'Daily_Positive_Outliers',
414
+ 'External_RemoteClass_Index',
415
+ 'Intervals ON 20190520-20190805 | 20200518-20200803 | 20210517-20210802',
416
+ 'Intervals ON 20190826-20191209 | 20200824-20201207 | 20210823-20211206',
417
+ 'Intervals ON 20201005-20201019',
418
+ 'Promotion_PercentOff',
419
+ 'Promotion_TimeBased',
420
+ 'Seasonality_Indicator_Chirstmas',
421
+ 'Seasonality_Indicator_NewYears_Days',
422
+ 'Seasonality_Indicator_Thanksgiving',
423
+ 'Trend 20200302 / 20200803',
424
+ date_col, panel_col
425
+ ]
426
+
427
+ # Aggregate all 3 dfs to date level (from date-panel level)
428
+ raw_df[date_col]=pd.to_datetime(raw_df[date_col])
429
+ raw_df_aggregations = {c:'sum' for c in raw_df.columns if c not in exclude_columns}
430
+ raw_df = raw_df.groupby(date_col).agg(raw_df_aggregations).reset_index()
431
+
432
+ contri_df[date_col]=pd.to_datetime(contri_df[date_col])
433
+ contri_df_aggregations = {c:'sum' for c in contri_df.columns if c not in exclude_columns}
434
+ contri_df = contri_df.groupby(date_col).agg(contri_df_aggregations).reset_index()
435
+
436
+ input_df = raw_df.sort_values(by=[date_col])
437
+
438
+ output_df = contri_df.sort_values(by=[date_col])
439
+
440
+ spend_df['Week'] = pd.to_datetime(spend_df['Week'], format='%Y-%m-%d', errors='coerce')
441
+ spend_df_aggregations = {c: 'sum' for c in spend_df.columns if c not in exclude_columns}
442
+ spend_df = spend_df.groupby('Week').agg(spend_df_aggregations).reset_index()
443
+ # spend_df['Week'] = pd.to_datetime(spend_df['Week'], errors='coerce')
444
+ # spend_df = spend_df.sort_values(by='Week')
445
+
446
+
447
+ channel_list = [col for col in input_df.columns if col not in exclude_columns]
448
+
449
+ response_curves = {}
450
+ mapes = {}
451
+ rmses = {}
452
+ upper_limits = {}
453
+ powers = {}
454
+ r2 = {}
455
+ conv_rates = {}
456
+ output_cols = []
457
+ channels = {}
458
+ sales = None
459
+ dates = input_df.Date.values
460
+ actual_output_dic = {}
461
+ actual_input_dic = {}
462
+
463
+ # ONLY FOR TESTING
464
+ # channel_list=['programmatic']
465
+ infeasible_channels = [c for c in contri_df.select_dtypes(include=['float', 'int']).columns if contri_df[c].sum()<=0]
466
+ # st.write(infeasible_channels)
467
+ channel_list=list(set(channel_list)-set(infeasible_channels))
468
+
469
+ for inp_col in channel_list:
470
+ st.write(inp_col)
471
+
472
+ # # New - Sprint 2
473
+ # if is_panel:
474
+ # input_df1 = input_df.groupby([date_col]).agg({inp_col:'sum'}).reset_index() # aggregate spends on date
475
+ # spends = input_df1[inp_col].values
476
+ # else :
477
+ # spends = input_df[inp_col].values
478
+ spends = spend_df[inp_col].values
479
+
480
+ x = spends.copy()
481
+ # upper limit for penalty
482
+ upper_limits[inp_col] = 2*x.max()
483
+
484
+
485
+
486
+ # contribution
487
+ # New - Sprint 2
488
+ out_col = [_col for _col in output_df.columns if _col.startswith(inp_col)][0]
489
+ if is_panel :
490
+ output_df1 = output_df.groupby([date_col]).agg({out_col:'sum'}).reset_index()
491
+ y = output_df1[out_col].values.copy()
492
+ else :
493
+ y = output_df[out_col].values.copy()
494
+
495
+ actual_output_dic[inp_col] = y.copy()
496
+ actual_input_dic[inp_col] = x.copy()
497
+ ##output cols aggregation
498
+ output_cols.append(out_col)
499
+
500
+ ## scale the input
501
+ power = (np.ceil(np.log(x.max()) / np.log(10) )- 3)
502
+ if power >= 0 :
503
+ x = x / 10**power
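+ # Worked example of the scaling above: if x.max() is 250,000 then
+ # log10(250,000) ~= 5.4, ceil gives 6, so power = 3 and x is divided by
+ # 1e3 (new max 250). If x.max() is 800, power = 0 and x is unchanged;
+ # negative powers are skipped by the `if power >= 0` check.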
504
+
505
+
506
+ x = x.astype('float64')
507
+ y = y.astype('float64')
508
+ #print('#printing yyyyyyyyy')
509
+ #print(inp_col)
510
+ #print(x.max())
511
+ #print(y.max())
512
+ # st.write(y.max(),x.max())
513
+ print(y.max(),x.max())
514
+ if y.max()<=0.01:
515
+ if x.max()<=0.01 :
516
+ st.write("here-here")
517
+ bounds = ((0, 0, 0, 0), (3 * 0.01, 1000, 1, 0.01))
518
+
519
+ else :
520
+ st.write("here")
521
+ bounds = ((0, 0, 0, 0), (3 * 0.01, 1000, 1, 0.01))
522
+ else :
523
+ bounds = ((0, 0, 0, 0), (3 * y.max(), 1000, 1, x.max()))
524
+ #bounds = ((y.max(), 3*y.max()),(0,1000),(0,1),(0,x.max()))
525
+ params,_ = curve_fit(s_curve,x,y,p0=(2*y.max(),0.01,1e-5,x.max()),
526
+ bounds=bounds,
527
+ maxfev=int(1e5))
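+ # p0 and bounds follow the same parameter order that is unpacked below:
+ # (K, b, a, x0). Note: s_curve itself is defined elsewhere in this module;
+ # a common logistic form consistent with these parameter names would be
+ # K / (1 + b * np.exp(-a * (x - x0))), but that is an assumption here.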
528
+ mape = (100 * abs(1 - s_curve(x, *params) / y.clip(min=1))).mean()
529
+ rmse = np.sqrt(((y - s_curve(x,*params))**2).mean())
530
+ r2_ = r2_score(y, s_curve(x,*params))
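+ # y.clip(min=1) in the MAPE above avoids division by zero for periods with
+ # near-zero contribution; RMSE and R2 are computed on the unclipped values.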
531
+
532
+ response_curves[inp_col] = {'K' : params[0], 'b' : params[1], 'a' : params[2], 'x0' : params[3]}
533
+ mapes[inp_col] = mape
534
+ rmses[inp_col] = rmse
535
+ r2[inp_col] = r2_
536
+ powers[inp_col] = power
537
+
538
+
539
+ ## conversion rates
540
+ spend_col = [_col for _col in spend_df.columns if _col.startswith(inp_col.rsplit('_',1)[0])][0]
541
+
542
+ #print('#printing spendssss')
543
+ #print(spend_col)
544
+ conv = (spend_df.set_index('Week')[spend_col] / input_df.set_index('Date')[inp_col].clip(lower=1)).reset_index()
545
+ conv.rename(columns={'index':'Week'},inplace=True)
546
+ conv['year'] = conv.Week.dt.year
547
+ conv_rates[inp_col] = list(conv.drop('Week',axis=1).mean().to_dict().values())[0]
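+ # conv_rates[inp_col] is the mean of the weekly ratio spend / raw input
+ # (the raw input is clipped at 1 to avoid division by zero), i.e. an
+ # approximate cost per click/impression used to convert inputs to dollars.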
548
+ ##print('Before',conv_rates[inp_col])
549
+ # conv_rates[inp_col] = uopx_conv_rates[inp_col]
550
+ ##print('After',(conv_rates[inp_col]))
551
+
552
+
553
+ channel = Channel(name=inp_col,dates=dates,
554
+ spends=spends,
555
+ # conversion_rate = np.mean(list(conv_rates[inp_col].values())),
556
+ conversion_rate = conv_rates[inp_col],
557
+ response_curve_type='s-curve',
558
+ response_curve_params={'K' : params[0], 'b' : params[1], 'a' : params[2], 'x0' : params[3]},
559
+ bounds=np.array([-10,10]))
560
+ channels[inp_col] = channel
561
+ if sales is None:
562
+ sales = channel.actual_sales
563
+ else:
564
+ sales += channel.actual_sales
565
+ # st.write(inp_col, channel.actual_sales)
566
+ # st.write(output_cols)
567
+ other_contributions = output_df.drop([*output_cols], axis=1).sum(axis=1, numeric_only = True).values
568
+ correction = output_df.drop(['Date'],axis=1).sum(axis=1).values - (sales + other_contributions)
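+ # `correction` is the residual between the contribution sheet's row totals
+ # and (summed channel sales from the Channel objects + other_contributions);
+ # it is passed into the Scenario below so totals reconcile with the model.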
569
+
570
+ scenario_test_df=pd.DataFrame(columns=['other_contributions','correction', 'sales'])
571
+ scenario_test_df['other_contributions']=other_contributions
572
+ scenario_test_df['correction']=correction
573
+ scenario_test_df['sales']=sales
574
+ scenario_test_df.to_csv("test/scenario_test_df.csv",index=False)
575
+ output_df.to_csv("test/output_df.csv",index=False)
576
+
577
+ scenario = Scenario(name='default', channels=channels, constant=other_contributions, correction = correction)
578
+ ## setting session variables
579
+ st.session_state['initialized'] = True
580
+ st.session_state['actual_df'] = input_df
581
+ st.session_state['raw_df'] = raw_df
582
+ st.session_state['contri_df'] = output_df
583
+ default_scenario_dict = class_to_dict(scenario)
584
+ st.session_state['default_scenario_dict'] = default_scenario_dict
585
+ st.session_state['scenario'] = scenario
586
+ st.session_state['channels_list'] = channel_list
587
+ st.session_state['optimization_channels'] = {channel_name : False for channel_name in channel_list}
588
+ st.session_state['rcs'] = response_curves
589
+ st.session_state['powers'] = powers
590
+ st.session_state['actual_contribution_df'] = pd.DataFrame(actual_output_dic)
591
+ st.session_state['actual_input_df'] = pd.DataFrame(actual_input_dic)
592
+
593
+ for channel in channels.values():
594
+ st.session_state[channel.name] = numerize(channel.actual_total_spends * channel.conversion_rate,1)
595
+
596
+ st.session_state['xlsx_buffer'] = io.BytesIO()
597
+
598
+
599
+ if Path('../saved_scenarios.pkl').exists():
600
+ with open('../saved_scenarios.pkl','rb') as f:
601
+ st.session_state['saved_scenarios'] = pickle.load(f)
602
+ else:
603
+ st.session_state['saved_scenarios'] = OrderedDict()
604
+
605
+ st.session_state['total_spends_change'] = 0
606
+ st.session_state['optimization_channels'] = {channel_name : False for channel_name in channel_list}
607
+ st.session_state['disable_download_button'] = True
608
+
609
+ # def initialize_data():
610
+ # # fetch data from excel
611
+ # output = pd.read_excel('data.xlsx',sheet_name=None)
612
+ # raw_df = output['RAW DATA MMM']
613
+ # contribution_df = output['CONTRIBUTION MMM']
614
+ # Revenue_df = output['Revenue']
615
+
616
+ # ## channels to be shows
617
+ # channel_list = []
618
+ # for col in raw_df.columns:
619
+ # if 'click' in col.lower() or 'spend' in col.lower() or 'imp' in col.lower():
620
+ # ##print(col)
621
+ # channel_list.append(col)
622
+ # else:
623
+ # pass
624
+
625
+ # ## NOTE : Considered only Desktop spends for all calculations
626
+ # acutal_df = raw_df[raw_df.Region == 'Desktop'].copy()
627
+ # ## NOTE : Considered one year of data
628
+ # acutal_df = acutal_df[acutal_df.Date>'2020-12-31']
629
+ # actual_df = acutal_df.drop('Region',axis=1).sort_values(by='Date')[[*channel_list,'Date']]
630
+
631
+ # ##load response curves
632
+ # with open('./grammarly_response_curves.json','r') as f:
633
+ # response_curves = json.load(f)
634
+
635
+ # ## create channel dict for scenario creation
636
+ # dates = actual_df.Date.values
637
+ # channels = {}
638
+ # rcs = {}
639
+ # constant = 0.
640
+ # for i,info_dict in enumerate(response_curves):
641
+ # name = info_dict.get('name')
642
+ # response_curve_type = info_dict.get('response_curve')
643
+ # response_curve_params = info_dict.get('params')
644
+ # rcs[name] = response_curve_params
645
+ # if name != 'constant':
646
+ # spends = actual_df[name].values
647
+ # channel = Channel(name=name,dates=dates,
648
+ # spends=spends,
649
+ # response_curve_type=response_curve_type,
650
+ # response_curve_params=response_curve_params,
651
+ # bounds=np.array([-30,30]))
652
+
653
+ # channels[name] = channel
654
+ # else:
655
+ # constant = info_dict.get('value',0.) * len(dates)
656
+
657
+ # ## create scenario
658
+ # scenario = Scenario(name='default', channels=channels, constant=constant)
659
+ # default_scenario_dict = class_to_dict(scenario)
660
+
661
+
662
+ # ## setting session variables
663
+ # st.session_state['initialized'] = True
664
+ # st.session_state['actual_df'] = actual_df
665
+ # st.session_state['raw_df'] = raw_df
666
+ # st.session_state['default_scenario_dict'] = default_scenario_dict
667
+ # st.session_state['scenario'] = scenario
668
+ # st.session_state['channels_list'] = channel_list
669
+ # st.session_state['optimization_channels'] = {channel_name : False for channel_name in channel_list}
670
+ # st.session_state['rcs'] = rcs
671
+ # for channel in channels.values():
672
+ # if channel.name not in st.session_state:
673
+ # st.session_state[channel.name] = float(channel.actual_total_spends)
674
+
675
+ # if 'xlsx_buffer' not in st.session_state:
676
+ # st.session_state['xlsx_buffer'] = io.BytesIO()
677
+
678
+ # ## for saving scenarios
679
+ # if 'saved_scenarios' not in st.session_state:
680
+ # if Path('../saved_scenarios.pkl').exists():
681
+ # with open('../saved_scenarios.pkl','rb') as f:
682
+ # st.session_state['saved_scenarios'] = pickle.load(f)
683
+
684
+ # else:
685
+ # st.session_state['saved_scenarios'] = OrderedDict()
686
+
687
+ # if 'total_spends_change' not in st.session_state:
688
+ # st.session_state['total_spends_change'] = 0
689
+
690
+ # if 'optimization_channels' not in st.session_state:
691
+ # st.session_state['optimization_channels'] = {channel_name : False for channel_name in channel_list}
692
+
693
+ # if 'disable_download_button' not in st.session_state:
694
+ # st.session_state['disable_download_button'] = True
695
+ def create_channel_summary(scenario):
696
+ summary_columns = []
697
+
698
+ actual_spends_rows = []
699
+
700
+ actual_sales_rows = []
701
+
702
+ actual_roi_rows = []
703
+
704
+ for channel in scenario.channels.values():
705
+
706
+ name_mod = channel.name.replace('_', ' ')
707
+
708
+ if name_mod.lower().endswith(' imp'):
709
+ name_mod = name_mod.replace('Imp', 'Impressions')  # single space; '_' was already replaced with ' ' above
710
+
711
+ print(name_mod, channel.actual_total_spends, channel.conversion_rate,
712
+ channel.actual_total_spends * channel.conversion_rate)
713
+
714
+ summary_columns.append(name_mod)
715
+
716
+ actual_spends_rows.append(format_numbers(float(channel.actual_total_spends * channel.conversion_rate)))
717
+
718
+ actual_sales_rows.append(format_numbers((float(channel.actual_total_sales))))
719
+
720
+ actual_roi_rows.append(decimal_formater(
721
+ format_numbers((channel.actual_total_sales) / (channel.actual_total_spends * channel.conversion_rate),
722
+ include_indicator=False, n_decimals=4), n_decimals=4))
723
+
724
+ actual_summary_df = pd.DataFrame([summary_columns, actual_spends_rows, actual_sales_rows, actual_roi_rows]).T
725
+
726
+ actual_summary_df.columns = ['Channel', 'Spends', 'Prospects', 'ROI']
727
+
728
+ actual_summary_df['Prospects'] = actual_summary_df['Prospects'].map(lambda x: str(x)[1:])
729
+
730
+ return actual_summary_df
731
+
732
+
733
+ # def create_channel_summary(scenario):
734
+ #
735
+ # # Provided data
736
+ # data = {
737
+ # 'Channel': ['Paid Search', 'Ga will cid baixo risco', 'Digital tactic others', 'Fb la tier 1', 'Fb la tier 2', 'Paid social others', 'Programmatic', 'Kwai', 'Indicacao', 'Infleux', 'Influencer'],
738
+ # 'Spends': ['$ 11.3K', '$ 155.2K', '$ 50.7K', '$ 125.4K', '$ 125.2K', '$ 105K', '$ 3.3M', '$ 47.5K', '$ 55.9K', '$ 632.3K', '$ 48.3K'],
739
+ # 'Revenue': ['558.0K', '3.5M', '5.2M', '3.1M', '3.1M', '2.1M', '20.8M', '1.6M', '728.4K', '22.9M', '4.8M']
740
+ # }
741
+ #
742
+ # # Create DataFrame
743
+ # df = pd.DataFrame(data)
744
+ #
745
+ # # Convert currency strings to numeric values
746
+ # df['Spends'] = df['Spends'].replace({'\$': '', 'K': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(int)
747
+ # df['Revenue'] = df['Revenue'].replace({'\$': '', 'K': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(int)
748
+ #
749
+ # # Calculate ROI
750
+ # df['ROI'] = ((df['Revenue'] - df['Spends']) / df['Spends'])
751
+ #
752
+ # # Format columns
753
+ # format_currency = lambda x: f"${x:,.1f}"
754
+ # format_roi = lambda x: f"{x:.1f}"
755
+ #
756
+ # df['Spends'] = ['$ 11.3K', '$ 155.2K', '$ 50.7K', '$ 125.4K', '$ 125.2K', '$ 105K', '$ 3.3M', '$ 47.5K', '$ 55.9K', '$ 632.3K', '$ 48.3K']
757
+ # df['Revenue'] = ['$ 536.3K', '$ 3.4M', '$ 5M', '$ 3M', '$ 3M', '$ 2M', '$ 20M', '$ 1.5M', '$ 7.1M', '$ 22M', '$ 4.6M']
758
+ # df['ROI'] = df['ROI'].apply(format_roi)
759
+ #
760
+ # return df
761
+
762
+
763
+ @st.cache(allow_output_mutation=True)
764
+ def create_contribution_pie(scenario):
765
+ #c1f7dc
766
+ colors_map = {col:color for col,color in zip(st.session_state['channels_list'],plotly.colors.n_colors(plotly.colors.hex_to_rgb('#BE6468'), plotly.colors.hex_to_rgb('#E7B8B7'),23))}
767
+ total_contribution_fig = make_subplots(rows=1, cols=2,subplot_titles=['Spends','Revenue'],specs=[[{"type": "pie"}, {"type": "pie"}]])
768
+ total_contribution_fig.add_trace(
769
+ go.Pie(labels=[channel_name_formating(channel_name) for channel_name in st.session_state['channels_list']] + ['Non Media'],
770
+ values= [round(scenario.channels[channel_name].actual_total_spends * scenario.channels[channel_name].conversion_rate,1) for channel_name in st.session_state['channels_list']] + [0],
771
+ marker=dict(colors = [plotly.colors.label_rgb(colors_map[channel_name]) for channel_name in st.session_state['channels_list']] + ['#F0F0F0']),
772
+ hole=0.3),
773
+ row=1, col=1)
774
+
775
+ total_contribution_fig.add_trace(
776
+ go.Pie(labels=[channel_name_formating(channel_name) for channel_name in st.session_state['channels_list']] + ['Non Media'],
777
+ values= [scenario.channels[channel_name].actual_total_sales for channel_name in st.session_state['channels_list']] + [scenario.correction.sum() + scenario.constant.sum()],
778
+ hole=0.3),
779
+ row=1, col=2)
780
+
781
+ total_contribution_fig.update_traces(textposition='inside',texttemplate='%{percent:.1%}')
782
+ total_contribution_fig.update_layout(uniformtext_minsize=12,title='Channel contribution', uniformtext_mode='hide')
783
+ return total_contribution_fig
784
+
785
+ @st.cache(allow_output_mutation=True)
786
+
787
+ # def create_contribuion_stacked_plot(scenario):
788
+ # weekly_contribution_fig = make_subplots(rows=1, cols=2,subplot_titles=['Spends','Revenue'],specs=[[{"type": "bar"}, {"type": "bar"}]])
789
+ # raw_df = st.session_state['raw_df']
790
+ # df = raw_df.sort_values(by='Date')
791
+ # x = df.Date
792
+ # weekly_spends_data = []
793
+ # weekly_sales_data = []
794
+ # for channel_name in st.session_state['channels_list']:
795
+ # weekly_spends_data.append((go.Bar(x=x,
796
+ # y=scenario.channels[channel_name].actual_spends * scenario.channels[channel_name].conversion_rate,
797
+ # name=channel_name_formating(channel_name),
798
+ # hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}",
799
+ # legendgroup=channel_name)))
800
+ # weekly_sales_data.append((go.Bar(x=x,
801
+ # y=scenario.channels[channel_name].actual_sales,
802
+ # name=channel_name_formating(channel_name),
803
+ # hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
804
+ # legendgroup=channel_name, showlegend=False)))
805
+ # for _d in weekly_spends_data:
806
+ # weekly_contribution_fig.add_trace(_d, row=1, col=1)
807
+ # for _d in weekly_sales_data:
808
+ # weekly_contribution_fig.add_trace(_d, row=1, col=2)
809
+ # weekly_contribution_fig.add_trace(go.Bar(x=x,
810
+ # y=scenario.constant + scenario.correction,
811
+ # name='Non Media',
812
+ # hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}"), row=1, col=2)
813
+ # weekly_contribution_fig.update_layout(barmode='stack', title='Channel contribuion by week', xaxis_title='Date')
814
+ # weekly_contribution_fig.update_xaxes(showgrid=False)
815
+ # weekly_contribution_fig.update_yaxes(showgrid=False)
816
+ # return weekly_contribution_fig
817
+
818
+ # @st.cache(allow_output_mutation=True)
819
+ # def create_channel_spends_sales_plot(channel):
820
+ # if channel is not None:
821
+ # x = channel.dates
822
+ # _spends = channel.actual_spends * channel.conversion_rate
823
+ # _sales = channel.actual_sales
824
+ # channel_sales_spends_fig = make_subplots(specs=[[{"secondary_y": True}]])
825
+ # channel_sales_spends_fig.add_trace(go.Bar(x=x, y=_sales,marker_color='#c1f7dc',name='Revenue', hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}"), secondary_y = False)
826
+ # channel_sales_spends_fig.add_trace(go.Scatter(x=x, y=_spends,line=dict(color='#005b96'),name='Spends',hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}"), secondary_y = True)
827
+ # channel_sales_spends_fig.update_layout(xaxis_title='Date',yaxis_title='Revenue',yaxis2_title='Spends ($)',title='Channel spends and Revenue week wise')
828
+ # channel_sales_spends_fig.update_xaxes(showgrid=False)
829
+ # channel_sales_spends_fig.update_yaxes(showgrid=False)
830
+ # else:
831
+ # raw_df = st.session_state['raw_df']
832
+ # df = raw_df.sort_values(by='Date')
833
+ # x = df.Date
834
+ # scenario = class_from_dict(st.session_state['default_scenario_dict'])
835
+ # _sales = scenario.constant + scenario.correction
836
+ # channel_sales_spends_fig = make_subplots(specs=[[{"secondary_y": True}]])
837
+ # channel_sales_spends_fig.add_trace(go.Bar(x=x, y=_sales,marker_color='#c1f7dc',name='Revenue', hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}"), secondary_y = False)
838
+ # # channel_sales_spends_fig.add_trace(go.Scatter(x=x, y=_spends,line=dict(color='#15C39A'),name='Spends',hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}"), secondary_y = True)
839
+ # channel_sales_spends_fig.update_layout(xaxis_title='Date',yaxis_title='Revenue',yaxis2_title='Spends ($)',title='Channel spends and Revenue week wise')
840
+ # channel_sales_spends_fig.update_xaxes(showgrid=False)
841
+ # channel_sales_spends_fig.update_yaxes(showgrid=False)
842
+ # return channel_sales_spends_fig
843
+
844
+
845
+ # Define a shared color palette
846
+
847
+
848
+ # def create_contribution_pie():
849
+ # color_palette = ['#F3F3F0', '#5E7D7E', '#2FA1FF', '#00EDED', '#00EAE4', '#304550', '#EDEBEB', '#7FBEFD', '#003059', '#A2F3F3', '#E1D6E2', '#B6B6B6']
850
+ # total_contribution_fig = make_subplots(rows=1, cols=2, subplot_titles=['Spends', 'Revenue'], specs=[[{"type": "pie"}, {"type": "pie"}]])
851
+ #
852
+ # channels_list = ['Paid Search', 'Ga will cid baixo risco', 'Digital tactic others', 'Fb la tier 1', 'Fb la tier 2', 'Paid social others', 'Programmatic', 'Kwai', 'Indicacao', 'Infleux', 'Influencer', 'Non Media']
853
+ #
854
+ # # Assign colors from the limited palette to channels
855
+ # colors_map = {col: color_palette[i % len(color_palette)] for i, col in enumerate(channels_list)}
856
+ # colors_map['Non Media'] = color_palette[5] # Assign fixed green color for 'Non Media'
857
+ #
858
+ # # Hardcoded values for Spends and Revenue
859
+ # spends_values = [0.5, 3.36, 1.1, 2.7, 2.7, 2.27, 70.6, 1, 1, 13.7, 1, 0]
860
+ # revenue_values = [1, 4, 5, 3, 3, 2, 50.8, 1.5, 0.7, 13, 0, 16]
861
+ #
862
+ # # Add trace for Spends pie chart
863
+ # total_contribution_fig.add_trace(
864
+ # go.Pie(
865
+ # labels=[channel_name for channel_name in channels_list],
866
+ # values=spends_values,
867
+ # marker=dict(colors=[colors_map[channel_name] for channel_name in channels_list]),
868
+ # hole=0.3
869
+ # ),
870
+ # row=1, col=1
871
+ # )
872
+ #
873
+ # # Add trace for Revenue pie chart
874
+ # total_contribution_fig.add_trace(
875
+ # go.Pie(
876
+ # labels=[channel_name for channel_name in channels_list],
877
+ # values=revenue_values,
878
+ # marker=dict(colors=[colors_map[channel_name] for channel_name in channels_list]),
879
+ # hole=0.3
880
+ # ),
881
+ # row=1, col=2
882
+ # )
883
+ #
884
+ # total_contribution_fig.update_traces(textposition='inside', texttemplate='%{percent:.1%}')
885
+ # total_contribution_fig.update_layout(uniformtext_minsize=12, title='Channel contribution', uniformtext_mode='hide')
886
+ # return total_contribution_fig
887
+
888
+ def create_contribuion_stacked_plot(scenario):
889
+ weekly_contribution_fig = make_subplots(rows=1, cols=2, subplot_titles=['Spends', 'Revenue'], specs=[[{"type": "bar"}, {"type": "bar"}]])
890
+ raw_df = st.session_state['raw_df']
891
+ df = raw_df.sort_values(by='Date')
892
+ x = df.Date
893
+ weekly_spends_data = []
894
+ weekly_sales_data = []
895
+
896
+ for i, channel_name in enumerate(st.session_state['channels_list']):
897
+ color = color_palette[i % len(color_palette)]
898
+
899
+ weekly_spends_data.append(go.Bar(
900
+ x=x,
901
+ y=scenario.channels[channel_name].actual_spends * scenario.channels[channel_name].conversion_rate,
902
+ name=channel_name_formating(channel_name),
903
+ hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}",
904
+ legendgroup=channel_name,
905
+ marker_color=color,
906
+ ))
907
+
908
+ weekly_sales_data.append(go.Bar(
909
+ x=x,
910
+ y=scenario.channels[channel_name].actual_sales,
911
+ name=channel_name_formating(channel_name),
912
+ hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
913
+ legendgroup=channel_name,
914
+ showlegend=False,
915
+ marker_color=color,
916
+ ))
917
+
918
+ for _d in weekly_spends_data:
919
+ weekly_contribution_fig.add_trace(_d, row=1, col=1)
920
+ for _d in weekly_sales_data:
921
+ weekly_contribution_fig.add_trace(_d, row=1, col=2)
922
+
923
+ weekly_contribution_fig.add_trace(go.Bar(
924
+ x=x,
925
+ y=scenario.constant + scenario.correction,
926
+ name='Non Media',
927
+ hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
928
+ marker_color=color_palette[-1],
929
+ ), row=1, col=2)
930
+
931
+ weekly_contribution_fig.update_layout(barmode='stack', title='Channel contribution by week', xaxis_title='Date')
932
+ weekly_contribution_fig.update_xaxes(showgrid=False)
933
+ weekly_contribution_fig.update_yaxes(showgrid=False)
934
+ return weekly_contribution_fig
935
+
936
+ def create_channel_spends_sales_plot(channel):
937
+ if channel is not None:
938
+ x = channel.dates
939
+ _spends = channel.actual_spends * channel.conversion_rate
940
+ _sales = channel.actual_sales
941
+ channel_sales_spends_fig = make_subplots(specs=[[{"secondary_y": True}]])
942
+ channel_sales_spends_fig.add_trace(go.Bar(
943
+ x=x,
944
+ y=_sales,
945
+ marker_color=color_palette[3], # You can choose a color from the palette
946
+ name='Revenue',
947
+ hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
948
+ ), secondary_y=False)
949
+
950
+ channel_sales_spends_fig.add_trace(go.Scatter(
951
+ x=x,
952
+ y=_spends,
953
+ line=dict(color=color_palette[2]), # You can choose another color from the palette
954
+ name='Spends',
955
+ hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}",
956
+ ), secondary_y=True)
957
+
958
+ channel_sales_spends_fig.update_layout(xaxis_title='Date', yaxis_title='Revenue', yaxis2_title='Spends ($)', title='Channel spends and Revenue week-wise')
959
+ channel_sales_spends_fig.update_xaxes(showgrid=False)
960
+ channel_sales_spends_fig.update_yaxes(showgrid=False)
961
+ else:
962
+ raw_df = st.session_state['raw_df']
963
+ df = raw_df.sort_values(by='Date')
964
+ x = df.Date
965
+ scenario = class_from_dict(st.session_state['default_scenario_dict'])
966
+ _sales = scenario.constant + scenario.correction
967
+ channel_sales_spends_fig = make_subplots(specs=[[{"secondary_y": True}]])
968
+ channel_sales_spends_fig.add_trace(go.Bar(
969
+ x=x,
970
+ y=_sales,
971
+ marker_color=color_palette[0], # You can choose a color from the palette
972
+ name='Revenue',
973
+ hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
974
+ ), secondary_y=False)
975
+
976
+ channel_sales_spends_fig.update_layout(xaxis_title='Date', yaxis_title='Revenue', yaxis2_title='Spends ($)', title='Channel spends and Revenue week-wise')
977
+ channel_sales_spends_fig.update_xaxes(showgrid=False)
978
+ channel_sales_spends_fig.update_yaxes(showgrid=False)
979
+
980
+ return channel_sales_spends_fig
981
+
982
+ def format_numbers(value, n_decimals=1,include_indicator = True):
983
+ if include_indicator:
984
+ return f'{CURRENCY_INDICATOR} {numerize(value,n_decimals)}'
985
+ else:
986
+ return f'{numerize(value,n_decimals)}'
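+ # Example: with CURRENCY_INDICATOR set to '$', format_numbers(1234567)
+ # returns roughly '$ 1.2M' (the exact string depends on the numerize package);
+ # format_numbers(1234567, include_indicator=False) drops the currency symbol.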
987
+
988
+
989
+ def decimal_formater(num_string,n_decimals=1):
990
+ parts = num_string.split('.')
991
+ if len(parts) == 1:
992
+ return num_string+'.' + '0'*n_decimals
993
+ else:
994
+ to_be_padded = n_decimals - len(parts[-1])
995
+ if to_be_padded > 0 :
996
+ return num_string+'0'*to_be_padded
997
+ else:
998
+ return num_string
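+ # Examples: decimal_formater('1.2', n_decimals=4) -> '1.2000';
+ # decimal_formater('3') -> '3.0'; strings already at or beyond the target
+ # precision are returned unchanged.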
999
+
1000
+
1001
+ def channel_name_formating(channel_name):
1002
+ name_mod = channel_name.replace('_', ' ')
1003
+ if name_mod.lower().endswith(' imp'):
1004
+ name_mod = name_mod.replace('Imp','Spend')
1005
+ elif name_mod.lower().endswith(' clicks'):
1006
+ name_mod = name_mod.replace('Clicks','Spend')
1007
+ return name_mod
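+ # Examples: 'Facebook_Imp' -> 'Facebook Spend', 'Google_Clicks' -> 'Google Spend';
+ # names without an Imp/Clicks suffix only have underscores replaced by spaces.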
1008
+
1009
+
1010
+ def send_email(email,message):
1011
+ s = smtplib.SMTP('smtp.gmail.com', 587)
1012
+ s.starttls()
1013
+ s.login("geethu4444@gmail.com", "jgydhpfusuremcol")
1014
+ s.sendmail("geethu4444@gmail.com", email, message)
1015
+ s.quit()
1016
+
1017
+ if __name__ == "__main__":
1018
+ initialize_data(target_col)  # initialize_data requires the target metric name; assumes a module-level target_col, like the date_col/panel_col globals used above