# Importing necessary libraries import streamlit as st import pickle st.set_page_config( page_title="Model Build", page_icon=":shark:", layout="wide", initial_sidebar_state="collapsed", ) from utilities import load_authenticator import numpy as np import pandas as pd from utilities import set_header, load_local_css load_local_css("styles.css") set_header() for k, v in st.session_state.items(): if k not in ['logout', 'login','config'] and not k.startswith('FormSubmitter'): st.session_state[k] = v authenticator = st.session_state.get('authenticator') if authenticator is None: authenticator = load_authenticator() name, authentication_status, username = authenticator.login('Login', 'main') auth_status = st.session_state.get('authentication_status') if auth_status == True: is_state_initiaized = st.session_state.get('initialized',False) if not is_state_initiaized: a=1 # Function to expand dataframe to daily @st.cache_resource(show_spinner=False) def expand_to_daily(df, granularity, start_date, end_date): # Create a new DataFrame with a row for each day all_dates = pd.date_range(start=start_date, end=end_date, freq="D") daily_df = pd.DataFrame(all_dates, columns=["Date"]) if granularity == "daily": # For daily data, simply merge to fill missing dates daily_df = daily_df.merge(df, on="Date", how="left") else: # For weekly or monthly, distribute values to daily rows for column in df.columns: if column != "Date": # Skip 'Date' column daily_df[column] = np.nan # Initialize with NaNs # Group by the required frequency and distribute values freq = "W-MON" if granularity == "weekly" else "MS" for _, group in df.groupby(pd.Grouper(key="Date", freq=freq)): num_days = len( pd.date_range(group["Date"].min(), group["Date"].max(), freq="D") ) for column in group.columns: if column == "Date": # Skip 'Date' column continue value = group[column].sum() / num_days date_range = pd.date_range( group["Date"].min(), periods=num_days, freq="D" ) daily_df.loc[daily_df["Date"].isin(date_range), column] = value return daily_df # Function to validate date column in dataframe def validate_date_column(df): try: # Attempt to convert the 'Date' column to datetime df["Date"] = pd.to_datetime(df["Date"], format="%d-%m-%Y") return True except: return False # Function to determine data interval def determine_data_interval(common_freq): if common_freq == 1: return "daily" elif common_freq == 7: return "weekly" elif 28 <= common_freq <= 31: return "monthly" else: return "irregular" # Function to convert and fill dates in dataframe def convert_and_fill_dates(df, start_date, end_date, interval): # Create a date range for the desired period all_dates = pd.date_range(start=start_date, end=end_date, freq="D") new_df = pd.DataFrame(all_dates, columns=["Date"]) # Preprocess and aggregate data based on the original interval if interval != "daily": # Resample to start of each week/month, then sum values for the same period if interval == "weekly": df = df.resample("W-MON", on="Date").sum().reset_index() elif interval == "monthly": df = df.resample("MS", on="Date").sum().reset_index() # Distribute values equally across the days in each week/month expanded_rows = [] for _, row in df.iterrows(): if interval == "weekly": period_dates = pd.date_range(row["Date"], periods=7) elif interval == "monthly": period_end = row["Date"] + pd.offsets.MonthEnd(1) period_dates = pd.date_range(row["Date"], period_end) for date in period_dates: new_row = row.copy() new_row["Date"] = date for col in df.columns: if col != "Date": # Skip 'Date' column new_row[col] = row[col] / len(period_dates) expanded_rows.append(new_row) # Create a DataFrame from expanded rows expanded_df = pd.DataFrame(expanded_rows) new_df = pd.merge(new_df, expanded_df, how="left", on="Date") else: # Daily data, aggregate if there are multiple entries for the same day df = df.groupby("Date").sum().reset_index() new_df = pd.merge(new_df, df, how="left", on="Date") # Ensure all dates from start to end are present, filling missing values with NaN new_df["Date"] = pd.to_datetime(new_df["Date"]) # Ensure 'Date' is datetime type new_df = new_df.set_index("Date").reindex(all_dates).reset_index() new_df.rename(columns={"index": "Date"}, inplace=True) return new_df # Function to convert a DataFrame from daily level granularity to either weekly or monthly level def convert_to_higher_granularity(df, required_granularity): if required_granularity == "daily": return df # Ensure 'Date' is the index and is in datetime format if not pd.api.types.is_datetime64_any_dtype(df["Date"]): df["Date"] = pd.to_datetime(df["Date"]) df.set_index("Date", inplace=True) # Resample and aggregate if required_granularity == "weekly": # Resample to weekly, using 'W-MON' to indicate weeks starting on Monday df = df.resample("W-MON").sum() elif required_granularity == "monthly": # Resample to monthly, using 'MS' to indicate month start df = df.resample("MS").sum() # Reset index to move 'Date' back to a column df.reset_index(inplace=True) return df # # Read the CSV file, parsing 'Date' column as datetime main_df = pd.read_csv("Media_data_for_model_dma_level.csv", dayfirst=True, parse_dates=["Date"]) # st.write(main_df) # Get the start date (minimum) and end date (maximum) from the 'Date' column api_start_date = main_df["Date"].min() api_end_date = main_df["Date"].max() # Infer the granularity from the most common difference between consecutive dates date_diffs = main_df["Date"].diff().dt.days.dropna() common_diff = date_diffs.mode()[0] api_granularity = determine_data_interval(common_diff) # Convert the DataFrame to daily level granularity main_df = expand_to_daily(main_df, api_granularity, api_start_date, api_end_date) # Page Title st.title("Data Import") # File uploader uploaded_files = st.file_uploader( "Upload additional data", type=["xlsx"], accept_multiple_files=True ) # Custom HTML for upload instructions recommendation_html = f"""
Date
column formatted as DD-MM-YYYY
, be free of missing values, and aggregated to a {api_granularity} level.