import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from Eda_functions import format_numbers
import numpy as np
import pickle
from st_aggrid import AgGrid
from st_aggrid import GridOptionsBuilder, GridUpdateMode
from utilities import set_header, load_local_css
import time
import itertools
import statsmodels.api as sm
import re
from sklearn.metrics import mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler
import os
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor
from datetime import datetime
import seaborn as sns
from Data_prep_functions import *

# set_page_config must be the first Streamlit command in the script
st.set_page_config(
    page_title="Model Build",
    page_icon=":shark:",
    layout="wide",
    initial_sidebar_state='collapsed'
)
st.set_option('deprecation.showPyplotGlobalUse', False)

load_local_css('styles.css')
set_header()

st.title('1. Build Your Model')

# media_data = pd.read_csv('Media_data_for_model.csv')
media_data = pd.read_csv('Media_data_for_model_dma_level.csv')

date = media_data['Date']
st.session_state['date'] = date
revenue = media_data['Total Approved Accounts - Revenue']
media_data.drop(['Total Approved Accounts - Revenue'], axis=1, inplace=True)
media_data.drop(['Date'], axis=1, inplace=True)
media_data.reset_index(drop=True, inplace=True)
# Note: rows dropped here are not removed from `date`/`revenue`, which were
# extracted above; the input file is assumed to have no missing rows.
media_data.dropna(inplace=True)

if st.toggle('Apply Transformations on DMA/Panel Level'):
    dma = st.selectbox('Select the level of data',
                       [col for col in media_data.columns if col.lower() in ['dma', 'panel']])
else:
    # TODO: code to aggregate data on date
    dma = None

# dma_dict = {dm: media_data[media_data[dma] == dm] for dm in media_data[dma].unique()}
# st.write(dma_dict)

st.markdown('## Select the Range of Transformations')
columns = st.columns(2)
old_shape = media_data.shape
if "old_shape" not in st.session_state:
    st.session_state['old_shape'] = old_shape

with columns[0]:
    slider_value_adstock = st.slider('Select Adstock Range (only applied to media)',
                                     0.0, 1.0, (0.2, 0.4), step=0.1, format="%.2f")
with columns[1]:
    slider_value_lag = st.slider('Select Lag Range (applied to media, seasonal and macroeconomic variables)',
                                 1, 7, (1, 3), step=1)

# with columns[2]:
#     slider_value_power = st.slider('Select Power Range (only applied to media)', 0, 4, (1, 2), step=1)
# with columns[1]:
#     st.number_input('Select the range of the half-saturation point', min_value=1, max_value=5)
#     st.number_input('Select the range of ')


def lag(data, features, lags, dma=None):
    if dma:
        # Shift within each DMA/panel so a lag never bleeds across panels
        transformed_data = pd.concat(
            [data.groupby([dma])[features].shift(lag).add_suffix(f'_lag_{lag}') for lag in lags],
            axis=1)
        transformed_data = transformed_data.bfill()
        return pd.concat([transformed_data, data], axis=1)
    else:
        # data should be aggregated on date
        transformed_data = pd.concat(
            [data[features].shift(lag).add_suffix(f'_lag_{lag}') for lag in lags],
            axis=1)
        transformed_data = transformed_data.bfill()
        return pd.concat([transformed_data, data], axis=1)
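
# A minimal sketch of what `lag` produces, on a toy two-panel frame (the
# frame and column names below are illustrative, not the app's data); kept
# as an uncalled helper so it does not run as part of the page.
def _lag_demo():
    toy = pd.DataFrame({
        'Panel': ['A', 'A', 'A', 'B', 'B', 'B'],
        'spend': [10, 20, 30, 5, 15, 25],
    })
    out = lag(toy, features=['spend'], lags=[1, 2], dma='Panel')
    # Adds spend_lag_1 and spend_lag_2, shifted within each panel and then
    # backfilled, e.g. panel A's spend_lag_1 becomes [10, 10, 20].
    return out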

# adstock: geometrically decaying carryover, truncated after `cutoff` periods
def adstock(df, alphas, cutoff, features, dma=None):
    if dma:
        # Assumes rows arrive grouped by DMA/panel, since the per-DMA results
        # are re-stacked positionally before being joined back onto `df`
        transformed_data = pd.DataFrame()
        for d in df[dma].unique():
            dma_sub_df = df[df[dma] == d]
            n = len(dma_sub_df)
            weights = np.array([[[alpha**(i - j) if i >= j and j >= i - cutoff else 0.
                                  for j in range(n)] for i in range(n)] for alpha in alphas])
            X = dma_sub_df[features].to_numpy()
            res = pd.DataFrame(np.hstack(weights @ X),
                               columns=[f'{col}_adstock_{alpha}' for alpha in alphas for col in features])
            transformed_data = pd.concat([transformed_data, res], axis=0)
        transformed_data.reset_index(drop=True, inplace=True)
        return pd.concat([transformed_data, df], axis=1)
    else:
        n = len(df)
        weights = np.array([[[alpha**(i - j) if i >= j and j >= i - cutoff else 0.
                              for j in range(n)] for i in range(n)] for alpha in alphas])
        X = df[features].to_numpy()
        res = pd.DataFrame(np.hstack(weights @ X),
                           columns=[f'{col}_adstock_{alpha}' for alpha in alphas for col in features])
        return pd.concat([res, df], axis=1)
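
# A minimal sketch of the decay behind `adstock`: the weight matrix applies
# y[i] = sum over j in [i - cutoff, i] of alpha**(i - j) * x[j], so a single
# impulse of 100 with alpha = 0.5 decays as 100, 50, 25, ... Uncalled helper;
# the numbers are illustrative only.
def _adstock_demo():
    toy = pd.DataFrame({'spend': [100., 0., 0., 0.]})
    out = adstock(toy, alphas=[0.5], cutoff=8, features=['spend'])
    # out['spend_adstock_0.5'] is [100.0, 50.0, 25.0, 12.5]
    return out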

if 'media_data' not in st.session_state:
    st.session_state['media_data'] = pd.DataFrame()

variables_to_be_transformed = [col for col in media_data.columns
                               if col.lower() not in ['dma', 'panel']]  # change for buckets

if st.button('Apply Transformations'):
    with st.spinner('Applying Transformations'):
        transformed_data_lag = lag(media_data,
                                   features=variables_to_be_transformed,
                                   lags=np.arange(slider_value_lag[0], slider_value_lag[1] + 1, 1),
                                   dma=dma)
        variables_to_be_transformed = [col for col in list(transformed_data_lag.columns)
                                       if col not in ['Date', 'DMA', 'Panel']]  # change for buckets
        # Rounding keeps float artifacts (e.g. 0.30000000000000004) out of
        # the generated column names
        transformed_data_adstock = adstock(df=transformed_data_lag,
                                           alphas=np.round(np.arange(slider_value_adstock[0], slider_value_adstock[1] + 0.1, 0.1), 2),
                                           cutoff=8,
                                           features=variables_to_be_transformed,
                                           dma=dma)
    st.success('Done')
    st.write(f'old shape {old_shape}, new shape {transformed_data_adstock.shape}')
    st.write(media_data.head(10))
    st.write(transformed_data_adstock)
    st.write(transformed_data_adstock.isnull().sum().sort_values(ascending=False))

# st.write(dma_dict)
# st.session_state['media_data'] = media_data

# with st.spinner('Applying Transformations'):
#     time.sleep(2)
#     st.success("Transformations complete!")

# if st.session_state['media_data'].shape[1] > old_shape[1]:
#     with columns[0]:
#         st.write(f'Total no. of variables before transformation: {old_shape[1]}, '
#                  f'total no. of variables after transformation: {st.session_state["media_data"].shape[1]}')

# bucket = ['paid_search', 'kwai', 'indicacao', 'infleux', 'influencer',
#           'FB: Level Achieved - Tier 1 Impressions',
#           ' FB: Level Achieved - Tier 2 Impressions', 'paid_social_others',
#           ' GA App: Will And Cid Pequena Baixo Risco Clicks',
#           'digital_tactic_others', "programmatic"]

# with columns[1]:
#     if st.button('Create Combinations of Variables'):
#         top_correlated_features = []
#         for col in st.session_state['media_data'].columns[:19]:
#             corr_df = pd.concat([st.session_state['media_data'].filter(regex=col),
#                                  revenue], axis=1).corr()['Total Approved Accounts - Revenue'].iloc[:-1]
#             top_correlated_features.append(list(corr_df.sort_values(ascending=False).head(2).index))
#         flattened_list = [item for sublist in top_correlated_features for item in sublist]
#         all_features_set = {var: [col for col in flattened_list if var in col] for var in bucket}
#         channels_all = [values for values in all_features_set.values()]
#         st.session_state['combinations'] = list(itertools.product(*channels_all))
#         st.session_state['final_selection'] = st.session_state['combinations']

# revenue.reset_index(drop=True, inplace=True)

# if 'Model_results' not in st.session_state:
#     st.session_state['Model_results'] = {'Model_object': [],
#                                          'Model_iteration': [],
#                                          'Feature_set': [],
#                                          'MAPE': [],
#                                          'R2': [],
#                                          'ADJR2': []}

# if 'iterations' not in st.session_state:
#     st.session_state['iterations'] = 1

# save_path = r"Model"
# with columns[1]:
#     if "final_selection" in st.session_state:
#         st.write(f'Total combinations created: {format_numbers(len(st.session_state["final_selection"]))}')
#         st.success('Done')

# if st.checkbox('Build all iterations'):
#     iterations = len(st.session_state['final_selection'])
# else:
#     iterations = st.number_input('Select the number of iterations to perform',
#                                  min_value=1, step=100, value=st.session_state['iterations'])
# st.session_state['iterations'] = iterations

# st.session_state['media_data'] = st.session_state['media_data'].ffill()

# if st.button("Build Models"):
#     st.markdown('Data Split -- Training Period: May 9th, 2023 - October 5th, 2023; Testing Period: October 6th, 2023 - November 7th, 2023')
#     progress_bar = st.progress(0)  # initialize the progress bar
#     start_time = time.time()  # record the start time
#     progress_text = st.empty()
#     for i, selected_features in enumerate(st.session_state["final_selection"][40000:40000 + int(iterations)]):
#         df = st.session_state['media_data']
#         fet = [var for var in selected_features if len(var) > 0]
#         X = df[fet]
#         y = revenue
#         ss = MinMaxScaler()
#         X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
#         X = sm.add_constant(X)
#         X_train = X.iloc[:150]
#         X_test = X.iloc[150:]
#         y_train = y.iloc[:150]
#         y_test = y.iloc[150:]
#         model = sm.OLS(y_train, X_train).fit()
#         positive_coeff = X.columns
#         negative_coeff = []
#         coefficients = model.params.to_dict()
#         model_positive = [col for col in coefficients.keys() if coefficients[col] > 0]
#         pvalues = [var for var in list(model.pvalues) if var <= 0.06]
#         # Keep a model only if more than 90% of its coefficients are positive
#         # and at least 80% of its p-values clear the 0.06 threshold
#         if (len(model_positive) / len(selected_features)) > 0.9 and (len(pvalues) / len(selected_features)) >= 0.8:
#             predicted_values = model.predict(X_train)
#             mape = mean_absolute_percentage_error(y_train, predicted_values)
#             adjr2 = model.rsquared_adj
#             r2 = model.rsquared
#             filename = os.path.join(save_path, f"model_{i}.pkl")
#             with open(filename, "wb") as f:
#                 pickle.dump(model, f)
#             st.session_state['Model_results']['Model_object'].append(filename)
#             st.session_state['Model_results']['Model_iteration'].append(i)
#             st.session_state['Model_results']['Feature_set'].append(fet)
#             st.session_state['Model_results']['MAPE'].append(mape)
#             st.session_state['Model_results']['R2'].append(r2)
#             st.session_state['Model_results']['ADJR2'].append(adjr2)
#         current_time = time.time()
#         time_taken = current_time - start_time
#         time_elapsed_minutes = time_taken / 60
#         completed_iterations_text = f"{i + 1}/{iterations}"
#         progress_bar.progress((i + 1) / int(iterations))
#         progress_text.text(f'Completed iterations: {completed_iterations_text}, time elapsed (min): {time_elapsed_minutes:.2f}')
#     st.write(f'Out of {st.session_state["iterations"]} iterations: {len(st.session_state["Model_results"]["Model_object"])} valid models')
#     pd.DataFrame(st.session_state['Model_results']).to_csv('model_output.csv')
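
# The commented-out build loop above screens each OLS fit on two ratios: the
# share of positive coefficients and the share of p-values <= 0.06. A minimal
# sketch of that screen as a standalone, uncalled helper (the function name
# and keyword arguments are illustrative; the thresholds mirror the loop):
def _passes_screen(model, n_features, p_cutoff=0.06,
                   min_positive_share=0.9, min_significant_share=0.8):
    positive = sum(coef > 0 for coef in model.params)
    significant = sum(p <= p_cutoff for p in model.pvalues)
    return (positive / n_features) > min_positive_share and \
           (significant / n_features) >= min_significant_share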

# def to_percentage(value):
#     return f'{value * 100:.1f}%'

# st.title('2. Select Models')

# if 'tick' not in st.session_state:
#     st.session_state['tick'] = False
# if st.checkbox('Show results of top 10 models (based on MAPE and Adj. R2)', value=st.session_state['tick']):
#     st.session_state['tick'] = True
#     st.write('Select one model iteration to generate performance metrics for it:')
#     data = pd.DataFrame(st.session_state['Model_results'])
#     data.sort_values(by=['MAPE'], ascending=True, inplace=True)  # lower MAPE is better
#     data.drop_duplicates(subset='Model_iteration', inplace=True)
#     top_10 = data.head(10)
#     top_10['Rank'] = np.arange(1, len(top_10) + 1, 1)
#     top_10[['MAPE', 'R2', 'ADJR2']] = np.round(top_10[['MAPE', 'R2', 'ADJR2']], 4).applymap(to_percentage)
#     top_10_table = top_10[['Rank', 'Model_iteration', 'MAPE', 'ADJR2', 'R2']]
#     gd = GridOptionsBuilder.from_dataframe(top_10_table)
#     gd.configure_pagination(enabled=True)
#     gd.configure_selection(use_checkbox=True)
#     gridoptions = gd.build()
#     table = AgGrid(top_10, gridOptions=gridoptions, update_mode=GridUpdateMode.SELECTION_CHANGED)
#     selected_rows = table.selected_rows
#     st.session_state["selected_rows"] = selected_rows
#     if 'Model' not in st.session_state:
#         st.session_state['Model'] = {}

#     if len(selected_rows) > 0:
#         st.header('2.1 Results Summary')
#         model_object = data[data['Model_iteration'] == selected_rows[0]['Model_iteration']]['Model_object']
#         features_set = data[data['Model_iteration'] == selected_rows[0]['Model_iteration']]['Feature_set']
#         with open(str(model_object.values[0]), 'rb') as file:
#             model = pickle.load(file)
#         st.write(model.summary())

#         st.header('2.2 Actual vs. Predicted Plot')
#         df = st.session_state['media_data']
#         X = df[features_set.values[0]]
#         X = sm.add_constant(X)
#         y = revenue
#         X_train = X.iloc[:150]
#         X_test = X.iloc[150:]
#         y_train = y.iloc[:150]
#         y_test = y.iloc[150:]
#         ss = MinMaxScaler()
#         X_train = pd.DataFrame(ss.fit_transform(X_train), columns=X_train.columns)
#         st.session_state['X'] = X_train
#         st.session_state['features_set'] = features_set.values[0]
#         metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(date, y_train, model.predict(X_train), model, target_column='Revenue')
#         st.plotly_chart(actual_vs_predicted_plot, use_container_width=True)

#         st.markdown('## 2.3 Residual Analysis')
#         columns = st.columns(2)
#         with columns[0]:
#             fig = plot_residual_predicted(y_train, model.predict(X_train), X_train)
#             st.plotly_chart(fig)
#         with columns[1]:
#             st.empty()
#             fig = qqplot(y_train, model.predict(X_train))
#             st.plotly_chart(fig)
#         with columns[0]:
#             fig = residual_distribution(y_train, model.predict(X_train))
#             st.pyplot(fig)

#         vif_data = pd.DataFrame()
#         vif_data["Variable"] = X_train.columns
#         vif_data["VIF"] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
#         vif_data.sort_values(by=['VIF'], ascending=False, inplace=True)
#         vif_data = np.round(vif_data)
#         vif_data['VIF'] = vif_data['VIF'].astype(float)
#         st.header('2.4 Variance Inflation Factor (VIF)')
#         color_mapping = {
#             'darkgreen': (vif_data['VIF'] < 3),
#             'orange': (vif_data['VIF'] >= 3) & (vif_data['VIF'] <= 10),
#             'darkred': (vif_data['VIF'] > 10)
#         }

#         # Create a horizontal bar plot
#         fig, ax = plt.subplots()
#         fig.set_figwidth(10)  # adjust the width of the figure as needed
#         # Sort the bars by descending VIF values
#         vif_data = vif_data.sort_values(by='VIF', ascending=False)
#         # Iterate through the color mapping and plot bars with corresponding colors
#         for color, condition in color_mapping.items():
#             subset = vif_data[condition]
#             bars = ax.barh(subset["Variable"], subset["VIF"], color=color, label=color)
#             # Add text annotations at the end of the bars
#             for bar in bars:
#                 width = bar.get_width()
#                 ax.annotate(f'{width}', xy=(width, bar.get_y() + bar.get_height() / 2),
#                             xytext=(5, 0), textcoords='offset points', va='center')
#         # Customize the plot
#         ax.set_xlabel('VIF Values')
#         # Display the plot in Streamlit
#         st.pyplot(fig)
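
# A minimal sketch of the VIF computation used in the commented-out section
# above, as an uncalled helper: each column is regressed on the others and
# VIF_i = 1 / (1 - R_i^2), with the usual reading mirrored by the colors
# above (< 3 low, 3-10 moderate, > 10 high multicollinearity).
def _vif_table(X):
    return pd.DataFrame({
        'Variable': X.columns,
        'VIF': [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
    }).sort_values(by='VIF', ascending=False)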

#         with st.expander('Results Summary - Test Data'):
#             ss = MinMaxScaler()
#             X_test = pd.DataFrame(ss.fit_transform(X_test), columns=X_test.columns)
#             st.header('2.2 Actual vs. Predicted Plot')
#             metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(date, y_test, model.predict(X_test), model, target_column='Revenue')
#             st.plotly_chart(actual_vs_predicted_plot, use_container_width=True)

#             st.markdown('## 2.3 Residual Analysis')
#             columns = st.columns(2)
#             with columns[0]:
#                 fig = plot_residual_predicted(revenue, model.predict(X_test), X_test)
#                 st.plotly_chart(fig)
#             with columns[1]:
#                 st.empty()
#                 fig = qqplot(revenue, model.predict(X_test))
#                 st.plotly_chart(fig)
#             with columns[0]:
#                 fig = residual_distribution(revenue, model.predict(X_test))
#                 st.pyplot(fig)

#     value = False
#     if st.checkbox('Save this model to tune', key='build_rc_cb'):
#         mod_name = st.text_input('Enter model name')
#         if len(mod_name) > 0:
#             st.session_state['Model'][mod_name] = {"Model_object": model,
#                                                    'feature_set': st.session_state['features_set'],
#                                                    'X_train': X_train}
#             st.session_state['X_train'] = X_train
#             st.session_state['X_test'] = X_test
#             st.session_state['y_train'] = y_train
#             st.session_state['y_test'] = y_test
#             with open("best_models.pkl", "wb") as f:
#                 pickle.dump(st.session_state['Model'], f)
#             st.success('Model saved! Proceed to the next page to tune it.')
#             value = False
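
# Note: the commented-out test-data expander above refits MinMaxScaler on the
# test split. The conventional alternative is to fit the scaler on the
# training split only and reuse that fit for the test split, as in this
# uncalled sketch (the helper name is illustrative):
def _scale_train_test(X_train, X_test):
    ss = MinMaxScaler().fit(X_train)
    return (pd.DataFrame(ss.transform(X_train), columns=X_train.columns, index=X_train.index),
            pd.DataFrame(ss.transform(X_test), columns=X_test.columns, index=X_test.index))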