Spaces:
Sleeping
Sleeping
import streamlit as st | |
import pandas as pd | |
import plotly.express as px | |
import plotly.graph_objects as go | |
from Eda_functions import format_numbers | |
import numpy as np | |
import pickle | |
from st_aggrid import AgGrid | |
from st_aggrid import GridOptionsBuilder,GridUpdateMode | |
from utilities import set_header,load_local_css | |
from st_aggrid import GridOptionsBuilder | |
import time | |
import itertools | |
import statsmodels.api as sm | |
import numpy as npc | |
import re | |
import itertools | |
from sklearn.metrics import mean_absolute_error, r2_score,mean_absolute_percentage_error | |
from sklearn.preprocessing import MinMaxScaler | |
import os | |
import matplotlib.pyplot as plt | |
from statsmodels.stats.outliers_influence import variance_inflation_factor | |
st.set_option('deprecation.showPyplotGlobalUse', False) | |
import statsmodels.api as sm | |
import statsmodels.formula.api as smf | |
from datetime import datetime | |
import seaborn as sns | |
from Data_prep_functions import * | |
def get_random_effects(media_data, panel_col, mdf): | |
random_eff_df = pd.DataFrame(columns=[panel_col, "random_effect"]) | |
for i, market in enumerate(media_data[panel_col].unique()): | |
print(i, end='\r') | |
intercept = mdf.random_effects[market].values[0] | |
random_eff_df.loc[i, 'random_effect'] = intercept | |
random_eff_df.loc[i, panel_col] = market | |
return random_eff_df | |
def mdf_predict(X, mdf, random_eff_df) : | |
X['fixed_effect'] = mdf.predict(X) | |
merged_df=pd.merge(X[[panel_col,target_col]], random_eff_df, on = panel_col, how = 'left') | |
X['random_effect'] = merged_df['random_effect'] | |
X['pred'] = X['fixed_effect'] + X['random_effect'] | |
return X['pred'] | |
st.set_page_config( | |
page_title="Model Build", | |
page_icon=":shark:", | |
layout="wide", | |
initial_sidebar_state='collapsed' | |
) | |
load_local_css('styles.css') | |
set_header() | |
st.title('1. Build Your Model') | |
panel_col= 'markets' # set the panel column | |
date_col = 'date' | |
target_col = 'total_approved_accounts_revenue' | |
media_data=pd.read_csv('upf_data_converted.csv') | |
media_data.columns=[i.lower().replace('-','').replace(':','').replace("__", "_") for i in media_data.columns] | |
# st.write(media_data.columns) | |
media_data.sort_values(date_col, inplace=True) | |
media_data.reset_index(drop=True,inplace=True) | |
date=media_data[date_col] | |
st.session_state['date']=date | |
revenue=media_data[target_col] | |
media_data.drop([target_col],axis=1,inplace=True) | |
media_data.drop([date_col],axis=1,inplace=True) | |
media_data.reset_index(drop=True,inplace=True) | |
if st.toggle('Apply Transformations on DMA/Panel Level'): | |
dma=st.selectbox('Select the Level of data ',[ col for col in media_data.columns if col.lower() in ['dma','panel', 'markets']]) | |
else: | |
#""" code to aggregate data on date """ | |
dma=None | |
# dma_dict={ dm:media_data[media_data[dma]==dm] for dm in media_data[dma].unique()} | |
# st.write(dma_dict) | |
st.markdown('## Select the Range of Transformations') | |
columns = st.columns(2) | |
old_shape=media_data.shape | |
if "old_shape" not in st.session_state: | |
st.session_state['old_shape']=old_shape | |
with columns[0]: | |
slider_value_adstock = st.slider('Select Adstock Range (only applied to media)', 0.0, 1.0, (0.2, 0.4), step=0.1, format="%.2f") | |
with columns[1]: | |
slider_value_lag = st.slider('Select Lag Range (applied to media, seasonal, macroeconomic variables)', 1, 7, (1, 3), step=1) | |
# with columns[2]: | |
# slider_value_power=st.slider('Select Power range (only applied to media )',0,4,(1,2),step=1) | |
# with columns[1]: | |
# st.number_input('Select the range of half saturation point ',min_value=1,max_value=5) | |
# st.number_input('Select the range of ') | |
# Section 1 - Transformations Functions | |
def lag(data,features,lags,dma=None): | |
if dma: | |
transformed_data=pd.concat([data.groupby([dma])[features].shift(lag).add_suffix(f'_lag_{lag}') for lag in lags],axis=1) | |
transformed_data=transformed_data.fillna(method='bfill') | |
return pd.concat([transformed_data,data],axis=1) | |
else: | |
#''' data should be aggregated on date''' | |
transformed_data=pd.concat([data[features].shift(lag).add_suffix(f'_lag_{lag}') for lag in lags],axis=1) | |
transformed_data=transformed_data.fillna(method='bfill') | |
return pd.concat([transformed_data,data],axis=1) | |
#adstock | |
def adstock(df, alphas, cutoff, features,dma=None): | |
# st.write(features) | |
if dma: | |
transformed_data=pd.DataFrame() | |
for d in df[dma].unique(): | |
dma_sub_df = df[df[dma] == d] | |
n = len(dma_sub_df) | |
weights = np.array([[[alpha**(i-j) if i >= j and j >= i-cutoff else 0. for j in range(n)] for i in range(n)] for alpha in alphas]) | |
X = dma_sub_df[features].to_numpy() | |
res = pd.DataFrame(np.hstack(weights @ X), | |
columns=[f'{col}_adstock_{alpha}' for alpha in alphas for col in features]) | |
transformed_data=pd.concat([transformed_data,res],axis=0) | |
transformed_data.reset_index(drop=True,inplace=True) | |
return pd.concat([transformed_data,df],axis=1) | |
else: | |
n = len(df) | |
weights = np.array([[[alpha**(i-j) if i >= j and j >= i-cutoff else 0. for j in range(n)] for i in range(n)] for alpha in alphas]) | |
X = df[features].to_numpy() | |
res = pd.DataFrame(np.hstack(weights @ X), | |
columns=[f'{col}_adstock_{alpha}' for alpha in alphas for col in features]) | |
return pd.concat([res,df],axis=1) | |
# Section 2 - Begin Transformations | |
if 'media_data' not in st.session_state: | |
st.session_state['media_data']=pd.DataFrame() | |
# variables_to_be_transformed=[col for col in media_data.columns if col.lower() not in ['dma','panel'] ] # change for buckets | |
variables_to_be_transformed=[col for col in media_data.columns if '_clicks' in col.lower() or '_impress' in col.lower()] # srishti - change | |
# st.write(variables_to_be_transformed) | |
# st.write(media_data[variables_to_be_transformed].dtypes) | |
with columns[0]: | |
if st.button('Apply Transformations'): | |
with st.spinner('Applying Transformations'): | |
transformed_data_lag=lag(media_data,features=variables_to_be_transformed,lags=np.arange(slider_value_lag[0],slider_value_lag[1]+1,1),dma=dma) | |
# variables_to_be_transformed=[col for col in list(transformed_data_lag.columns) if col not in ['Date','DMA','Panel']] #change for buckets | |
variables_to_be_transformed = [col for col in media_data.columns if | |
'_clicks' in col.lower() or '_impress' in col.lower()] # srishti - change | |
transformed_data_adstock=adstock(df=transformed_data_lag, alphas=np.arange(slider_value_adstock[0],slider_value_adstock[1],0.1), cutoff=8, features=variables_to_be_transformed,dma=dma) | |
# st.success('Done') | |
st.success("Transformations complete!") | |
st.write(f'old shape {old_shape}, new shape {transformed_data_adstock.shape}') | |
# st.write(media_data.head(10)) | |
# st.write(transformed_data_adstock.head(10)) | |
transformed_data_adstock.columns = [c.replace(".","_") for c in transformed_data_adstock.columns] # srishti | |
# st.write(transformed_data_adstock.columns) | |
st.session_state['media_data']=transformed_data_adstock # srishti | |
# with st.spinner('Applying Transformations'): | |
# time.sleep(2) | |
# st.success("Transformations complete!") | |
# if st.session_state['media_data'].shape[1]>old_shape[1]: | |
# with columns[0]: | |
# st.write(f'Total no.of variables before transformation: {old_shape[1]}, Total no.of variables after transformation: {st.session_state["media_data"].shape[1]}') | |
#st.write(f'Total no.of variables after transformation: {st.session_state["media_data"].shape[1]}') | |
# Section 3 - Create combinations | |
# bucket=['paid_search', 'kwai','indicacao','infleux', 'influencer','FB: Level Achieved - Tier 1 Impressions', | |
# ' FB: Level Achieved - Tier 2 Impressions','paid_social_others', | |
# ' GA App: Will And Cid Pequena Baixo Risco Clicks', | |
# 'digital_tactic_others',"programmatic" | |
# ] | |
# srishti - bucket names changed | |
bucket=['paid_search', 'kwai','indicacao','infleux', 'influencer','fb_level_achieved_tier_2', | |
'fb_level_achieved_tier_1','paid_social_others', | |
'ga_app', | |
'digital_tactic_others',"programmatic" | |
] | |
with columns[1]: | |
if st.button('Create Combinations of Variables'): | |
top_3_correlated_features=[] | |
# for col in st.session_state['media_data'].columns[:19]: | |
original_cols = [c for c in st.session_state['media_data'].columns if "_clicks" in c.lower() or "_impressions" in c.lower()] | |
original_cols = [c for c in original_cols if "_lag" not in c.lower() and "_adstock" not in c.lower()] | |
# st.write(original_cols) | |
# for col in st.session_state['media_data'].columns[:19]: | |
for col in original_cols: # srishti - new | |
corr_df=pd.concat([st.session_state['media_data'].filter(regex=col), | |
revenue],axis=1).corr()[target_col].iloc[:-1] | |
top_3_correlated_features.append(list(corr_df.sort_values(ascending=False).head(2).index)) | |
# st.write(col, top_3_correlated_features) | |
flattened_list = [item for sublist in top_3_correlated_features for item in sublist] | |
# all_features_set={var:[col for col in flattened_list if var in col] for var in bucket} | |
all_features_set={var:[col for col in flattened_list if var in col] for var in bucket if len([col for col in flattened_list if var in col])>0} # srishti | |
channels_all=[values for values in all_features_set.values()] | |
# st.write(channels_all) | |
st.session_state['combinations'] = list(itertools.product(*channels_all)) | |
# if 'combinations' not in st.session_state: | |
# st.session_state['combinations']=combinations_all | |
st.session_state['final_selection']=st.session_state['combinations'] | |
st.success('Done') | |
# st.write(f"{len(st.session_state['combinations'])} combinations created") | |
revenue.reset_index(drop=True,inplace=True) | |
if 'Model_results' not in st.session_state: | |
st.session_state['Model_results']={'Model_object':[], | |
'Model_iteration':[], | |
'Feature_set':[], | |
'MAPE':[], | |
'R2':[], | |
'ADJR2':[] | |
} | |
def reset_model_result_dct(): | |
st.session_state['Model_results']={'Model_object':[], | |
'Model_iteration':[], | |
'Feature_set':[], | |
'MAPE':[], | |
'R2':[], | |
'ADJR2':[] | |
} | |
# if st.button('Build Model'): | |
if 'iterations' not in st.session_state: | |
st.session_state['iterations']=0 | |
# st.write("1",st.session_state["final_selection"]) | |
if 'final_selection' not in st.session_state: | |
st.session_state['final_selection']=False | |
save_path = r"Model/" | |
with columns[1]: | |
if st.session_state['final_selection']: | |
st.write(f'Total combinations created {format_numbers(len(st.session_state["final_selection"]))}') | |
if st.checkbox('Build all iterations'): | |
iterations=len(st.session_state['final_selection']) | |
else: | |
iterations = st.number_input('Select the number of iterations to perform', min_value=0, step=10, value=st.session_state['iterations'],on_change=reset_model_result_dct) | |
# st.write("iterations=", iterations) | |
if st.button('Build Model',on_click=reset_model_result_dct): | |
st.session_state['iterations']=iterations | |
# st.write("2",st.session_state["final_selection"]) | |
# Section 4 - Model | |
st.session_state['media_data']=st.session_state['media_data'].fillna(method='ffill') | |
st.markdown( | |
'Data Split -- Training Period: May 9th, 2023 - October 5th,2023 , Testing Period: October 6th, 2023 - November 7th, 2023 ') | |
progress_bar = st.progress(0) # Initialize the progress bar | |
# time_remaining_text = st.empty() # Create an empty space for time remaining text | |
start_time = time.time() # Record the start time | |
progress_text = st.empty() | |
# time_elapsed_text = st.empty() | |
# for i, selected_features in enumerate(st.session_state["final_selection"][40000:40000 + int(iterations)]): | |
# st.write(st.session_state["final_selection"]) | |
# for i, selected_features in enumerate(st.session_state["final_selection"]): | |
for i, selected_features in enumerate(st.session_state["final_selection"][0:int(iterations)]): # srishti | |
print("@@@@@@@@@@@@@",i) | |
df = st.session_state['media_data'] | |
fet = [var for var in selected_features if len(var) > 0] | |
inp_vars_str = " + ".join(fet) # new | |
X = df[fet] | |
y = revenue | |
ss = MinMaxScaler() | |
X = pd.DataFrame(ss.fit_transform(X), columns=X.columns) | |
# X = sm.add_constant(X) | |
X['total_approved_accounts_revenue'] = revenue # new | |
X[panel_col] = df[panel_col] | |
X_train = X.iloc[:8000] | |
X_test = X.iloc[8000:] | |
y_train = y.iloc[:8000] | |
y_test = y.iloc[8000:] | |
print(X_train.shape) | |
# model = sm.OLS(y_train, X_train).fit() | |
md = smf.mixedlm("total_approved_accounts_revenue ~ {}".format(inp_vars_str), | |
data=X_train[['total_approved_accounts_revenue'] + fet], | |
groups=X_train[panel_col]) | |
mdf = md.fit() | |
predicted_values = mdf.fittedvalues | |
# st.write(fet) | |
# positive_coeff=fet | |
# negetive_coeff=[] | |
coefficients = mdf.fe_params.to_dict() | |
model_possitive = [col for col in coefficients.keys() if coefficients[col] > 0] | |
# st.write(positive_coeff) | |
# st.write(model_possitive) | |
pvalues = [var for var in list(mdf.pvalues) if var <= 0.06] | |
# if (len(model_possitive) / len(selected_features)) > 0.9 and (len(pvalues) / len(selected_features)) >= 0.8: | |
if (len(model_possitive) / len(selected_features)) > 0 and (len(pvalues) / len(selected_features)) >= 0: # srishti - changed just for testing, revert later | |
# predicted_values = model.predict(X_train) | |
mape = mean_absolute_percentage_error(y_train, predicted_values) | |
r2 = r2_score(y_train, predicted_values) | |
adjr2 = 1 - (1 - r2) * (len(y_train) - 1) / (len(y_train) - len(selected_features) - 1) | |
filename = os.path.join(save_path, f"model_{i}.pkl") | |
with open(filename, "wb") as f: | |
pickle.dump(mdf, f) | |
# with open(r"C:\Users\ManojP\Documents\MMM\simopt\Model\model.pkl", 'rb') as file: | |
# model = pickle.load(file) | |
st.session_state['Model_results']['Model_object'].append(filename) | |
st.session_state['Model_results']['Model_iteration'].append(i) | |
st.session_state['Model_results']['Feature_set'].append(fet) | |
st.session_state['Model_results']['MAPE'].append(mape) | |
st.session_state['Model_results']['R2'].append(r2) | |
st.session_state['Model_results']['ADJR2'].append(adjr2) | |
current_time = time.time() | |
time_taken = current_time - start_time | |
time_elapsed_minutes = time_taken / 60 | |
completed_iterations_text = f"{i + 1}/{iterations}" | |
progress_bar.progress((i + 1) / int(iterations)) | |
progress_text.text(f'Completed iterations: {completed_iterations_text},Time Elapsed (min): {time_elapsed_minutes:.2f}') | |
st.write(f'Out of {st.session_state["iterations"]} iterations : {len(st.session_state["Model_results"]["Model_object"])} valid models') | |
pd.DataFrame(st.session_state['Model_results']).to_csv('model_output.csv') | |
def to_percentage(value): | |
return f'{value * 100:.1f}%' | |
st.title('2. Select Models') | |
if 'tick' not in st.session_state: | |
st.session_state['tick']=False | |
if st.checkbox('Show results of top 10 models (based on MAPE and Adj. R2)',value=st.session_state['tick']): | |
st.session_state['tick']=True | |
st.write('Select one model iteration to generate performance metrics for it:') | |
data=pd.DataFrame(st.session_state['Model_results']) | |
data.sort_values(by=['MAPE'],ascending=False,inplace=True) | |
data.drop_duplicates(subset='Model_iteration',inplace=True) | |
top_10=data.head(10) | |
top_10['Rank']=np.arange(1,len(top_10)+1,1) | |
top_10[['MAPE','R2','ADJR2']]=np.round(top_10[['MAPE','R2','ADJR2']],4).applymap(to_percentage) | |
top_10_table = top_10[['Rank','Model_iteration','MAPE','ADJR2','R2']] | |
#top_10_table.columns=[['Rank','Model Iteration Index','MAPE','Adjusted R2','R2']] | |
gd=GridOptionsBuilder.from_dataframe(top_10_table) | |
gd.configure_pagination(enabled=True) | |
gd.configure_selection(use_checkbox=True) | |
gridoptions=gd.build() | |
table = AgGrid(top_10,gridOptions=gridoptions,update_mode=GridUpdateMode.SELECTION_CHANGED) | |
selected_rows=table.selected_rows | |
# if st.session_state["selected_rows"] != selected_rows: | |
# st.session_state["build_rc_cb"] = False | |
st.session_state["selected_rows"] = selected_rows | |
if 'Model' not in st.session_state: | |
st.session_state['Model']={} | |
if len(selected_rows)>0: | |
st.header('2.1 Results Summary') | |
model_object=data[data['Model_iteration']==selected_rows[0]['Model_iteration']]['Model_object'] | |
features_set=data[data['Model_iteration']==selected_rows[0]['Model_iteration']]['Feature_set'] | |
with open(str(model_object.values[0]), 'rb') as file: | |
# print(file) | |
model = pickle.load(file) | |
st.write(model.summary()) | |
st.header('2.2 Actual vs. Predicted Plot') | |
df=st.session_state['media_data'] | |
X=df[features_set.values[0]] | |
# X = sm.add_constant(X) | |
y=revenue | |
ss = MinMaxScaler() | |
X = pd.DataFrame(ss.fit_transform(X), columns=X.columns) | |
X['total_approved_accounts_revenue'] = revenue # new | |
X[panel_col] = df[panel_col] | |
X_train=X.iloc[:8000] | |
X_test=X.iloc[8000:] | |
y_train=y.iloc[:8000] | |
y_test=y.iloc[8000:] | |
st.session_state['X']=X_train | |
st.session_state['features_set']=features_set.values[0] | |
metrics_table,line,actual_vs_predicted_plot=plot_actual_vs_predicted(date, y_train, model.fittedvalues, model,target_column='Revenue') | |
st.plotly_chart(actual_vs_predicted_plot,use_container_width=True) | |
random_eff_df = get_random_effects(media_data, panel_col, model) | |
st.markdown('## 2.3 Residual Analysis') | |
columns=st.columns(2) | |
with columns[0]: | |
fig=plot_residual_predicted(y_train,model.fittedvalues,X_train) | |
st.plotly_chart(fig) | |
with columns[1]: | |
st.empty() | |
fig = qqplot(y_train,model.fittedvalues) | |
st.plotly_chart(fig) | |
with columns[0]: | |
fig=residual_distribution(y_train,model.fittedvalues) | |
st.pyplot(fig) | |
vif_data = pd.DataFrame() | |
# X=X.drop('const',axis=1) | |
vif_data["Variable"] = X_train.columns | |
vif_data["VIF"] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])] | |
vif_data.sort_values(by=['VIF'],ascending=False,inplace=True) | |
vif_data=np.round(vif_data) | |
vif_data['VIF']=vif_data['VIF'].astype(float) | |
st.header('2.4 Variance Inflation Factor (VIF)') | |
#st.dataframe(vif_data) | |
color_mapping = { | |
'darkgreen': (vif_data['VIF'] < 3), | |
'orange': (vif_data['VIF'] >= 3) & (vif_data['VIF'] <= 10), | |
'darkred': (vif_data['VIF'] > 10) | |
} | |
# Create a horizontal bar plot | |
fig, ax = plt.subplots() | |
fig.set_figwidth(10) # Adjust the width of the figure as needed | |
# Sort the bars by descending VIF values | |
vif_data = vif_data.sort_values(by='VIF', ascending=False) | |
# Iterate through the color mapping and plot bars with corresponding colors | |
for color, condition in color_mapping.items(): | |
subset = vif_data[condition] | |
bars = ax.barh(subset["Variable"], subset["VIF"], color=color, label=color) | |
# Add text annotations on top of the bars | |
for bar in bars: | |
width = bar.get_width() | |
ax.annotate(f'{width:}', xy=(width, bar.get_y() + bar.get_height() / 2), xytext=(5, 0), | |
textcoords='offset points', va='center') | |
# Customize the plot | |
ax.set_xlabel('VIF Values') | |
#ax.set_title('2.4 Variance Inflation Factor (VIF)') | |
#ax.legend(loc='upper right') | |
# Display the plot in Streamlit | |
st.pyplot(fig) | |
with st.expander('Results Summary Test data'): | |
ss = MinMaxScaler() | |
X_test = pd.DataFrame(ss.fit_transform(X_test), columns=X_test.columns) | |
st.header('2.2 Actual vs. Predicted Plot') | |
metrics_table,line,actual_vs_predicted_plot=plot_actual_vs_predicted(date, y_test, mdf_predict(X_test,mdf, random_eff_df), model,target_column='Revenue') | |
st.plotly_chart(actual_vs_predicted_plot,use_container_width=True) | |
st.markdown('## 2.3 Residual Analysis') | |
columns=st.columns(2) | |
with columns[0]: | |
fig=plot_residual_predicted(revenue,mdf_predict(X_test,mdf, random_eff_df),X_test) | |
st.plotly_chart(fig) | |
with columns[1]: | |
st.empty() | |
fig = qqplot(revenue,mdf_predict(X_test,mdf, random_eff_df)) | |
st.plotly_chart(fig) | |
with columns[0]: | |
fig=residual_distribution(revenue,mdf_predict(X_test,mdf, random_eff_df)) | |
st.pyplot(fig) | |
value=False | |
if st.checkbox('Save this model to tune',key='build_rc_cb'): | |
mod_name=st.text_input('Enter model name') | |
if len(mod_name)>0: | |
st.session_state['Model'][mod_name]={"Model_object":model,'feature_set':st.session_state['features_set'],'X_train':X_train} | |
st.session_state['X_train']=X_train | |
st.session_state['X_test']=X_test | |
st.session_state['y_train']=y_train | |
st.session_state['y_test']=y_test | |
with open("best_models.pkl", "wb") as f: | |
pickle.dump(st.session_state['Model'], f) | |
st.success('Model saved!, Proceed next page to tune the model') | |
value=False | |