# NOTE(review): the lines "Spaces:" / "Sleeping" here were Streamlit page-status
# chrome captured during extraction; they are not part of the program source.
'''
MMO Build Sprint 3
date :
changes : capability to tune MixedLM as well as simple LR in the same page
'''
# Standard library
import pickle
import re

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import streamlit as st
from sklearn.preprocessing import MinMaxScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Local
from Eda_functions import format_numbers
from utilities import set_header, load_local_css
from Data_prep_functions import *

st.set_option('deprecation.showPyplotGlobalUse', False)
# Seed the tuning-related session-state slots so later reads never KeyError.
# NOTE: the save step at the bottom of this page stores the fitted model under
# "tuned_model", not "model_tuned" — both keys are seeded here ("model_tuned"
# is kept for backward compatibility with anything that still reads it).
for key in ["model_tuned", "tuned_model", "X_train_tuned", "X_test_tuned",
            "tuned_model_features"]:
    if key not in st.session_state:
        st.session_state[key] = None

st.set_page_config(
    page_title="Model Tuning",
    page_icon=":shark:",
    layout="wide",
    initial_sidebar_state='collapsed'
)

load_local_css('styles.css')
set_header()
# Sprint3: panel-model configuration. These names are read throughout the page.
is_panel = True
panel_col = 'markets'  # column identifying the panel (group) dimension
date_col = 'date'
target_col = 'total_approved_accounts_revenue'

st.title('1. Model Tuning')

# A model must have been built and saved on the previous page before tuning.
if "X_train" not in st.session_state:
    st.error(
        "Oops! It seems there are no saved models available. Please build and save a model from the previous page to proceed.")
    st.stop()

# Pull the training artifacts the previous page stored in session state.
X_train, X_test, y_train, y_test, df = (
    st.session_state[key]
    for key in ('X_train', 'X_test', 'y_train', 'y_test', 'media_data')
)

# Candidate models persisted by the model-build page.
with open("best_models.pkl", 'rb') as file:
    model_dict = pickle.load(file)

if 'selected_model' not in st.session_state:
    st.session_state['selected_model'] = 0
# st.write(model_dict[st.session_state["selected_model"]]['X_train'].columns)
st.markdown('### 1.1 Event Flags')
st.markdown('Helps in quantifying the impact of specific occurrences of events')
with st.expander('Apply Event Flags'):
    # Pick which saved model the flag is built against; the flag series is
    # derived from that model's actual-vs-predicted plot over a date window.
    st.session_state["selected_model"]=st.selectbox('Select Model to apply flags',model_dict.keys())
    model =model_dict[st.session_state["selected_model"]]['Model_object']
    date=st.session_state['date']
    date=pd.to_datetime(date)
    # NOTE(review): this rebinds the module-level X_train (loaded from session
    # state above) to the selected model's training frame — confirm intended.
    X_train =model_dict[st.session_state["selected_model"]]['X_train']
    features_set= model_dict[st.session_state["selected_model"]]['feature_set']

    # Flag window controls: start date, end date, optional annual repeat.
    col=st.columns(3)
    min_date=min(date)
    max_date=max(date)
    with col[0]:
        start_date=st.date_input('Select Start Date',min_date,min_value=min_date,max_value=max_date)
    with col[1]:
        end_date=st.date_input('Select End Date',max_date,min_value=min_date,max_value=max_date)
    with col[2]:
        repeat=st.selectbox('Repeat Annually',['Yes','No'],index=1)
    # Convert the Yes/No selection to a boolean for plot_actual_vs_predicted.
    if repeat =='Yes':
        repeat=True
    else:
        repeat=False
    # X_train=sm.add_constant(X_train)
    # Named flag series are accumulated in session state across reruns.
    if 'Flags' not in st.session_state:
        st.session_state['Flags']={}
    # print("**"*50)
    # print(y_train)
    # print("**"*50)
    # print(model.fittedvalues)
    if is_panel : # Sprint3
        # Panel model: flag series come back as line_values (train) and
        # test_line_values (test) from the helper's flag= window.
        met, line_values, fig_flag = plot_actual_vs_predicted(X_train[date_col], y_train,
                                                              model.fittedvalues, model,
                                                              target_column='Revenue',
                                                              flag=(start_date, end_date),
                                                              repeat_all_years=repeat, is_panel=True)
        st.plotly_chart(fig_flag, use_container_width=True)

        # create flag on test
        met, test_line_values, fig_flag = plot_actual_vs_predicted(X_test[date_col], y_test,
                                                                   st.session_state['pred_test'], model,
                                                                   target_column='Revenue',
                                                                   flag=(start_date, end_date),
                                                                   repeat_all_years=repeat, is_panel=True)
    else :
        # NOTE(review): the non-panel path hard-codes a 150-row train/test
        # split of `date` — verify this matches how X_train/X_test were built.
        met,line_values,fig_flag=plot_actual_vs_predicted(date[:150], y_train, model.predict(X_train), model,flag=(start_date,end_date),repeat_all_years=repeat)
        st.plotly_chart(fig_flag,use_container_width=True)
        met,test_line_values,fig_flag=plot_actual_vs_predicted(date[150:], y_test, model.predict(X_test), model,flag=(start_date,end_date),repeat_all_years=repeat)

    # NOTE(review): this 'f1' default is dead — immediately overwritten by the
    # text input below (which yields "" until the user types a name).
    flag_name='f1'
    flag_name=st.text_input('Enter Flag Name')
    if st.button('Update flag'):
        # Store the train/test flag series under the entered name.
        st.session_state['Flags'][flag_name]= {}
        st.session_state['Flags'][flag_name]['train']=line_values
        st.session_state['Flags'][flag_name]['test']=test_line_values
        # st.write(st.session_state['Flags'][flag_name])
        st.success(f'{flag_name} stored')
# Lay the saved flags out as a grid of checkboxes, num_columns per row, and
# collect the ticked ones into selected_options for the model build below.
options = list(st.session_state['Flags'].keys())
num_columns = 4
num_rows = -(-len(options) // num_columns)  # ceiling division

# "Select all" pre-ticks every flag checkbox on this rerun.
tick = st.checkbox('Select all')

# (Fixed: the original initialised selected_options twice; the first
# assignment was dead code.)
selected_options = []
for row in range(num_rows):
    cols = st.columns(num_columns)
    for col in cols:
        if options:
            option = options.pop(0)
            if col.checkbox(option, value=tick):
                selected_options.append(option)
st.markdown('### 1.2 Select Parameters to Apply')

# Three optional tuning regressors, one per column, each with an explainer.
param_cols = st.columns(3)
with param_cols[0]:
    Trend = st.checkbox("**Trend**")
    st.markdown('Helps account for long-term trends or seasonality that could influence advertising effectiveness')
with param_cols[1]:
    week_number = st.checkbox('**Week_number**')
    st.markdown('Assists in detecting and incorporating weekly patterns or seasonality')
with param_cols[2]:
    sine_cosine = st.checkbox('**Sine and Cosine Waves**')
    st.markdown('Helps in capturing cyclical patterns or seasonality in the data')
if st.button('Build model with Selected Parameters and Flags'):
    # Rebuild the model on a freshly min-max-scaled copy of the selected
    # feature set, augmented with the ticked flags / Trend / Week_number /
    # sine-cosine regressors, then compare metrics against the original model.
    st.header('2.1 Results Summary')
    # date=list(df.index)
    # df = df.reset_index(drop=True)
    # st.write(df.head(2))
    # X_train=df[features_set]
    ss = MinMaxScaler()
    if is_panel == True :
        # Panel data: scale only the model features, then re-attach the
        # unscaled target / date / panel-identifier columns for MixedLM.
        X = X_train[features_set]
        X_train_tuned = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
        X_train_tuned[target_col] = X_train[target_col]
        X_train_tuned[date_col] = X_train[date_col]
        X_train_tuned[panel_col] = X_train[panel_col]

        X = X_test[features_set]
        X_test_tuned = pd.DataFrame(ss.transform(X), columns=X.columns)
        X_test_tuned[target_col] = X_test[target_col]
        X_test_tuned[date_col] = X_test[date_col]
        X_test_tuned[panel_col] = X_test[panel_col]
    else :
        # Simple LR: scale every column and add an intercept term.
        X_train_tuned = pd.DataFrame(ss.fit_transform(X_train), columns=X_train.columns)
        X_train_tuned = sm.add_constant(X_train_tuned)
        X_test_tuned = pd.DataFrame(ss.transform(X_test), columns=X_test.columns)
        X_test_tuned = sm.add_constant(X_test_tuned)

    # Attach each selected event-flag series as an extra regressor column.
    for flag in selected_options:
        X_train_tuned[flag]=st.session_state['Flags'][flag]['train']
        X_test_tuned[flag]=st.session_state['Flags'][flag]['test']

    #test
    # X_train_tuned.to_csv("Test/X_train_tuned_flag.csv",index=False)
    # X_test_tuned.to_csv("Test/X_test_tuned_flag.csv",index=False)

    new_features = features_set
    # print("()()"*20,flag, len(st.session_state['Flags'][flag]))

    if Trend:
        # Sprint3 - group by panel, calculate trend of each panel spearately. Add trend to new feature set
        if is_panel :
            # Train: Trend counts 1..n per panel in date order; remember each
            # panel's endpoint so the test Trend continues the sequence.
            newdata = pd.DataFrame()
            panel_wise_end_point_train = {}
            for panel, groupdf in X_train_tuned.groupby(panel_col):
                groupdf.sort_values(date_col, inplace=True)
                groupdf['Trend'] = np.arange(1, len(groupdf) + 1, 1)
                newdata = pd.concat([newdata, groupdf])
                panel_wise_end_point_train[panel] = len(groupdf)
            X_train_tuned = newdata.copy()

            test_newdata=pd.DataFrame()
            for panel, test_groupdf in X_test_tuned.groupby(panel_col):
                test_groupdf.sort_values(date_col, inplace=True)
                start = panel_wise_end_point_train[panel]+1
                end = start + len(test_groupdf)
                # print("??"*20, panel, len(test_groupdf), len(np.arange(start, end, 1)), start)
                test_groupdf['Trend'] = np.arange(start, end, 1)
                test_newdata = pd.concat([test_newdata, test_groupdf])
            X_test_tuned = test_newdata.copy()
            new_features = new_features + ['Trend']
            # test
            # NOTE(review): these debug CSV writes require a "Test/" directory
            # to exist and run on every build click — consider removing.
            X_test_tuned.to_csv("Test/X_test_tuned_trend.csv", index=False)
            X_train_tuned.to_csv("Test/X_train_tuned_trend.csv", index=False)
            pd.concat([X_train_tuned,X_test_tuned]).sort_values([panel_col, date_col]).to_csv("Test/X_train_test_tuned_trend.csv", index=False)
        else :
            # NOTE(review): off-by-one — the test arange stops at
            # len(train)+len(test) exclusive, yielding len(test)-1 values, so
            # this column assignment raises a length-mismatch ValueError.
            # The stop should be len(X_train_tuned)+len(X_test_tuned)+1.
            # Also note 'Trend' is not appended to new_features on this path.
            X_train_tuned['Trend']=np.arange(1,len(X_train_tuned)+1,1)
            X_test_tuned['Trend'] = np.arange(len(X_train_tuned)+1, len(X_train_tuned)+len(X_test_tuned), 1)

    if week_number :
        # Sprint3 - create weeknumber from date column in xtrain tuned. add week num to new feature set
        if is_panel :
            # NOTE(review): despite the name, this is day-of-week (0-6), not
            # ISO week number. For weekly data every row shares one weekday,
            # hence the nunique()==1 guard below.
            X_train_tuned[date_col] = pd.to_datetime(X_train_tuned[date_col])
            X_train_tuned['Week_number'] = X_train_tuned[date_col].dt.day_of_week
            if X_train_tuned['Week_number'].nunique() == 1 :
                st.write("All dates in the data are of the same week day. Hence Week number can't be used.")
            else :
                X_test_tuned[date_col] = pd.to_datetime(X_test_tuned[date_col])
                X_test_tuned['Week_number'] = X_test_tuned[date_col].dt.day_of_week
                new_features = new_features + ['Week_number']
        else :
            # NOTE(review): pd.to_datetime(...) here yields a DatetimeIndex,
            # which exposes .day_of_week directly — the .dt accessor below
            # raises AttributeError. Should be date.day_of_week[:150] etc.
            date = pd.to_datetime(date.values)
            X_train_tuned['Week_number'] = date.dt.day_of_week[:150]
            X_test_tuned['Week_number'] = date.dt.day_of_week[150:]

    if sine_cosine :
        # Sprint3 - create panel wise sine cosine waves in xtrain tuned. add to new feature set
        if is_panel :
            new_features = new_features + ['sine_wave', 'cosine_wave']
            newdata = pd.DataFrame()
            groups = X_train_tuned.groupby(panel_col)
            frequency = 2 * np.pi / 365  # Adjust the frequency as needed

            # Train: per-panel annual sine/cosine indexed from day 0; remember
            # each panel's length so the test waves continue the phase.
            train_panel_wise_end_point = {}
            for panel, groupdf in groups:
                num_samples = len(groupdf)
                train_panel_wise_end_point[panel] = num_samples
                days_since_start = np.arange(num_samples)
                sine_wave = np.sin(frequency * days_since_start)
                cosine_wave = np.cos(frequency * days_since_start)
                sine_cosine_df = pd.DataFrame({'sine_wave': sine_wave, 'cosine_wave': cosine_wave})
                assert len(sine_cosine_df) == len(groupdf)
                # groupdf = pd.concat([groupdf, sine_cosine_df], axis=1)
                groupdf['sine_wave'] = sine_wave
                groupdf['cosine_wave'] = cosine_wave
                newdata = pd.concat([newdata, groupdf])

            test_groups = X_test_tuned.groupby(panel_col)
            for panel, test_groupdf in test_groups:
                num_samples = len(test_groupdf)
                start = train_panel_wise_end_point[panel]
                days_since_start = np.arange(start, start+num_samples, 1)
                # print("##", panel, num_samples, start, len(np.arange(start, start+num_samples, 1)))
                sine_wave = np.sin(frequency * days_since_start)
                cosine_wave = np.cos(frequency * days_since_start)
                sine_cosine_df = pd.DataFrame({'sine_wave': sine_wave, 'cosine_wave': cosine_wave})
                assert len(sine_cosine_df) == len(test_groupdf)
                # groupdf = pd.concat([groupdf, sine_cosine_df], axis=1)
                test_groupdf['sine_wave'] = sine_wave
                test_groupdf['cosine_wave'] = cosine_wave
                # NOTE(review): test rows are appended into the SAME newdata
                # that holds the train rows, and only X_train_tuned is
                # reassigned below — X_train_tuned ends up with train+test
                # rows while X_test_tuned never receives the wave columns.
                # Test rows likely belong in a separate frame assigned to
                # X_test_tuned.
                newdata = pd.concat([newdata, test_groupdf])
            X_train_tuned = newdata.copy()
        else :
            # Single series: one continuous annual wave across train then test.
            num_samples = len(X_train_tuned)
            frequency = 2 * np.pi / 365  # Adjust the frequency as needed
            days_since_start = np.arange(num_samples)
            sine_wave = np.sin(frequency * days_since_start)
            cosine_wave = np.cos(frequency * days_since_start)
            sine_cosine_df = pd.DataFrame({'sine_wave': sine_wave, 'cosine_wave': cosine_wave})
            # Concatenate the sine and cosine waves with the scaled X DataFrame
            X_train_tuned = pd.concat([X_train_tuned, sine_cosine_df], axis=1)

            test_num_samples = len(X_test_tuned)
            start = num_samples
            days_since_start = np.arange(start, start+test_num_samples, 1)
            sine_wave = np.sin(frequency * days_since_start)
            cosine_wave = np.cos(frequency * days_since_start)
            sine_cosine_df = pd.DataFrame({'sine_wave': sine_wave, 'cosine_wave': cosine_wave})
            # Concatenate the sine and cosine waves with the scaled X DataFrame
            X_test_tuned = pd.concat([X_test_tuned, sine_cosine_df], axis=1)

    # model
    if is_panel :
        # MixedLM with random intercepts per panel. The formula target is the
        # literal string target_col was set to at the top of the page.
        if selected_options :
            new_features = new_features + selected_options
        inp_vars_str = " + ".join(new_features)
        # X_train_tuned.to_csv("Test/X_train_tuned.csv",index=False)
        # st.write(X_train_tuned[['total_approved_accounts_revenue'] + new_features].dtypes)
        # st.write(X_train_tuned[['total_approved_accounts_revenue', panel_col] + new_features].isna().sum())
        md_tuned = smf.mixedlm("total_approved_accounts_revenue ~ {}".format(inp_vars_str),
                               data=X_train_tuned[['total_approved_accounts_revenue'] + new_features],
                               groups=X_train_tuned[panel_col])
        model_tuned = md_tuned.fit()

        # plot act v pred for original model and tuned model
        metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(X_train[date_col], y_train,
                                                                                 model.fittedvalues, model,
                                                                                 target_column='Revenue',
                                                                                 is_panel=True)
        metrics_table_tuned, line, actual_vs_predicted_plot_tuned = plot_actual_vs_predicted(X_train_tuned[date_col],
                                                                                             X_train_tuned[target_col],
                                                                                             model_tuned.fittedvalues,
                                                                                             model_tuned,
                                                                                             target_column='Revenue',
                                                                                             is_panel=True)
    else :
        # Plain OLS on the scaled design matrix (intercept added earlier).
        model_tuned = sm.OLS(y_train, X_train_tuned).fit()
        metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(date[:150], y_train,
                                                                                 model.predict(X_train), model,
                                                                                 target_column='Revenue')
        metrics_table_tuned, line, actual_vs_predicted_plot_tuned = plot_actual_vs_predicted(date[:150], y_train,
                                                                                             model_tuned.predict(
                                                                                                 X_train_tuned),
                                                                                             model_tuned,
                                                                                             target_column='Revenue')

    # st.write(metrics_table_tuned)
    # Metrics tables are assumed to be [MAPE, R2, AdjR2] in rows 0-2, col 1.
    mape=np.round(metrics_table.iloc[0,1],2)
    r2=np.round(metrics_table.iloc[1,1],2)
    adjr2=np.round(metrics_table.iloc[2,1],2)

    mape_tuned=np.round(metrics_table_tuned.iloc[0,1],2)
    r2_tuned=np.round(metrics_table_tuned.iloc[1,1],2)
    adjr2_tuned=np.round(metrics_table_tuned.iloc[2,1],2)

    # Tuned metrics with delta vs. the original model; MAPE delta is inverse
    # (lower is better).
    parameters_=st.columns(3)
    with parameters_[0]:
        st.metric('R2',r2_tuned,np.round(r2_tuned-r2,2))
    with parameters_[1]:
        st.metric('Adjusted R2',adjr2_tuned,np.round(adjr2_tuned-adjr2,2))
    with parameters_[2]:
        st.metric('MAPE',mape_tuned,np.round(mape_tuned-mape,2),'inverse')

    st.header('2.2 Actual vs. Predicted Plot')
    # if is_panel:
    #     metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(date, y_train, model.predict(X_train),
    #                                                                              model, target_column='Revenue',is_panel=True)
    # else:
    #     metrics_table,line,actual_vs_predicted_plot=plot_actual_vs_predicted(date, y_train, model.predict(X_train), model,target_column='Revenue')
    # NOTE(review): is_panel=True is hard-coded here; on the non-panel path
    # X_train_tuned has no date_col/target_col columns, so this would raise.
    metrics_table,line,actual_vs_predicted_plot=plot_actual_vs_predicted(X_train_tuned[date_col], X_train_tuned[target_col],
                                                                         model_tuned.fittedvalues, model_tuned,
                                                                         target_column='Revenue',
                                                                         is_panel=True)
    # plot_actual_vs_predicted(X_train[date_col], y_train,
    #                          model.fittedvalues, model,
    #                          target_column='Revenue',
    #                          is_panel=is_panel)
    st.plotly_chart(actual_vs_predicted_plot,use_container_width=True)

    st.markdown('## 2.3 Residual Analysis')
    # NOTE(review): these diagnostics use the ORIGINAL model and raw X_train,
    # not model_tuned/X_train_tuned — presumably the tuned model's residuals
    # were intended; confirm.
    columns=st.columns(2)
    with columns[0]:
        fig=plot_residual_predicted(y_train,model.predict(X_train),X_train)
        st.plotly_chart(fig)
    with columns[1]:
        st.empty()
        fig = qqplot(y_train,model.predict(X_train))
        st.plotly_chart(fig)
    with columns[0]:
        fig=residual_distribution(y_train,model.predict(X_train))
        st.pyplot(fig)

    # NOTE(review): a checkbox nested inside an st.button body only survives
    # the single rerun triggered by the click; stores under "tuned_model"
    # while the page-top init seeds "model_tuned" — confirm consumers' key.
    if st.checkbox('Use this model to build response curves',key='123'):
        st.session_state["tuned_model"] = model_tuned
        st.session_state["X_train_tuned"] = X_train_tuned
        st.session_state["X_test_tuned"] = X_test_tuned
        # (duplicate assignments kept verbatim from the original)
        st.session_state["X_train_tuned"] = X_train_tuned
        st.session_state["X_test_tuned"] = X_test_tuned
        if is_panel :
            st.session_state["tuned_model_features"] = new_features
        with open("tuned_model.pkl", "wb") as f:
            pickle.dump(st.session_state['tuned_model'], f)
        st.success('Model saved!')
# raw_data=df[features_set] | |
# columns_raw=[re.split(r"(_lag|_adst)",col)[0] for col in raw_data.columns] | |
# raw_data.columns=columns_raw | |
# columns_media=[col for col in columns_raw if Categorised_data[col]['BB']=='Media'] | |
# raw_data=raw_data[columns_media] | |
# raw_data['Date']=list(df.index) | |
# spends_var=[col for col in df.columns if "spends" in col.lower() and 'adst' not in col.lower() and 'lag' not in col.lower()] | |
# spends_df=df[spends_var] | |
# spends_df['Week']=list(df.index) | |
# j=0 | |
# X1=X.copy() | |
# col=X1.columns | |
# for i in model.params.values: | |
# X1[col[j]]=X1.iloc[:,j]*i | |
# j+=1 | |
# contribution_df=X1 | |
# contribution_df['Date']=list(df.index) | |
# excel_file='Overview_data.xlsx' | |
# with pd.ExcelWriter(excel_file,engine='xlsxwriter') as writer: | |
# raw_data.to_excel(writer,sheet_name='RAW DATA MMM',index=False) | |
# spends_df.to_excel(writer,sheet_name='SPEND INPUT',index=False) | |
# contribution_df.to_excel(writer,sheet_name='CONTRIBUTION MMM') | |