BlendMMM committed on
Commit 9d7bf1d
1 Parent(s): 94bbd2b

Upload 4 files

dump/1_Transformations.py ADDED
@@ -0,0 +1,449 @@
+ import streamlit as st
+ import pandas as pd
+ import plotly.express as px
+ import plotly.graph_objects as go
+ from Eda_functions import format_numbers
+ import numpy as np
+ import pickle
+ from st_aggrid import AgGrid, GridOptionsBuilder, GridUpdateMode
+ from utilities import set_header, load_local_css
+ import time
+ import itertools
+ import statsmodels.api as sm
+ import re
+ from sklearn.metrics import mean_absolute_error, r2_score, mean_absolute_percentage_error
+ from sklearn.preprocessing import MinMaxScaler
+ import os
+ import matplotlib.pyplot as plt
+ from statsmodels.stats.outliers_influence import variance_inflation_factor
+ from datetime import datetime
+ import seaborn as sns
+ from Data_prep_functions import *
+
+ st.set_option('deprecation.showPyplotGlobalUse', False)
+
+ st.set_page_config(
+     page_title="Model Build",
+     page_icon=":shark:",
+     layout="wide",
+     initial_sidebar_state='collapsed'
+ )
+
+ load_local_css('styles.css')
+ set_header()
+
+ st.title('1. Build Your Model')
+
+ # media_data=pd.read_csv('Media_data_for_model.csv')
+ media_data = pd.read_csv('Media_data_for_model_dma_level.csv')
+ date = media_data['Date']
+ st.session_state['date'] = date
+ revenue = media_data['Total Approved Accounts - Revenue']
+ media_data.drop(['Total Approved Accounts - Revenue'], axis=1, inplace=True)
+ media_data.drop(['Date'], axis=1, inplace=True)
+ media_data.reset_index(drop=True, inplace=True)
+ media_data.dropna(inplace=True)
+
+ if st.toggle('Apply Transformations on DMA/Panel Level'):
+     dma = st.selectbox('Select the Level of data', [col for col in media_data.columns if col.lower() in ['dma', 'panel']])
+ else:
+     # TODO: aggregate the data on date before transforming at the national level
+     dma = None
+
+ # dma_dict={ dm:media_data[media_data[dma]==dm] for dm in media_data[dma].unique()}
+ # st.write(dma_dict)
+
+ st.markdown('## Select the Range of Transformations')
+ columns = st.columns(2)
+ old_shape = media_data.shape
+
+ if "old_shape" not in st.session_state:
+     st.session_state['old_shape'] = old_shape
+
+ with columns[0]:
+     slider_value_adstock = st.slider('Select Adstock Range (only applied to media)', 0.0, 1.0, (0.2, 0.4), step=0.1, format="%.2f")
+ with columns[1]:
+     slider_value_lag = st.slider('Select Lag Range (applied to media, seasonal, macroeconomic variables)', 1, 7, (1, 3), step=1)
+
+ # with columns[2]:
+ #     slider_value_power=st.slider('Select Power range (only applied to media )',0,4,(1,2),step=1)
+
+ # with columns[1]:
+ #     st.number_input('Select the range of half saturation point ',min_value=1,max_value=5)
+ #     st.number_input('Select the range of ')
+
+ def lag(data, features, lags, dma=None):
+     """Add lagged copies of `features`; with `dma`, lags are computed within each panel.
+     Without `dma`, the data should be aggregated on date."""
+     if dma:
+         transformed_data = pd.concat([data.groupby([dma])[features].shift(lag).add_suffix(f'_lag_{lag}') for lag in lags], axis=1)
+         transformed_data = transformed_data.fillna(method='bfill')
+         return pd.concat([transformed_data, data], axis=1)
+     else:
+         transformed_data = pd.concat([data[features].shift(lag).add_suffix(f'_lag_{lag}') for lag in lags], axis=1)
+         transformed_data = transformed_data.fillna(method='bfill')
+         return pd.concat([transformed_data, data], axis=1)
+
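+ # Illustration (hypothetical column name): with features=['tv_spend'] and lags=[1, 2], lag()
+ # returns the original frame plus 'tv_spend_lag_1' and 'tv_spend_lag_2'; the back-fill only keeps
+ # the first rows from being NaN, so the earliest lags partially mirror the current value there.
+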
+ # adstock
+ def adstock(df, alphas, cutoff, features, dma=None):
+     if dma:
+         transformed_data = pd.DataFrame()
+         for d in df[dma].unique():
+             dma_sub_df = df[df[dma] == d]
+             n = len(dma_sub_df)
+
+             weights = np.array([[[alpha**(i - j) if i >= j and j >= i - cutoff else 0. for j in range(n)] for i in range(n)] for alpha in alphas])
+
+             X = dma_sub_df[features].to_numpy()
+             res = pd.DataFrame(np.hstack(weights @ X),
+                                columns=[f'{col}_adstock_{alpha}' for alpha in alphas for col in features])
+
+             transformed_data = pd.concat([transformed_data, res], axis=0)
+         transformed_data.reset_index(drop=True, inplace=True)
+         return pd.concat([transformed_data, df], axis=1)
+
+     else:
+         n = len(df)
+
+         weights = np.array([[[alpha**(i - j) if i >= j and j >= i - cutoff else 0. for j in range(n)] for i in range(n)] for alpha in alphas])
+
+         X = df[features].to_numpy()
+         res = pd.DataFrame(np.hstack(weights @ X),
+                            columns=[f'{col}_adstock_{alpha}' for alpha in alphas for col in features])
+         return pd.concat([res, df], axis=1)
+
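+ # How the adstock weights work: for each decay rate alpha, weights[i][j] = alpha**(i-j) whenever
+ # j <= i <= j + cutoff, so row i of (weights @ X) is a geometrically decaying sum of the current
+ # and the last `cutoff` periods of media. A minimal sketch with made-up numbers (not app code):
+ #
+ #     x = [100, 0, 0], alpha = 0.5, cutoff = 8
+ #     adstocked x = [100.0, 50.0, 25.0]
+ #
+ # This matrix form agrees with the recursive form a[t] = x[t] + decay * a[t-1] used in
+ # 2_Model_Build_and_Performance.py as long as t stays within the cutoff window.
+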
+ if 'media_data' not in st.session_state:
+     st.session_state['media_data'] = pd.DataFrame()
+
+ variables_to_be_transformed = [col for col in media_data.columns if col.lower() not in ['dma', 'panel']]  # change for buckets
+
+ if st.button('Apply Transformations'):
+     with st.spinner('Applying Transformations'):
+         transformed_data_lag = lag(media_data, features=variables_to_be_transformed, lags=np.arange(slider_value_lag[0], slider_value_lag[1] + 1, 1), dma=dma)
+
+         variables_to_be_transformed = [col for col in list(transformed_data_lag.columns) if col not in ['Date', 'DMA', 'Panel']]  # change for buckets
+
+         transformed_data_adstock = adstock(df=transformed_data_lag, alphas=np.arange(slider_value_adstock[0], slider_value_adstock[1] + 0.1, 0.1), cutoff=8, features=variables_to_be_transformed, dma=dma)
+
+     st.success('Done')
+     st.write(f'old shape {old_shape}, new shape {transformed_data_adstock.shape}')
+     st.write(media_data.head(10))
+     st.write(transformed_data_adstock)
+     st.write(transformed_data_adstock.isnull().sum().sort_values(ascending=False))
+
+ # st.write(dma_dict)
+ # st.session_state['media_data']=media_data
+
+ # with st.spinner('Applying Transformations'):
+ #     time.sleep(2)
+ # st.success("Transformations complete!")
+
+ # if st.session_state['media_data'].shape[1]>old_shape[1]:
+ #     with columns[0]:
+ #         st.write(f'Total no.of variables before transformation: {old_shape[1]}, Total no.of variables after transformation: {st.session_state["media_data"].shape[1]}')
+ #         #st.write(f'Total no.of variables after transformation: {st.session_state["media_data"].shape[1]}')
+
+ # bucket=['paid_search', 'kwai','indicacao','infleux', 'influencer','FB: Level Achieved - Tier 1 Impressions',
+ #         ' FB: Level Achieved - Tier 2 Impressions','paid_social_others',
+ #         ' GA App: Will And Cid Pequena Baixo Risco Clicks',
+ #         'digital_tactic_others',"programmatic"
+ #         ]
+
+ # with columns[1]:
+ #     if st.button('Create Combinations of Variables'):
+ #         top_3_correlated_features=[]
+ #         for col in st.session_state['media_data'].columns[:19]:
+ #             corr_df=pd.concat([st.session_state['media_data'].filter(regex=col),
+ #                                revenue],axis=1).corr()['Total Approved Accounts - Revenue'].iloc[:-1]
+ #             top_3_correlated_features.append(list(corr_df.sort_values(ascending=False).head(2).index))
+ #         flattened_list = [item for sublist in top_3_correlated_features for item in sublist]
+ #         all_features_set={var:[col for col in flattened_list if var in col] for var in bucket}
+ #         channels_all=[values for values in all_features_set.values()]
+ #         st.session_state['combinations'] = list(itertools.product(*channels_all))
+
+ #         # if 'combinations' not in st.session_state:
+ #         #     st.session_state['combinations']=combinations_all
+
+ #         st.session_state['final_selection']=st.session_state['combinations']
+
+ # revenue.reset_index(drop=True,inplace=True)
+ # if 'Model_results' not in st.session_state:
+ #     st.session_state['Model_results']={'Model_object':[],
+ #                                        'Model_iteration':[],
+ #                                        'Feature_set':[],
+ #                                        'MAPE':[],
+ #                                        'R2':[],
+ #                                        'ADJR2':[]
+ #                                        }
+
+ # #if st.button('Build Model'):
+ # if 'iterations' not in st.session_state:
+ #     st.session_state['iterations']=1
+ # save_path = r"Model"
+ # with columns[1]:
+ #     if "final_selection" in st.session_state:
+ #         st.write(f'Total combinations created {format_numbers(len(st.session_state["final_selection"]))}')
+
+ #         st.success('Done')
+ #     if st.checkbox('Build all iterations'):
+ #         iterations=len(st.session_state['final_selection'])
+ #     else:
+ #         iterations = st.number_input('Select the number of iterations to perform', min_value=1, step=100, value=st.session_state['iterations'])
+
+ #     st.session_state['iterations']=iterations
+
+ # st.session_state['media_data']=st.session_state['media_data'].fillna(method='ffill')
+ # if st.button("Build Models"):
+ #     st.markdown('Data Split -- Training Period: May 9th, 2023 - October 5th, 2023, Testing Period: October 6th, 2023 - November 7th, 2023')
+ #     progress_bar = st.progress(0)  # Initialize the progress bar
+ #     #time_remaining_text = st.empty()  # Create an empty space for time remaining text
+ #     start_time = time.time()  # Record the start time
+ #     progress_text = st.empty()
+ #     #time_elapsed_text = st.empty()
+ #     for i, selected_features in enumerate(st.session_state["final_selection"][40000:40000+int(iterations)]):
+ #         df = st.session_state['media_data']
+
+ #         fet = [var for var in selected_features if len(var) > 0]
+ #         X = df[fet]
+ #         y = revenue
+ #         ss = MinMaxScaler()
+ #         X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
+ #         X = sm.add_constant(X)
+ #         X_train=X.iloc[:150]
+ #         X_test=X.iloc[150:]
+ #         y_train=y.iloc[:150]
+ #         y_test=y.iloc[150:]
+
+ #         model = sm.OLS(y_train, X_train).fit()
+ #         # st.write(fet)
+ #         positive_coeff=X.columns
+ #         negative_coeff=[]
+ #         coefficients=model.params.to_dict()
+ #         model_positive=[col for col in coefficients.keys() if coefficients[col]>0]
+ #         # st.write(positive_coeff)
+ #         # st.write(model_positive)
+ #         pvalues=[var for var in list(model.pvalues) if var<=0.06]
+ #         if (len(model_positive)/len(selected_features))>0.9 and (len(pvalues)/len(selected_features))>=0.8:
+
+ #             predicted_values = model.predict(X_train)
+ #             mape = mean_absolute_percentage_error(y_train, predicted_values)
+ #             adjr2 = model.rsquared_adj
+ #             r2 = model.rsquared
+ #             filename = os.path.join(save_path, f"model_{i}.pkl")
+ #             with open(filename, "wb") as f:
+ #                 pickle.dump(model, f)
+ #             # with open(r"C:\Users\ManojP\Documents\MMM\simopt\Model\model.pkl", 'rb') as file:
+ #             #     model = pickle.load(file)
+
+ #             st.session_state['Model_results']['Model_object'].append(filename)
+ #             st.session_state['Model_results']['Model_iteration'].append(i)
+ #             st.session_state['Model_results']['Feature_set'].append(fet)
+ #             st.session_state['Model_results']['MAPE'].append(mape)
+ #             st.session_state['Model_results']['R2'].append(r2)
+ #             st.session_state['Model_results']['ADJR2'].append(adjr2)
+
+ #         current_time = time.time()
+ #         time_taken = current_time - start_time
+ #         time_elapsed_minutes = time_taken / 60
+ #         completed_iterations_text = f"{i + 1}/{iterations}"
+ #         progress_bar.progress((i + 1) / int(iterations))
+ #         progress_text.text(f'Completed iterations: {completed_iterations_text}, Time Elapsed (min): {time_elapsed_minutes:.2f}')
+
+ #     st.write(f'Out of {st.session_state["iterations"]} iterations : {len(st.session_state["Model_results"]["Model_object"])} valid models')
+ #     pd.DataFrame(st.session_state['Model_results']).to_csv('model_output.csv')
+
+ # def to_percentage(value):
+ #     return f'{value * 100:.1f}%'
+
+ # st.title('2. Select Models')
+ # if 'tick' not in st.session_state:
+ #     st.session_state['tick']=False
+ # if st.checkbox('Show results of top 10 models (based on MAPE and Adj. R2)',value=st.session_state['tick']):
+ #     st.session_state['tick']=True
+ #     st.write('Select one model iteration to generate performance metrics for it:')
+ #     data=pd.DataFrame(st.session_state['Model_results'])
+ #     data.sort_values(by=['MAPE'],ascending=False,inplace=True)
+ #     data.drop_duplicates(subset='Model_iteration',inplace=True)
+ #     top_10=data.head(10)
+ #     top_10['Rank']=np.arange(1,len(top_10)+1,1)
+ #     top_10[['MAPE','R2','ADJR2']]=np.round(top_10[['MAPE','R2','ADJR2']],4).applymap(to_percentage)
+ #     top_10_table = top_10[['Rank','Model_iteration','MAPE','ADJR2','R2']]
+ #     #top_10_table.columns=[['Rank','Model Iteration Index','MAPE','Adjusted R2','R2']]
+ #     gd=GridOptionsBuilder.from_dataframe(top_10_table)
+ #     gd.configure_pagination(enabled=True)
+ #     gd.configure_selection(use_checkbox=True)
+
+ #     gridoptions=gd.build()
+
+ #     table = AgGrid(top_10,gridOptions=gridoptions,update_mode=GridUpdateMode.SELECTION_CHANGED)
+
+ #     selected_rows=table.selected_rows
+ #     # if st.session_state["selected_rows"] != selected_rows:
+ #     #     st.session_state["build_rc_cb"] = False
+ #     st.session_state["selected_rows"] = selected_rows
+ #     if 'Model' not in st.session_state:
+ #         st.session_state['Model']={}
+
+ #     if len(selected_rows)>0:
+ #         st.header('2.1 Results Summary')
+
+ #         model_object=data[data['Model_iteration']==selected_rows[0]['Model_iteration']]['Model_object']
+ #         features_set=data[data['Model_iteration']==selected_rows[0]['Model_iteration']]['Feature_set']
+
+ #         with open(str(model_object.values[0]), 'rb') as file:
+ #             model = pickle.load(file)
+ #         st.write(model.summary())
+ #         st.header('2.2 Actual vs. Predicted Plot')
+
+ #         df=st.session_state['media_data']
+ #         X=df[features_set.values[0]]
+ #         X = sm.add_constant(X)
+ #         y=revenue
+ #         X_train=X.iloc[:150]
+ #         X_test=X.iloc[150:]
+ #         y_train=y.iloc[:150]
+ #         y_test=y.iloc[150:]
+ #         ss = MinMaxScaler()
+ #         X_train = pd.DataFrame(ss.fit_transform(X_train), columns=X_train.columns)
+ #         st.session_state['X']=X_train
+ #         st.session_state['features_set']=features_set.values[0]
+
+ #         metrics_table,line,actual_vs_predicted_plot=plot_actual_vs_predicted(date, y_train, model.predict(X_train), model,target_column='Revenue')
+
+ #         st.plotly_chart(actual_vs_predicted_plot,use_container_width=True)
+
+ #         st.markdown('## 2.3 Residual Analysis')
+ #         columns=st.columns(2)
+ #         with columns[0]:
+ #             fig=plot_residual_predicted(y_train,model.predict(X_train),X_train)
+ #             st.plotly_chart(fig)
+
+ #         with columns[1]:
+ #             st.empty()
+ #             fig = qqplot(y_train,model.predict(X_train))
+ #             st.plotly_chart(fig)
+
+ #         with columns[0]:
+ #             fig=residual_distribution(y_train,model.predict(X_train))
+ #             st.pyplot(fig)
+
+ #         vif_data = pd.DataFrame()
+ #         # X=X.drop('const',axis=1)
+ #         vif_data["Variable"] = X_train.columns
+ #         vif_data["VIF"] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
+ #         vif_data.sort_values(by=['VIF'],ascending=False,inplace=True)
+ #         vif_data=np.round(vif_data)
+ #         vif_data['VIF']=vif_data['VIF'].astype(float)
+ #         st.header('2.4 Variance Inflation Factor (VIF)')
+ #         #st.dataframe(vif_data)
+ #         color_mapping = {
+ #             'darkgreen': (vif_data['VIF'] < 3),
+ #             'orange': (vif_data['VIF'] >= 3) & (vif_data['VIF'] <= 10),
+ #             'darkred': (vif_data['VIF'] > 10)
+ #         }
+
+ #         # Create a horizontal bar plot
+ #         fig, ax = plt.subplots()
+ #         fig.set_figwidth(10)  # Adjust the width of the figure as needed
+
+ #         # Sort the bars by descending VIF values
+ #         vif_data = vif_data.sort_values(by='VIF', ascending=False)
+
+ #         # Iterate through the color mapping and plot bars with corresponding colors
+ #         for color, condition in color_mapping.items():
+ #             subset = vif_data[condition]
+ #             bars = ax.barh(subset["Variable"], subset["VIF"], color=color, label=color)
+
+ #             # Add text annotations on top of the bars
+ #             for bar in bars:
+ #                 width = bar.get_width()
+ #                 ax.annotate(f'{width:}', xy=(width, bar.get_y() + bar.get_height() / 2), xytext=(5, 0),
+ #                             textcoords='offset points', va='center')
+
+ #         # Customize the plot
+ #         ax.set_xlabel('VIF Values')
+ #         #ax.set_title('2.4 Variance Inflation Factor (VIF)')
+ #         #ax.legend(loc='upper right')
+
+ #         # Display the plot in Streamlit
+ #         st.pyplot(fig)
+
+ #         with st.expander('Results Summary Test data'):
+ #             ss = MinMaxScaler()
+ #             X_test = pd.DataFrame(ss.fit_transform(X_test), columns=X_test.columns)
+ #             st.header('2.2 Actual vs. Predicted Plot')
+
+ #             metrics_table,line,actual_vs_predicted_plot=plot_actual_vs_predicted(date, y_test, model.predict(X_test), model,target_column='Revenue')
+
+ #             st.plotly_chart(actual_vs_predicted_plot,use_container_width=True)
+
+ #             st.markdown('## 2.3 Residual Analysis')
+ #             columns=st.columns(2)
+ #             with columns[0]:
+ #                 fig=plot_residual_predicted(revenue,model.predict(X_test),X_test)
+ #                 st.plotly_chart(fig)
+
+ #             with columns[1]:
+ #                 st.empty()
+ #                 fig = qqplot(revenue,model.predict(X_test))
+ #                 st.plotly_chart(fig)
+
+ #             with columns[0]:
+ #                 fig=residual_distribution(revenue,model.predict(X_test))
+ #                 st.pyplot(fig)
+
+ # value=False
+ # if st.checkbox('Save this model to tune',key='build_rc_cb'):
+ #     mod_name=st.text_input('Enter model name')
+ #     if len(mod_name)>0:
+ #         st.session_state['Model'][mod_name]={"Model_object":model,'feature_set':st.session_state['features_set'],'X_train':X_train}
+ #         st.session_state['X_train']=X_train
+ #         st.session_state['X_test']=X_test
+ #         st.session_state['y_train']=y_train
+ #         st.session_state['y_test']=y_test
+ #         with open("best_models.pkl", "wb") as f:
+ #             pickle.dump(st.session_state['Model'], f)
+ #         st.success('Model saved! Proceed to the next page to tune the model')
+ # value=False
dump/1_Transformations_with_panel.py ADDED
@@ -0,0 +1,548 @@
+ import streamlit as st
+ import pandas as pd
+ import plotly.express as px
+ import plotly.graph_objects as go
+ from Eda_functions import format_numbers
+ import numpy as np
+ import pickle
+ from st_aggrid import AgGrid, GridOptionsBuilder, GridUpdateMode
+ from utilities import set_header, load_local_css
+ import time
+ import itertools
+ import statsmodels.api as sm
+ import statsmodels.formula.api as smf
+ import re
+ from sklearn.metrics import mean_absolute_error, r2_score, mean_absolute_percentage_error
+ from sklearn.preprocessing import MinMaxScaler
+ import os
+ import matplotlib.pyplot as plt
+ from statsmodels.stats.outliers_influence import variance_inflation_factor
+ from datetime import datetime
+ import seaborn as sns
+ from Data_prep_functions import *
+
+ st.set_option('deprecation.showPyplotGlobalUse', False)
+
+ def get_random_effects(media_data, panel_col, mdf):
+     random_eff_df = pd.DataFrame(columns=[panel_col, "random_effect"])
+
+     for i, market in enumerate(media_data[panel_col].unique()):
+         print(i, end='\r')
+         intercept = mdf.random_effects[market].values[0]
+         random_eff_df.loc[i, 'random_effect'] = intercept
+         random_eff_df.loc[i, panel_col] = market
+
+     return random_eff_df
+
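+ # In statsmodels' MixedLM, mdf.random_effects maps each group label (here, each market) to its
+ # estimated random intercept; the helper above just flattens that mapping into a DataFrame so it
+ # can be joined onto prediction inputs later.
+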
+ def mdf_predict(X, mdf, random_eff_df):
+     X['fixed_effect'] = mdf.predict(X)
+     merged_df = pd.merge(X[[panel_col, target_col]], random_eff_df, on=panel_col, how='left')
+     X['random_effect'] = merged_df['random_effect']
+     X['pred'] = X['fixed_effect'] + X['random_effect']
+     return X['pred']
+
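+ # A mixed-effects prediction is assembled in two parts: mdf.predict(X) contributes only the
+ # fixed (population-level) effects, and the left join adds the market-specific random intercept.
+ # This assumes every market in X was seen during training; an unseen market would get NaN from
+ # the join and therefore a NaN prediction.
+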
+ st.set_page_config(
+     page_title="Model Build",
+     page_icon=":shark:",
+     layout="wide",
+     initial_sidebar_state='collapsed'
+ )
+
+ load_local_css('styles.css')
+ set_header()
+
+ st.title('1. Build Your Model')
+
+ panel_col = 'markets'  # set the panel column
+ date_col = 'date'
+ target_col = 'total_approved_accounts_revenue'
+
+ media_data = pd.read_csv('upf_data_converted.csv')
+ media_data.columns = [i.lower().replace('-', '').replace(':', '').replace("__", "_") for i in media_data.columns]
+
+ # st.write(media_data.columns)
+ media_data.sort_values(date_col, inplace=True)
+ media_data.reset_index(drop=True, inplace=True)
+
+ date = media_data[date_col]
+ st.session_state['date'] = date
+ revenue = media_data[target_col]
+ media_data.drop([target_col], axis=1, inplace=True)
+ media_data.drop([date_col], axis=1, inplace=True)
+ media_data.reset_index(drop=True, inplace=True)
+
+ if st.toggle('Apply Transformations on DMA/Panel Level'):
+     dma = st.selectbox('Select the Level of data', [col for col in media_data.columns if col.lower() in ['dma', 'panel', 'markets']])
+ else:
+     # TODO: aggregate the data on date before transforming at the national level
+     dma = None
+
+ # dma_dict={ dm:media_data[media_data[dma]==dm] for dm in media_data[dma].unique()}
+ # st.write(dma_dict)
+
+ st.markdown('## Select the Range of Transformations')
+ columns = st.columns(2)
+ old_shape = media_data.shape
+
+ if "old_shape" not in st.session_state:
+     st.session_state['old_shape'] = old_shape
+
+ with columns[0]:
+     slider_value_adstock = st.slider('Select Adstock Range (only applied to media)', 0.0, 1.0, (0.2, 0.4), step=0.1, format="%.2f")
+ with columns[1]:
+     slider_value_lag = st.slider('Select Lag Range (applied to media, seasonal, macroeconomic variables)', 1, 7, (1, 3), step=1)
+
+ # with columns[2]:
+ #     slider_value_power=st.slider('Select Power range (only applied to media )',0,4,(1,2),step=1)
+
+ # with columns[1]:
+ #     st.number_input('Select the range of half saturation point ',min_value=1,max_value=5)
+ #     st.number_input('Select the range of ')
+
+ # Section 1 - Transformations Functions
+ def lag(data, features, lags, dma=None):
+     """Add lagged copies of `features`; with `dma`, lags are computed within each panel.
+     Without `dma`, the data should be aggregated on date."""
+     if dma:
+         transformed_data = pd.concat([data.groupby([dma])[features].shift(lag).add_suffix(f'_lag_{lag}') for lag in lags], axis=1)
+         transformed_data = transformed_data.fillna(method='bfill')
+         return pd.concat([transformed_data, data], axis=1)
+     else:
+         transformed_data = pd.concat([data[features].shift(lag).add_suffix(f'_lag_{lag}') for lag in lags], axis=1)
+         transformed_data = transformed_data.fillna(method='bfill')
+         return pd.concat([transformed_data, data], axis=1)
+
+ # adstock
+ def adstock(df, alphas, cutoff, features, dma=None):
+     # st.write(features)
+     if dma:
+         transformed_data = pd.DataFrame()
+         for d in df[dma].unique():
+             dma_sub_df = df[df[dma] == d]
+             n = len(dma_sub_df)
+
+             weights = np.array([[[alpha**(i - j) if i >= j and j >= i - cutoff else 0. for j in range(n)] for i in range(n)] for alpha in alphas])
+             X = dma_sub_df[features].to_numpy()
+
+             res = pd.DataFrame(np.hstack(weights @ X),
+                                columns=[f'{col}_adstock_{alpha}' for alpha in alphas for col in features])
+
+             transformed_data = pd.concat([transformed_data, res], axis=0)
+         transformed_data.reset_index(drop=True, inplace=True)
+         return pd.concat([transformed_data, df], axis=1)
+
+     else:
+         n = len(df)
+
+         weights = np.array([[[alpha**(i - j) if i >= j and j >= i - cutoff else 0. for j in range(n)] for i in range(n)] for alpha in alphas])
+
+         X = df[features].to_numpy()
+         res = pd.DataFrame(np.hstack(weights @ X),
+                            columns=[f'{col}_adstock_{alpha}' for alpha in alphas for col in features])
+         return pd.concat([res, df], axis=1)
+
+ # Section 2 - Begin Transformations
+
+ if 'media_data' not in st.session_state:
+     st.session_state['media_data'] = pd.DataFrame()
+
+ # variables_to_be_transformed=[col for col in media_data.columns if col.lower() not in ['dma','panel'] ] # change for buckets
+ variables_to_be_transformed = [col for col in media_data.columns if '_clicks' in col.lower() or '_impress' in col.lower()]  # srishti - change
+ # st.write(variables_to_be_transformed)
+ # st.write(media_data[variables_to_be_transformed].dtypes)
+
+ with columns[0]:
+     if st.button('Apply Transformations'):
+         with st.spinner('Applying Transformations'):
+             transformed_data_lag = lag(media_data, features=variables_to_be_transformed, lags=np.arange(slider_value_lag[0], slider_value_lag[1] + 1, 1), dma=dma)
+
+             # variables_to_be_transformed=[col for col in list(transformed_data_lag.columns) if col not in ['Date','DMA','Panel']] #change for buckets
+             variables_to_be_transformed = [col for col in media_data.columns if
+                                            '_clicks' in col.lower() or '_impress' in col.lower()]  # srishti - change
+
+             transformed_data_adstock = adstock(df=transformed_data_lag, alphas=np.arange(slider_value_adstock[0], slider_value_adstock[1], 0.1), cutoff=8, features=variables_to_be_transformed, dma=dma)
+
+             # st.success('Done')
+             st.success("Transformations complete!")
+
+             st.write(f'old shape {old_shape}, new shape {transformed_data_adstock.shape}')
+             # st.write(media_data.head(10))
+             # st.write(transformed_data_adstock.head(10))
+
+             transformed_data_adstock.columns = [c.replace(".", "_") for c in transformed_data_adstock.columns]  # srishti
+             # st.write(transformed_data_adstock.columns)
+             st.session_state['media_data'] = transformed_data_adstock  # srishti
+
+ # with st.spinner('Applying Transformations'):
+ #     time.sleep(2)
+ # st.success("Transformations complete!")
+
+ # if st.session_state['media_data'].shape[1]>old_shape[1]:
+ #     with columns[0]:
+ #         st.write(f'Total no.of variables before transformation: {old_shape[1]}, Total no.of variables after transformation: {st.session_state["media_data"].shape[1]}')
+ #         #st.write(f'Total no.of variables after transformation: {st.session_state["media_data"].shape[1]}')
+
+ # Section 3 - Create combinations
+
+ # bucket=['paid_search', 'kwai','indicacao','infleux', 'influencer','FB: Level Achieved - Tier 1 Impressions',
+ #         ' FB: Level Achieved - Tier 2 Impressions','paid_social_others',
+ #         ' GA App: Will And Cid Pequena Baixo Risco Clicks',
+ #         'digital_tactic_others',"programmatic"
+ #         ]
+
+ # srishti - bucket names changed
+ bucket = ['paid_search', 'kwai', 'indicacao', 'infleux', 'influencer', 'fb_level_achieved_tier_2',
+           'fb_level_achieved_tier_1', 'paid_social_others',
+           'ga_app',
+           'digital_tactic_others', "programmatic"
+           ]
+
+ with columns[1]:
+     if st.button('Create Combinations of Variables'):
+         top_3_correlated_features = []
+         # for col in st.session_state['media_data'].columns[:19]:
+         original_cols = [c for c in st.session_state['media_data'].columns if "_clicks" in c.lower() or "_impressions" in c.lower()]
+         original_cols = [c for c in original_cols if "_lag" not in c.lower() and "_adstock" not in c.lower()]
+         # st.write(original_cols)
+
+         # for col in st.session_state['media_data'].columns[:19]:
+         for col in original_cols:  # srishti - new
+             corr_df = pd.concat([st.session_state['media_data'].filter(regex=col),
+                                  revenue], axis=1).corr()[target_col].iloc[:-1]
+             top_3_correlated_features.append(list(corr_df.sort_values(ascending=False).head(2).index))
+             # st.write(col, top_3_correlated_features)
+         flattened_list = [item for sublist in top_3_correlated_features for item in sublist]
+         # all_features_set={var:[col for col in flattened_list if var in col] for var in bucket}
+         all_features_set = {var: [col for col in flattened_list if var in col] for var in bucket if len([col for col in flattened_list if var in col]) > 0}  # srishti
+
+         channels_all = [values for values in all_features_set.values()]
+         # st.write(channels_all)
+         st.session_state['combinations'] = list(itertools.product(*channels_all))
+         # if 'combinations' not in st.session_state:
+         #     st.session_state['combinations']=combinations_all
+
+         st.session_state['final_selection'] = st.session_state['combinations']
+         st.success('Done')
+         # st.write(f"{len(st.session_state['combinations'])} combinations created")
+
+ revenue.reset_index(drop=True, inplace=True)
+ if 'Model_results' not in st.session_state:
+     st.session_state['Model_results'] = {'Model_object': [],
+                                          'Model_iteration': [],
+                                          'Feature_set': [],
+                                          'MAPE': [],
+                                          'R2': [],
+                                          'ADJR2': []
+                                          }
+
+ def reset_model_result_dct():
+     st.session_state['Model_results'] = {'Model_object': [],
+                                          'Model_iteration': [],
+                                          'Feature_set': [],
+                                          'MAPE': [],
+                                          'R2': [],
+                                          'ADJR2': []
+                                          }
+
+ # if st.button('Build Model'):
+ if 'iterations' not in st.session_state:
+     st.session_state['iterations'] = 0
+ # st.write("1",st.session_state["final_selection"])
+
+ if 'final_selection' not in st.session_state:
+     st.session_state['final_selection'] = False
+
+ save_path = r"Model/"
+ with columns[1]:
+     if st.session_state['final_selection']:
+         st.write(f'Total combinations created {format_numbers(len(st.session_state["final_selection"]))}')
+
+     if st.checkbox('Build all iterations'):
+         iterations = len(st.session_state['final_selection'])
+     else:
+         iterations = st.number_input('Select the number of iterations to perform', min_value=0, step=10, value=st.session_state['iterations'], on_change=reset_model_result_dct)
+     # st.write("iterations=", iterations)
+
+     if st.button('Build Model', on_click=reset_model_result_dct):
+         st.session_state['iterations'] = iterations
+         # st.write("2",st.session_state["final_selection"])
+
+         # Section 4 - Model
+
+         st.session_state['media_data'] = st.session_state['media_data'].fillna(method='ffill')
+         st.markdown('Data Split -- Training Period: May 9th, 2023 - October 5th, 2023, Testing Period: October 6th, 2023 - November 7th, 2023')
+         progress_bar = st.progress(0)  # Initialize the progress bar
+         # time_remaining_text = st.empty()  # Create an empty space for time remaining text
+         start_time = time.time()  # Record the start time
+         progress_text = st.empty()
+         # time_elapsed_text = st.empty()
+
+         # for i, selected_features in enumerate(st.session_state["final_selection"][40000:40000 + int(iterations)]):
+         # st.write(st.session_state["final_selection"])
+         # for i, selected_features in enumerate(st.session_state["final_selection"]):
+         for i, selected_features in enumerate(st.session_state["final_selection"][0:int(iterations)]):  # srishti
+             print("@@@@@@@@@@@@@", i)
+             df = st.session_state['media_data']
+
+             fet = [var for var in selected_features if len(var) > 0]
+             inp_vars_str = " + ".join(fet)  # new
+
+             X = df[fet]
+             y = revenue
+             ss = MinMaxScaler()
+             X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
+             # X = sm.add_constant(X)
+
+             X['total_approved_accounts_revenue'] = revenue  # new
+             X[panel_col] = df[panel_col]
+
+             X_train = X.iloc[:8000]
+             X_test = X.iloc[8000:]
+             y_train = y.iloc[:8000]
+             y_test = y.iloc[8000:]
+
+             print(X_train.shape)
+             # model = sm.OLS(y_train, X_train).fit()
+             md = smf.mixedlm("total_approved_accounts_revenue ~ {}".format(inp_vars_str),
+                              data=X_train[['total_approved_accounts_revenue'] + fet],
+                              groups=X_train[panel_col])
+             mdf = md.fit()
+             predicted_values = mdf.fittedvalues
+
+             # st.write(fet)
+             # positive_coeff=fet
+             # negative_coeff=[]
+
+             coefficients = mdf.fe_params.to_dict()
+             model_positive = [col for col in coefficients.keys() if coefficients[col] > 0]
+             # st.write(positive_coeff)
+             # st.write(model_positive)
+             pvalues = [var for var in list(mdf.pvalues) if var <= 0.06]
+
+             # if (len(model_positive) / len(selected_features)) > 0.9 and (len(pvalues) / len(selected_features)) >= 0.8:
+             if (len(model_positive) / len(selected_features)) > 0 and (len(pvalues) / len(selected_features)) >= 0:  # srishti - changed just for testing, revert later
+                 # predicted_values = model.predict(X_train)
+                 mape = mean_absolute_percentage_error(y_train, predicted_values)
+                 r2 = r2_score(y_train, predicted_values)
+                 adjr2 = 1 - (1 - r2) * (len(y_train) - 1) / (len(y_train) - len(selected_features) - 1)
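+                 # Adjusted R2 computed manually since r2_score has no adjustment:
+                 # adj R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1), with n training rows and
+                 # p = len(selected_features) regressors.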
+
+                 filename = os.path.join(save_path, f"model_{i}.pkl")
+                 with open(filename, "wb") as f:
+                     pickle.dump(mdf, f)
+                 # with open(r"C:\Users\ManojP\Documents\MMM\simopt\Model\model.pkl", 'rb') as file:
+                 #     model = pickle.load(file)
+
+                 st.session_state['Model_results']['Model_object'].append(filename)
+                 st.session_state['Model_results']['Model_iteration'].append(i)
+                 st.session_state['Model_results']['Feature_set'].append(fet)
+                 st.session_state['Model_results']['MAPE'].append(mape)
+                 st.session_state['Model_results']['R2'].append(r2)
+                 st.session_state['Model_results']['ADJR2'].append(adjr2)
+
+             current_time = time.time()
+             time_taken = current_time - start_time
+             time_elapsed_minutes = time_taken / 60
+             completed_iterations_text = f"{i + 1}/{iterations}"
+             progress_bar.progress((i + 1) / int(iterations))
+             progress_text.text(f'Completed iterations: {completed_iterations_text}, Time Elapsed (min): {time_elapsed_minutes:.2f}')
+
+         st.write(f'Out of {st.session_state["iterations"]} iterations : {len(st.session_state["Model_results"]["Model_object"])} valid models')
+         pd.DataFrame(st.session_state['Model_results']).to_csv('model_output.csv')
+
+ def to_percentage(value):
+     return f'{value * 100:.1f}%'
+
+ st.title('2. Select Models')
+ if 'tick' not in st.session_state:
+     st.session_state['tick'] = False
+ if st.checkbox('Show results of top 10 models (based on MAPE and Adj. R2)', value=st.session_state['tick']):
+     st.session_state['tick'] = True
+     st.write('Select one model iteration to generate performance metrics for it:')
+     data = pd.DataFrame(st.session_state['Model_results'])
+     data.sort_values(by=['MAPE'], ascending=True, inplace=True)  # lower MAPE is better, so sort ascending for the top models
+     data.drop_duplicates(subset='Model_iteration', inplace=True)
+     top_10 = data.head(10)
+     top_10['Rank'] = np.arange(1, len(top_10) + 1, 1)
+     top_10[['MAPE', 'R2', 'ADJR2']] = np.round(top_10[['MAPE', 'R2', 'ADJR2']], 4).applymap(to_percentage)
+     top_10_table = top_10[['Rank', 'Model_iteration', 'MAPE', 'ADJR2', 'R2']]
+     # top_10_table.columns=[['Rank','Model Iteration Index','MAPE','Adjusted R2','R2']]
+     gd = GridOptionsBuilder.from_dataframe(top_10_table)
+     gd.configure_pagination(enabled=True)
+     gd.configure_selection(use_checkbox=True)
+
+     gridoptions = gd.build()
+
+     table = AgGrid(top_10, gridOptions=gridoptions, update_mode=GridUpdateMode.SELECTION_CHANGED)
+
+     selected_rows = table.selected_rows
+     # if st.session_state["selected_rows"] != selected_rows:
+     #     st.session_state["build_rc_cb"] = False
+     st.session_state["selected_rows"] = selected_rows
+     if 'Model' not in st.session_state:
+         st.session_state['Model'] = {}
+
+     if len(selected_rows) > 0:
+         st.header('2.1 Results Summary')
+
+         model_object = data[data['Model_iteration'] == selected_rows[0]['Model_iteration']]['Model_object']
+         features_set = data[data['Model_iteration'] == selected_rows[0]['Model_iteration']]['Feature_set']
+
+         with open(str(model_object.values[0]), 'rb') as file:
+             # print(file)
+             model = pickle.load(file)
+         st.write(model.summary())
+         st.header('2.2 Actual vs. Predicted Plot')
+
+         df = st.session_state['media_data']
+         X = df[features_set.values[0]]
+         # X = sm.add_constant(X)
+         y = revenue
+
+         ss = MinMaxScaler()
+         X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
+
+         X['total_approved_accounts_revenue'] = revenue  # new
+         X[panel_col] = df[panel_col]
+
+         X_train = X.iloc[:8000]
+         X_test = X.iloc[8000:]
+         y_train = y.iloc[:8000]
+         y_test = y.iloc[8000:]
+
+         st.session_state['X'] = X_train
+         st.session_state['features_set'] = features_set.values[0]
+
+         metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(date, y_train, model.fittedvalues, model, target_column='Revenue')
+
+         st.plotly_chart(actual_vs_predicted_plot, use_container_width=True)
+
+         random_eff_df = get_random_effects(media_data, panel_col, model)
+
+         st.markdown('## 2.3 Residual Analysis')
+         columns = st.columns(2)
+         with columns[0]:
+             fig = plot_residual_predicted(y_train, model.fittedvalues, X_train)
+             st.plotly_chart(fig)
+
+         with columns[1]:
+             st.empty()
+             fig = qqplot(y_train, model.fittedvalues)
+             st.plotly_chart(fig)
+
+         with columns[0]:
+             fig = residual_distribution(y_train, model.fittedvalues)
+             st.pyplot(fig)
+
+         vif_data = pd.DataFrame()
+         # X=X.drop('const',axis=1)
+         vif_data["Variable"] = X_train.columns
+         vif_data["VIF"] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
+         vif_data.sort_values(by=['VIF'], ascending=False, inplace=True)
+         vif_data = np.round(vif_data)
+         vif_data['VIF'] = vif_data['VIF'].astype(float)
+         st.header('2.4 Variance Inflation Factor (VIF)')
+         # st.dataframe(vif_data)
+         color_mapping = {
+             'darkgreen': (vif_data['VIF'] < 3),
+             'orange': (vif_data['VIF'] >= 3) & (vif_data['VIF'] <= 10),
+             'darkred': (vif_data['VIF'] > 10)
+         }
+
+         # Create a horizontal bar plot
+         fig, ax = plt.subplots()
+         fig.set_figwidth(10)  # Adjust the width of the figure as needed
+
+         # Sort the bars by descending VIF values
+         vif_data = vif_data.sort_values(by='VIF', ascending=False)
+
+         # Iterate through the color mapping and plot bars with corresponding colors
+         for color, condition in color_mapping.items():
+             subset = vif_data[condition]
+             bars = ax.barh(subset["Variable"], subset["VIF"], color=color, label=color)
+
+             # Add text annotations on top of the bars
+             for bar in bars:
+                 width = bar.get_width()
+                 ax.annotate(f'{width:}', xy=(width, bar.get_y() + bar.get_height() / 2), xytext=(5, 0),
+                             textcoords='offset points', va='center')
+
+         # Customize the plot
+         ax.set_xlabel('VIF Values')
+         # ax.set_title('2.4 Variance Inflation Factor (VIF)')
+         # ax.legend(loc='upper right')
+
+         # Display the plot in Streamlit
+         st.pyplot(fig)
+
+         with st.expander('Results Summary Test data'):
+             ss = MinMaxScaler()
+             X_test = pd.DataFrame(ss.fit_transform(X_test), columns=X_test.columns)
+             st.header('2.2 Actual vs. Predicted Plot')
+
+             metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(date, y_test, mdf_predict(X_test, mdf, random_eff_df), model, target_column='Revenue')
+
+             st.plotly_chart(actual_vs_predicted_plot, use_container_width=True)
+
+             st.markdown('## 2.3 Residual Analysis')
+             columns = st.columns(2)
+             with columns[0]:
+                 fig = plot_residual_predicted(revenue, mdf_predict(X_test, mdf, random_eff_df), X_test)
+                 st.plotly_chart(fig)
+
+             with columns[1]:
+                 st.empty()
+                 fig = qqplot(revenue, mdf_predict(X_test, mdf, random_eff_df))
+                 st.plotly_chart(fig)
+
+             with columns[0]:
+                 fig = residual_distribution(revenue, mdf_predict(X_test, mdf, random_eff_df))
+                 st.pyplot(fig)
+
+         value = False
+         if st.checkbox('Save this model to tune', key='build_rc_cb'):
+             mod_name = st.text_input('Enter model name')
+             if len(mod_name) > 0:
+                 st.session_state['Model'][mod_name] = {"Model_object": model, 'feature_set': st.session_state['features_set'], 'X_train': X_train}
+                 st.session_state['X_train'] = X_train
+                 st.session_state['X_test'] = X_test
+                 st.session_state['y_train'] = y_train
+                 st.session_state['y_test'] = y_test
+                 with open("best_models.pkl", "wb") as f:
+                     pickle.dump(st.session_state['Model'], f)
+                 st.success('Model saved! Proceed to the next page to tune the model')
+         value = False
dump/2_Model_Build_and_Performance.py ADDED
@@ -0,0 +1,403 @@
+ import streamlit as st
+ import pandas as pd
+ import plotly.express as px
+ import plotly.graph_objects as go
+ from Eda_functions import format_numbers
+ import numpy as np
+ import pickle
+ from st_aggrid import AgGrid, GridOptionsBuilder, GridUpdateMode
+ from utilities import set_header, load_local_css
+ import time
+ import itertools
+ import statsmodels.api as sm
+ import re
+ from sklearn.metrics import mean_absolute_error, r2_score, mean_absolute_percentage_error
+ from sklearn.preprocessing import MinMaxScaler
+ import os
+ import matplotlib.pyplot as plt
+ from statsmodels.stats.outliers_influence import variance_inflation_factor
+ from datetime import datetime
+ import seaborn as sns
+ from Data_prep_functions import *
+
+ st.set_option('deprecation.showPyplotGlobalUse', False)
+
+ st.set_page_config(
+     page_title="Model Build",
+     page_icon=":shark:",
+     layout="wide",
+     initial_sidebar_state='collapsed'
+ )
+
+ load_local_css('styles.css')
+ set_header()
+
+ st.title('1. Build Your Model')
+
+ # media_data=pd.read_csv('Media_data_for_model.csv')
+ media_data = pd.read_csv('Media_data_for_model_dma_level.csv')
+ date = media_data['Date']
+ st.session_state['date'] = date
+ revenue = media_data['Total Approved Accounts - Revenue']
+ media_data.drop(['Total Approved Accounts - Revenue'], axis=1, inplace=True)
+ media_data.drop(['Date'], axis=1, inplace=True)
+ media_data.reset_index(drop=True, inplace=True)
+
+ dma = st.selectbox('Select the Level of data', [col for col in media_data.columns if col.lower() in ['dma', 'panel']])
+
+ dma_dict = {dm: media_data[media_data[dma] == dm] for dm in media_data[dma].unique()}
+ # st.write(dma_dict)
+
+ st.markdown('## Select the Range of Transformations')
+ columns = st.columns(2)
+ old_shape = media_data.shape
+
+ if "old_shape" not in st.session_state:
+     st.session_state['old_shape'] = old_shape
+
+ with columns[0]:
+     slider_value_adstock = st.slider('Select Adstock Range (only applied to media)', 0.0, 1.0, (0.2, 0.4), step=0.1, format="%.2f")
+ with columns[1]:
+     slider_value_lag = st.slider('Select Lag Range (applied to media, seasonal, macroeconomic variables)', 1, 7, (1, 3), step=1)
+
+ # with columns[2]:
+ #     slider_value_power=st.slider('Select Power range (only applied to media )',0,4,(1,2),step=1)
+
+ # with columns[1]:
+ #     st.number_input('Select the range of half saturation point ',min_value=1,max_value=5)
+ #     st.number_input('Select the range of ')
+
+ def lag(X, features, min_lag=0, max_lag=6):
+     for i in features:
+         for lag in range(min_lag, max_lag + 1):
+             X[f'{i}_lag{lag}'] = X[i].shift(periods=lag)
+     return X.fillna(method='bfill')
+
+ def adstock_variable(X, variable_name, decay):
+     adstock = [0] * len(X[variable_name])
+
+     for t in range(len(X[variable_name])):
+         if t == 0:
+             adstock[t] = X[variable_name][t]
+         else:
+             adstock[t] = X[variable_name][t] + adstock[t - 1] * decay
+     return adstock
+
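+ # adstock_variable carries media effect forward recursively: a[t] = x[t] + decay * a[t-1],
+ # i.e. an infinite geometric memory, whereas the matrix form in 1_Transformations.py truncates
+ # after `cutoff` periods. A made-up example: x = [100, 0, 0] with decay 0.5 gives [100, 50, 25].
+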
+ if 'media_data' not in st.session_state:
+     st.session_state['media_data'] = pd.DataFrame()
+
+ variables_to_be_transformed = [col for col in media_data.columns if col.lower() not in ['dma', 'panel']]
+ # st.write(variables_to_be_transformed)
+ with columns[0]:
+     if st.button('Apply Transformations'):
+         for dm in dma_dict.keys():
+             dma_dict[dm].reset_index(drop=True, inplace=True)
+             dma_dict[dm] = lag(dma_dict[dm], variables_to_be_transformed, min_lag=slider_value_lag[0], max_lag=slider_value_lag[1])
+
+         for dm in dma_dict.keys():
+             for i in dma_dict[dm].drop(['DMA', 'Panel'], axis=1).columns:
+                 for j in np.arange(slider_value_adstock[0], slider_value_adstock[1] + 0.1, 0.1):  # adding adstock
+                     dma_dict[dm][f'{i}_adst.{np.round(j, 2)}'] = adstock_variable(dma_dict[dm], i, j)
+
+         st.write(dma_dict)
+         st.session_state['media_data'] = media_data
+
+         with st.spinner('Applying Transformations'):
+             time.sleep(2)
+         st.success("Transformations complete!")
+
+ if st.session_state['media_data'].shape[1] > old_shape[1]:
+     with columns[0]:
+         st.write(f'Total no.of variables before transformation: {old_shape[1]}, Total no.of variables after transformation: {st.session_state["media_data"].shape[1]}')
+         # st.write(f'Total no.of variables after transformation: {st.session_state["media_data"].shape[1]}')
+
+ bucket = ['paid_search', 'kwai', 'indicacao', 'infleux', 'influencer', 'FB: Level Achieved - Tier 1 Impressions',
+           ' FB: Level Achieved - Tier 2 Impressions', 'paid_social_others',
+           ' GA App: Will And Cid Pequena Baixo Risco Clicks',
+           'digital_tactic_others', "programmatic"
+           ]
+
+ with columns[1]:
+     if st.button('Create Combinations of Variables'):
+         top_3_correlated_features = []
+         for col in st.session_state['media_data'].columns[:19]:
+             corr_df = pd.concat([st.session_state['media_data'].filter(regex=col),
+                                  revenue], axis=1).corr()['Total Approved Accounts - Revenue'].iloc[:-1]
+             top_3_correlated_features.append(list(corr_df.sort_values(ascending=False).head(2).index))
+         flattened_list = [item for sublist in top_3_correlated_features for item in sublist]
+         all_features_set = {var: [col for col in flattened_list if var in col] for var in bucket}
+         channels_all = [values for values in all_features_set.values()]
+         st.session_state['combinations'] = list(itertools.product(*channels_all))
+
+         # if 'combinations' not in st.session_state:
+         #     st.session_state['combinations']=combinations_all
+
+         st.session_state['final_selection'] = st.session_state['combinations']
+
+ revenue.reset_index(drop=True, inplace=True)
+ if 'Model_results' not in st.session_state:
+     st.session_state['Model_results'] = {'Model_object': [],
+                                          'Model_iteration': [],
+                                          'Feature_set': [],
+                                          'MAPE': [],
+                                          'R2': [],
+                                          'ADJR2': []
+                                          }
+
+ # if st.button('Build Model'):
+ if 'iterations' not in st.session_state:
+     st.session_state['iterations'] = 1
+ save_path = r"Model"
+ with columns[1]:
+     if "final_selection" in st.session_state:
+         st.write(f'Total combinations created {format_numbers(len(st.session_state["final_selection"]))}')
+
+         st.success('Done')
+     if st.checkbox('Build all iterations'):
+         iterations = len(st.session_state['final_selection'])
+     else:
+         iterations = st.number_input('Select the number of iterations to perform', min_value=1, step=100, value=st.session_state['iterations'])
+
+     st.session_state['iterations'] = iterations
+
+ st.session_state['media_data'] = st.session_state['media_data'].fillna(method='ffill')
+ if st.button("Build Models"):
+     st.markdown('Data Split -- Training Period: May 9th, 2023 - October 5th, 2023, Testing Period: October 6th, 2023 - November 7th, 2023')
+     progress_bar = st.progress(0)  # Initialize the progress bar
+     # time_remaining_text = st.empty()  # Create an empty space for time remaining text
+     start_time = time.time()  # Record the start time
+     progress_text = st.empty()
+     # time_elapsed_text = st.empty()
+     for i, selected_features in enumerate(st.session_state["final_selection"][40000:40000 + int(iterations)]):
+         df = st.session_state['media_data']
+
+         fet = [var for var in selected_features if len(var) > 0]
+         X = df[fet]
+         y = revenue
+         ss = MinMaxScaler()
+         X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
+         X = sm.add_constant(X)
+         X_train = X.iloc[:150]
+         X_test = X.iloc[150:]
+         y_train = y.iloc[:150]
+         y_test = y.iloc[150:]
+
+         model = sm.OLS(y_train, X_train).fit()
+         # st.write(fet)
+         positive_coeff = X.columns
+         negative_coeff = []
+         coefficients = model.params.to_dict()
+         model_positive = [col for col in coefficients.keys() if coefficients[col] > 0]
+         # st.write(positive_coeff)
+         # st.write(model_positive)
+         pvalues = [var for var in list(model.pvalues) if var <= 0.06]
+         if (len(model_positive) / len(selected_features)) > 0.9 and (len(pvalues) / len(selected_features)) >= 0.8:
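+             # Screen: keep a model only when over 90% of its coefficients are positive
+             # (sign-consistent with media driving revenue) and at least 80% of its p-values
+             # fall at or below the 0.06 cutoff applied above.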
+             predicted_values = model.predict(X_train)
+             mape = mean_absolute_percentage_error(y_train, predicted_values)
+             adjr2 = model.rsquared_adj
+             r2 = model.rsquared
+             filename = os.path.join(save_path, f"model_{i}.pkl")
+             with open(filename, "wb") as f:
+                 pickle.dump(model, f)
+             # with open(r"C:\Users\ManojP\Documents\MMM\simopt\Model\model.pkl", 'rb') as file:
+             #     model = pickle.load(file)
+
+             st.session_state['Model_results']['Model_object'].append(filename)
+             st.session_state['Model_results']['Model_iteration'].append(i)
+             st.session_state['Model_results']['Feature_set'].append(fet)
+             st.session_state['Model_results']['MAPE'].append(mape)
+             st.session_state['Model_results']['R2'].append(r2)
+             st.session_state['Model_results']['ADJR2'].append(adjr2)
+
+         current_time = time.time()
+         time_taken = current_time - start_time
+         time_elapsed_minutes = time_taken / 60
+         completed_iterations_text = f"{i + 1}/{iterations}"
+         progress_bar.progress((i + 1) / int(iterations))
+         progress_text.text(f'Completed iterations: {completed_iterations_text}, Time Elapsed (min): {time_elapsed_minutes:.2f}')
+
+     st.write(f'Out of {st.session_state["iterations"]} iterations : {len(st.session_state["Model_results"]["Model_object"])} valid models')
+     pd.DataFrame(st.session_state['Model_results']).to_csv('model_output.csv')
+
+ def to_percentage(value):
+     return f'{value * 100:.1f}%'
+
+ st.title('2. Select Models')
+ if 'tick' not in st.session_state:
+     st.session_state['tick'] = False
+ if st.checkbox('Show results of top 10 models (based on MAPE and Adj. R2)', value=st.session_state['tick']):
+     st.session_state['tick'] = True
+     st.write('Select one model iteration to generate performance metrics for it:')
+     data = pd.DataFrame(st.session_state['Model_results'])
+     data.sort_values(by=['MAPE'], ascending=True, inplace=True)  # lower MAPE is better, so sort ascending for the top models
+     data.drop_duplicates(subset='Model_iteration', inplace=True)
+     top_10 = data.head(10)
+     top_10['Rank'] = np.arange(1, len(top_10) + 1, 1)
+     top_10[['MAPE', 'R2', 'ADJR2']] = np.round(top_10[['MAPE', 'R2', 'ADJR2']], 4).applymap(to_percentage)
+     top_10_table = top_10[['Rank', 'Model_iteration', 'MAPE', 'ADJR2', 'R2']]
+     # top_10_table.columns=[['Rank','Model Iteration Index','MAPE','Adjusted R2','R2']]
+
+     gd = GridOptionsBuilder.from_dataframe(top_10_table)
+     gd.configure_pagination(enabled=True)
+     gd.configure_selection(use_checkbox=True)
+
+     gridoptions = gd.build()
+
+     table = AgGrid(top_10, gridOptions=gridoptions, update_mode=GridUpdateMode.SELECTION_CHANGED)
+
+     selected_rows = table.selected_rows
+     # if st.session_state["selected_rows"] != selected_rows:
+     #     st.session_state["build_rc_cb"] = False
+     st.session_state["selected_rows"] = selected_rows
+     if 'Model' not in st.session_state:
+         st.session_state['Model'] = {}
+
+     if len(selected_rows) > 0:
+         st.header('2.1 Results Summary')
+
+         model_object = data[data['Model_iteration'] == selected_rows[0]['Model_iteration']]['Model_object']
+         features_set = data[data['Model_iteration'] == selected_rows[0]['Model_iteration']]['Feature_set']
+
+         with open(str(model_object.values[0]), 'rb') as file:
+             model = pickle.load(file)
+         st.write(model.summary())
+         st.header('2.2 Actual vs. Predicted Plot')
+
+         df = st.session_state['media_data']
+         X = df[features_set.values[0]]
+         X = sm.add_constant(X)
+         y = revenue
+         X_train = X.iloc[:150]
+         X_test = X.iloc[150:]
+         y_train = y.iloc[:150]
+         y_test = y.iloc[150:]
+         ss = MinMaxScaler()
+         X_train = pd.DataFrame(ss.fit_transform(X_train), columns=X_train.columns)
+         st.session_state['X'] = X_train
+         st.session_state['features_set'] = features_set.values[0]
+
+         metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(date, y_train, model.predict(X_train), model, target_column='Revenue')
+
+         st.plotly_chart(actual_vs_predicted_plot, use_container_width=True)
+
+         st.markdown('## 2.3 Residual Analysis')
+         columns = st.columns(2)
+         with columns[0]:
+             fig = plot_residual_predicted(y_train, model.predict(X_train), X_train)
+             st.plotly_chart(fig)
+
+         with columns[1]:
+             st.empty()
+             fig = qqplot(y_train, model.predict(X_train))
+             st.plotly_chart(fig)
+
+         with columns[0]:
+             fig = residual_distribution(y_train, model.predict(X_train))
+             st.pyplot(fig)
+
+
+
+ vif_data = pd.DataFrame()
+ # X=X.drop('const',axis=1)
+ vif_data["Variable"] = X_train.columns
+ vif_data["VIF"] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
+ vif_data.sort_values(by=['VIF'],ascending=False,inplace=True)
+ vif_data['VIF']=np.round(vif_data['VIF'].astype(float),2)
+ st.header('2.4 Variance Inflation Factor (VIF)')
+ #st.dataframe(vif_data)
+ # common rule of thumb: VIF < 3 is low, 3-10 is moderate, > 10 signals severe multicollinearity
+ color_mapping = {
+ 'darkgreen': (vif_data['VIF'] < 3),
+ 'orange': (vif_data['VIF'] >= 3) & (vif_data['VIF'] <= 10),
+ 'darkred': (vif_data['VIF'] > 10)
+ }
+
+ # Create a horizontal bar plot (bars already sorted above by descending VIF)
+ fig, ax = plt.subplots()
+ fig.set_figwidth(10) # Adjust the width of the figure as needed
+
+ # Iterate through the color mapping and plot bars with corresponding colors
+ for color, condition in color_mapping.items():
+ subset = vif_data[condition]
+ bars = ax.barh(subset["Variable"], subset["VIF"], color=color, label=color)
+
+ # Add text annotations at the end of the bars
+ for bar in bars:
+ width = bar.get_width()
+ ax.annotate(f'{width:.2f}', xy=(width, bar.get_y() + bar.get_height() / 2), xytext=(5, 0),
+ textcoords='offset points', va='center')
+
+ # Customize the plot
+ ax.set_xlabel('VIF Values')
+ #ax.set_title('2.4 Variance Inflation Factor (VIF)')
+ #ax.legend(loc='upper right')
+
+ # Display the plot in Streamlit
+ st.pyplot(fig)
+
+
+ with st.expander('Results Summary - Test Data'):
+ # reuse the scaler fitted on the training data instead of refitting on the test set
+ X_test = pd.DataFrame(ss.transform(X_test), columns=X_test.columns)
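+ # min-max scaling then applies the training-set fit:
+ # x' = (x - min_train) / (max_train - min_train),
+ # so test rows are scaled on the same basis the model was trained on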
+ st.header('2.2 Actual vs. Predicted Plot')
+
+ metrics_table,line,actual_vs_predicted_plot=plot_actual_vs_predicted(date[150:], y_test, model.predict(X_test), model,target_column='Revenue')
+
+ st.plotly_chart(actual_vs_predicted_plot,use_container_width=True)
+
+ st.markdown('## 2.3 Residual Analysis')
+ columns=st.columns(2)
+ with columns[0]:
+ fig=plot_residual_predicted(y_test,model.predict(X_test),X_test)
+ st.plotly_chart(fig)
+
+ with columns[1]:
+ st.empty()
+ fig = qqplot(y_test,model.predict(X_test))
+ st.plotly_chart(fig)
+
+ with columns[0]:
+ fig=residual_distribution(y_test,model.predict(X_test))
+ st.pyplot(fig)
+
+ value=False
+ if st.checkbox('Save this model to tune',key='build_rc_cb'):
+ mod_name=st.text_input('Enter model name')
+ if len(mod_name)>0:
+ st.session_state['Model'][mod_name]={"Model_object":model,'feature_set':st.session_state['features_set'],'X_train':X_train}
+ st.session_state['X_train']=X_train
+ st.session_state['X_test']=X_test
+ st.session_state['y_train']=y_train
+ st.session_state['y_test']=y_test
+ with open("best_models.pkl", "wb") as f:
+ pickle.dump(st.session_state['Model'], f)
+ st.success('Model saved! Proceed to the next page to tune the model.')
+ value=False
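+ # the saved dictionary is read back on the tuning page, e.g.:
+ # with open("best_models.pkl", "rb") as f:
+ #     model_dict = pickle.load(f)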
dump/3_Model_Tuning.py ADDED
@@ -0,0 +1,197 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np # needed below for np.arange and np.round
+ from Eda_functions import format_numbers
+ import pickle
+ from utilities import set_header,load_local_css
+ import statsmodels.api as sm
+ import re
+ from sklearn.preprocessing import MinMaxScaler
+ import matplotlib.pyplot as plt
+ from statsmodels.stats.outliers_influence import variance_inflation_factor
+ st.set_option('deprecation.showPyplotGlobalUse', False)
+ from Data_prep_functions import *
+
+ st.set_page_config(
+ page_title="Model Tuning",
+ page_icon=":shark:",
+ layout="wide",
+ initial_sidebar_state='collapsed'
+ )
+ load_local_css('styles.css')
+ set_header()
+
+
+ st.title('1. Model Tuning')
+
+
+ if "X_train" not in st.session_state:
+ st.error(
+ "Oops! It seems there are no saved models available. Please build and save a model from the previous page to proceed.")
+ st.stop()
+ X_train=st.session_state['X_train']
+ X_test=st.session_state['X_test']
+ y_train=st.session_state['y_train']
+ y_test=st.session_state['y_test']
+ df=st.session_state['media_data']
+
+ with open("best_models.pkl", 'rb') as file:
+ model_dict= pickle.load(file)
+
+ if 'selected_model' not in st.session_state:
+ st.session_state['selected_model']=0
+
+
+ st.markdown('### 1.1 Event Flags')
+ st.markdown('Helps in quantifying the impact of specific occurrences of events')
+ with st.expander('Apply Event Flags'):
+ st.session_state["selected_model"]=st.selectbox('Select Model to apply flags',model_dict.keys())
+ model =model_dict[st.session_state["selected_model"]]['Model_object']
+ date=st.session_state['date']
+ date=pd.to_datetime(date)
+ X_train =model_dict[st.session_state["selected_model"]]['X_train']
+ features_set= model_dict[st.session_state["selected_model"]]['feature_set']
+
+ col=st.columns(3)
+ min_date=min(date)
+ max_date=max(date)
+ with col[0]:
+ start_date=st.date_input('Select Start Date',min_date,min_value=min_date,max_value=max_date)
+ with col[1]:
+ end_date=st.date_input('Select End Date',max_date,min_value=min_date,max_value=max_date)
+ with col[2]:
+ repeat=st.selectbox('Repeat Annually',['Yes','No'],index=1)
+ repeat = (repeat == 'Yes')
+ # X_train=sm.add_constant(X_train)
+
+ if 'Flags' not in st.session_state:
+ st.session_state['Flags']={}
+
+ met,line_values,fig_flag=plot_actual_vs_predicted(date[:150], y_train, model.predict(X_train), model,flag=(start_date,end_date),repeat_all_years=repeat)
+ st.plotly_chart(fig_flag,use_container_width=True)
+ flag_name=st.text_input('Enter Flag Name',value='f1')
+ if st.button('Update flag'):
+ st.session_state['Flags'][flag_name]=line_values
+ st.success(f'{flag_name} stored')
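+ # line_values is the flag series returned by plot_actual_vs_predicted for the
+ # selected date window (assumption: a 0/1 indicator aligned to the training
+ # dates), stored here so it can be added as a regressor below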
+
+ options=list(st.session_state['Flags'].keys())
+ selected_options = []
+ num_columns = 4
+ num_rows = -(-len(options) // num_columns)
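+ # -(-a // b) is integer ceiling division, e.g. -(-7 // 4) == 2,
+ # so every flag gets a cell in the checkbox grid below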
+
+
+ tick=False
+ if st.checkbox('Select all'):
+ tick=True
+ selected_options = []
+ for row in range(num_rows):
+ cols = st.columns(num_columns)
+ for col in cols:
+ if options:
+ option = options.pop(0)
+ selected = col.checkbox(option,value=tick)
+ if selected:
+ selected_options.append(option)
+
+ st.markdown('### 1.2 Select Parameters to Apply')
+ parameters=st.columns(3)
+ with parameters[0]:
+ Trend=st.checkbox("**Trend**")
+ st.markdown('Helps account for long-term trends or seasonality that could influence advertising effectiveness')
+ with parameters[1]:
+ week_number=st.checkbox('**Week_number**')
+ st.markdown('Assists in detecting and incorporating weekly patterns or seasonality')
+ with parameters[2]:
+ sine_cosine=st.checkbox('**Sine and Cosine Waves**')
+ st.markdown('Helps in capturing cyclical patterns or seasonality in the data')
+ if st.button('Build model with Selected Parameters and Flags'):
+ st.header('2.1 Results Summary')
+ # date=list(df.index)
+ # df = df.reset_index(drop=True)
+ # st.write(df.head(2))
+ # X_train=df[features_set]
+ ss = MinMaxScaler()
+ X_train_tuned = pd.DataFrame(ss.fit_transform(X_train), columns=X_train.columns)
+ X_train_tuned=sm.add_constant(X_train_tuned)
+ for flag in selected_options:
+ X_train_tuned[flag]=st.session_state['Flags'][flag]
+ if Trend:
+ # linear time index to absorb a long-run trend
+ X_train_tuned['Trend']=np.arange(1,len(X_train_tuned)+1,1)
+ # only add the weekday feature when the Week_number box is ticked
+ if week_number:
+ date=pd.to_datetime(date.values)
+ X_train_tuned['Week_number']=date.day_of_week[:150]
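+ # the Sine and Cosine Waves checkbox is otherwise unused; a minimal sketch,
+ # assuming weekly data with an annual (52-week) cycle -- the period and the
+ # column names are illustrative, not from the original code:
+ if sine_cosine:
+ t = np.arange(len(X_train_tuned))
+ X_train_tuned['sine_wave'] = np.sin(2 * np.pi * t / 52)
+ X_train_tuned['cosine_wave'] = np.cos(2 * np.pi * t / 52)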
+ model_tuned = sm.OLS(y_train, X_train_tuned).fit()
+
+ metrics_table,line,actual_vs_predicted_plot=plot_actual_vs_predicted(date[:150], y_train, model.predict(X_train), model,target_column='Revenue')
+ metrics_table_tuned,line,actual_vs_predicted_plot_tuned=plot_actual_vs_predicted(date[:150], y_train, model_tuned.predict(X_train_tuned), model_tuned,target_column='Revenue')
+
+ # st.write(metrics_table)
+ mape=np.round(metrics_table.iloc[0,1],2)
+ r2=np.round(metrics_table.iloc[1,1],2)
+ adjr2=np.round(metrics_table.iloc[2,1],2)
+ mape_tuned=np.round(metrics_table_tuned.iloc[0,1],2)
+ r2_tuned=np.round(metrics_table_tuned.iloc[1,1],2)
+ adjr2_tuned=np.round(metrics_table_tuned.iloc[2,1],2)
+ parameters_=st.columns(3)
+ with parameters_[0]:
+ st.metric('R2',r2_tuned,np.round(r2_tuned-r2,2))
+ with parameters_[1]:
+ st.metric('Adjusted R2',adjr2_tuned,np.round(adjr2_tuned-adjr2,2))
+ with parameters_[2]:
+ st.metric('MAPE',mape_tuned,np.round(mape_tuned-mape,2),'inverse')
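+ # delta_color='inverse' renders a MAPE increase in red and a decrease in green,
+ # since a lower MAPE is better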
+
+ st.header('2.2 Actual vs. Predicted Plot')
+ # show the tuned model so the chart matches the metrics above
+ st.plotly_chart(actual_vs_predicted_plot_tuned,use_container_width=True)
+
+
+
+ st.markdown('## 2.3 Residual Analysis')
+ columns=st.columns(2)
+ with columns[0]:
+ fig=plot_residual_predicted(y_train,model_tuned.predict(X_train_tuned),X_train_tuned)
+ st.plotly_chart(fig)
+
+ with columns[1]:
+ st.empty()
+ fig = qqplot(y_train,model_tuned.predict(X_train_tuned))
+ st.plotly_chart(fig)
+
+ with columns[0]:
+ fig=residual_distribution(y_train,model_tuned.predict(X_train_tuned))
+ st.pyplot(fig)
+
+ # if st.checkbox('Use this model to build response curves',key='123'):
+
+ # raw_data=df[features_set]
+ # columns_raw=[re.split(r"(_lag|_adst)",col)[0] for col in raw_data.columns]
+ # raw_data.columns=columns_raw
+ # columns_media=[col for col in columns_raw if Categorised_data[col]['BB']=='Media']
+ # raw_data=raw_data[columns_media]
+
+ # raw_data['Date']=list(df.index)
+
+ # spends_var=[col for col in df.columns if "spends" in col.lower() and 'adst' not in col.lower() and 'lag' not in col.lower()]
+ # spends_df=df[spends_var]
+ # spends_df['Week']=list(df.index)
+
+
+ # j=0
+ # X1=X.copy()
+ # col=X1.columns
+ # for i in model.params.values:
+ # X1[col[j]]=X1.iloc[:,j]*i
+ # j+=1
+ # contribution_df=X1
+ # contribution_df['Date']=list(df.index)
+ # excel_file='Overview_data.xlsx'
+
+ # with pd.ExcelWriter(excel_file,engine='xlsxwriter') as writer:
+ # raw_data.to_excel(writer,sheet_name='RAW DATA MMM',index=False)
+ # spends_df.to_excel(writer,sheet_name='SPEND INPUT',index=False)
+ # contribution_df.to_excel(writer,sheet_name='CONTRIBUTION MMM')