# Mastercard / 10_Model_Build_draft.py
# BlendMMM's picture
# Upload 81 files
# 94bbd2b verified
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from Eda_functions import format_numbers,line_plot,summary
import numpy as np
from Transformation_functions import check_box
from Transformation_functions import apply_lag,apply_adstock,top_correlated_feature
import pickle
from st_aggrid import AgGrid
from st_aggrid import GridOptionsBuilder,GridUpdateMode
from utilities import set_header,initialize_data,load_local_css
from st_aggrid import GridOptionsBuilder
import time
import itertools
import statsmodels.api as sm
import numpy as np
import re
import itertools
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_percentage_error
from PIL import Image
import os
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Page bootstrap: switch off the pyplot global-use deprecation option,
# configure the Streamlit page, then call the `utilities` helpers that load
# the local stylesheet and render the header.
st.set_option('deprecation.showPyplotGlobalUse', False)

_PAGE_SETTINGS = {
    'page_title': "Model Build",
    'page_icon': ":shark:",
    'layout': "wide",
    'initial_sidebar_state': 'collapsed',
}
st.set_page_config(**_PAGE_SETTINGS)

load_local_css('styles.css')
set_header()
# logo = Image.open("Full_Logo_Blue.png")
# # Set the logo size
# logo = logo.resize((100, 100))
# st.image(logo)
# st.markdown("""
# <style>
# .logo {
# position: absolute;
# top: 10px;
# right: 10px;
# }
# </style>
# """,unsafe_allow_html=True)
# st.image(logo, use_column_width=True, top=0.95, right=0.05)
# Use CSS to position the logo in the top right corner
# st.write(
# """
# <style>
# .logo {
# position: absolute;
# top: 10px;
# right: 10px;
# }
# </style>
# """
# )
st.title('Model Build')


def _read_pickle(path):
    """Return the object unpickled from the file at *path*."""
    # NOTE(review): unpickling can execute arbitrary code; these files are
    # presumably written by earlier pages of this app -- confirm they are
    # never user-supplied.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Inputs for this page, read from pickle files in the working directory.
filtered_variables = _read_pickle("filtered_variables.pkl")
Categorised_data = _read_pickle('Categorised_data.pkl')
target_column = _read_pickle("target_column.pkl")
df = _read_pickle("df.pkl")

# Make sure the slot holding the generated feature combinations exists.
if 'final_selection' not in st.session_state:
    st.session_state['final_selection'] = None

# In this section `keywords` is only referenced by the commented-out code
# below, which dropped every column containing one of the keywords.
keywords = ['Digital (Impressions)', 'Streaming (Impressions)']
#drop_columns = [col for col in df.columns if any(keyword in col for keyword in keywords)]
#df.drop(drop_columns,axis=1,inplace=True)
# Build every candidate feature set and store it in
# st.session_state['final_selection'] as a list of flat lists of column names.
# An empty string inside a feature set means "no variable from this group".
if st.button('Create all Possibile combinations of Variables'):
    with st.spinner('Wait for it'):
        # Holiday groups may contribute any subset of their variables,
        # including none: replace each group's options by all its subsets.
        holiday_groups = [
            col for col in filtered_variables.keys()
            if Categorised_data[col]['VB'] == 'Holiday'
        ]
        for var in holiday_groups:
            options = filtered_variables[var]
            subsets = [
                list(comb)
                for r in range(1, len(options) + 1)
                for comb in itertools.combinations(options, r)
            ]
            subsets.append([])
            filtered_variables[var] = subsets

        # Bucket ('VB') of every dataframe column, looked up by the column
        # name with any _adst/_lag suffix stripped.
        column_buckets = [
            (col, Categorised_data[re.split(r'_adst|_lag', col)[0]]['VB'])
            for col in df.columns
        ]
        price = [col for col, bucket in column_buckets if bucket == 'Price']
        Distribution = [col for col, bucket in column_buckets if bucket == 'Distribution']
        Promotion = [col for col, bucket in column_buckets if bucket == 'Promotion']

        # Temp fix kept from the original: these two columns are always
        # offered as price options and never as promotion options. Unlike
        # list.remove(), the filtering below does not raise when a name is
        # missing, and a name is not appended a second time if it is already
        # a price option.
        forced_price = ("Non Promo Price", 'Promo Price')
        for name in forced_price:
            if name not in price:
                price.append(name)
        Promotion = [col for col in Promotion if col not in forced_price]

        # '' lets a combination leave price / distribution out entirely.
        price.append('')
        Distribution.append('')

        filtered_variables['Price'] = price
        filtered_variables['Distribution'] = Distribution
        filtered_variables['Promotion'] = Promotion

        # One pick per group; iterate the product lazily so the full set of
        # tuples is not held in memory next to the flattened result.
        final_selection = []
        for combo in itertools.product(*filtered_variables.values()):
            flat = []
            for part in combo:
                # Holiday picks are lists of columns, other picks are single names.
                if isinstance(part, list):
                    flat.extend(part)
                else:
                    flat.append(part)
            final_selection.append(flat)

        st.session_state['final_selection'] = final_selection
    st.success('Done')
    st.write(f'Total combinations created {format_numbers(len(final_selection))}')
# Create the per-session results store on first run. Each key maps to a list;
# the model-building code appends one entry to every list per accepted model.
if 'Model_results' not in st.session_state:
    _result_fields = ('Model_object', 'Model_iteration', 'Feature_set', 'MAPE', 'R2', 'ADJR2')
    st.session_state['Model_results'] = {field: [] for field in _result_fields}
#if st.button('Build Model'):
# NOTE(review): machine-specific, hard-coded folder for the pickled models --
# consider making it configurable.
save_path = r"C:\Users\ManojP\Documents\MMM\simopt\Model"
iterations = st.number_input('Select the number of iterations to perform', min_value=1, step=1, value=1)

# Buckets ('VB') whose coefficients must be positive / negative for a model
# to be accepted.
# NOTE(review): "Promotion TV" and "OOH Radio" are single strings; they may
# have been meant as separate buckets ("Promotion", "TV", "OOH", "Radio") --
# confirm against the values stored in Categorised_data.
_POSITIVE_BUCKETS = ["Distribution","Promotion TV" ,"Display", "Video" ,"Facebook", "Twitter" ,"Instagram" ,"Pintrest", "YouTube" ,"Paid Search" ,"OOH Radio" ,"Audio Streaming",'Digital']
_NEGATIVE_BUCKETS = ["Price"]


def _bucket_of(column):
    """Return the 'VB' bucket of *column*, ignoring any _adst/_lag suffix."""
    return Categorised_data[re.split(r'_adst|_lag', column)[0]]['VB']


if st.button("Build Model"):
    candidate_sets = st.session_state["final_selection"]
    if not candidate_sets:
        # 'final_selection' stays None until the combinations have been
        # created; slicing it would raise a TypeError.
        st.warning('No variable combinations found. Create the combinations of variables first.')
    else:
        # The model files are written below, so the folder has to exist.
        os.makedirs(save_path, exist_ok=True)

        progress_bar = st.progress(0)  # Initialize the progress bar
        start_time = time.time()  # Record the start time
        progress_text = st.empty()

        # Start every build from an empty store: iteration ids and file names
        # repeat between builds, so appending would duplicate rows and
        # inflate the "valid models" count reported at the end.
        results = st.session_state['Model_results']
        for stored in results.values():
            stored.clear()

        # Fit on a positionally indexed copy so the module-level `df` keeps
        # its original index, which the analysis section uses as the date axis.
        model_df = df.reset_index(drop=True)
        # Same target column that the analysis section evaluates against.
        y = model_df[target_column]

        for i, selected_features in enumerate(candidate_sets[:int(iterations)]):
            # '' entries are "group left out" placeholders, not columns.
            fet = [var for var in selected_features if len(var) > 0]

            if fet:  # nothing to fit when every group was left out
                features = model_df[fet]
                X = pd.DataFrame(MinMaxScaler().fit_transform(features), columns=features.columns)
                X = sm.add_constant(X)
                model = sm.OLS(y, X).fit()

                coefficients = model.params.to_dict()
                positive_terms = {name for name, value in coefficients.items() if value > 0}
                negative_terms = {name for name, value in coefficients.items() if value < 0}
                must_be_positive = {col for col in fet if _bucket_of(col) in _POSITIVE_BUCKETS}
                must_be_negative = {col for col in fet if _bucket_of(col) in _NEGATIVE_BUCKETS}

                # Acceptance rule kept as in the original: the numerator
                # counts every term with p <= 0.06 (intercept included) and
                # the denominator counts all picks, '' placeholders included.
                significant = sum(1 for p_value in model.pvalues if p_value <= 0.06)
                accepted = (
                    must_be_positive <= positive_terms
                    and must_be_negative <= negative_terms
                    and significant / len(selected_features) >= 0.5
                )

                if accepted:
                    predicted_values = model.predict(X)
                    mape = mean_absolute_percentage_error(y, predicted_values)

                    filename = os.path.join(save_path, f"model_{i}.pkl")
                    with open(filename, "wb") as f:
                        pickle.dump(model, f)

                    # Only the path is stored; the model is re-loaded on demand.
                    results['Model_object'].append(filename)
                    results['Model_iteration'].append(i)
                    results['Feature_set'].append(fet)
                    results['MAPE'].append(mape)
                    results['R2'].append(model.rsquared)
                    results['ADJR2'].append(model.rsquared_adj)

            time_elapsed_minutes = (time.time() - start_time) / 60
            completed_iterations_text = f"{i + 1}/{iterations}"
            progress_bar.progress((i + 1) / int(iterations))
            progress_text.text(f'Completed iterations: {completed_iterations_text} Time Elapsed (min): {time_elapsed_minutes:.2f}')

        st.write(f'Out of {iterations} iterations : {len(st.session_state["Model_results"]["Model_object"])} valid models')
def to_percentage(value):
    """Format a fraction as a percentage string with one decimal place.

    For example, 0.1234 becomes '12.3%'.
    """
    scaled = value * 100
    return '{:.1f}%'.format(scaled)
st.title('Analysis of Results')
if st.checkbox('Show Results of Top 10 Models'):
    st.write('Click on the Row to Generate Model Result')
    data = pd.DataFrame(st.session_state['Model_results'])

    if data.empty:
        st.info('No valid models available yet. Build the models first.')
    else:
        # Lower MAPE means a better fit, so the best models come first.
        data.sort_values(by=['MAPE'], ascending=True, inplace=True)

        # Copy so the display formatting below never touches `data`.
        top_10 = data.head(10).copy()
        # Fewer than ten models may exist, so number the rows actually present.
        top_10['Row_number'] = np.arange(1, len(top_10) + 1)
        metric_columns = ['MAPE', 'R2', 'ADJR2']
        top_10[metric_columns] = np.round(top_10[metric_columns], 4).apply(
            lambda column: column.map(to_percentage)
        )

        gd = GridOptionsBuilder.from_dataframe(top_10[['Row_number', 'MAPE', 'R2', 'ADJR2', 'Model_iteration']])
        gd.configure_pagination(enabled=True)
        gd.configure_selection(use_checkbox=True)
        gridoptions = gd.build()
        table = AgGrid(top_10, gridOptions=gridoptions, update_mode=GridUpdateMode.SELECTION_CHANGED)

        # NOTE(review): this indexes `selected_rows` as a list of row dicts;
        # confirm that matches the installed st_aggrid version.
        selected_rows = table.selected_rows
        if selected_rows is not None and len(selected_rows) > 0:
            st.header('Model Summary')
            chosen = data[data['Model_iteration'] == selected_rows[0]['Model_iteration']]
            model_object = chosen['Model_object']
            features_set = chosen['Feature_set']

            # 'Model_object' holds the path of the pickled fitted model.
            with open(str(model_object.values[0]), 'rb') as file:
                model = pickle.load(file)
            st.write(model.summary())
def plot_actual_vs_predicted(date, y, predicted_values, model):
    """Build the actual-vs-predicted line chart and a table of fit metrics.

    Args:
        date: x-axis values shared by both lines.
        y: actual target values.
        predicted_values: model predictions for the same rows.
        model: fitted model exposing ``rsquared`` and ``rsquared_adj``.

    Returns:
        A ``(metrics_table, fig)`` tuple: a DataFrame with the MAPE,
        R-squared and adjusted R-squared, and the Plotly figure. The y-axis
        title is taken from the module-level ``target_column``.
    """
    line_specs = (
        ('Actual', y, '#08083B'),
        ('Predicted', predicted_values, '#11B6BD'),
    )
    fig = go.Figure()
    for label, values, colour in line_specs:
        fig.add_trace(go.Scatter(x=date, y=values, mode='lines', name=label, line=dict(color=colour)))

    fig.update_layout(
        xaxis=dict(title='Date'),
        yaxis=dict(title=target_column),
        xaxis_tickangle=-30,
    )

    metric_values = [
        mean_absolute_percentage_error(y, predicted_values),
        model.rsquared,
        model.rsquared_adj,
    ]
    metrics_table = pd.DataFrame({
        'Metric': ['MAPE', 'R-squared', 'AdjR-squared'],
        'Value': metric_values,
    })
    return metrics_table, fig
# NOTE(review): SOURCE lost its indentation; these statements use `model` and
# `features_set`, so they presumably belong inside the selected-row branch --
# confirm when re-indenting.

# Capture the index for the chart's x-axis before switching the frame to a
# positional index.
date = list(df.index)
df = df.reset_index(drop=True)

# Rebuild the design matrix of the selected model: min-max scale its feature
# columns (the scaler is fitted again here) and add the constant column.
chosen_columns = df[features_set.values[0]]
ss = MinMaxScaler()
scaled_values = ss.fit_transform(chosen_columns)
X = sm.add_constant(pd.DataFrame(scaled_values, columns=chosen_columns.columns))

metrics_table, fig = plot_actual_vs_predicted(date, df[target_column], model.predict(X), model)
st.plotly_chart(fig, use_container_width=True)
def plot_residual_predicted(actual, predicted, df_):
    """Scatter the standardized residuals against the predicted values.

    Args:
        actual: actual target values.
        predicted: predicted values; wrapped in a ``pd.Series`` and
            subtracted from ``actual`` with pandas index alignment.
        df_: frame the residual columns are attached to for plotting. It is
            not modified: the columns are added to a copy.

    Returns:
        Plotly figure with a dashed zero line and solid lines at +/-2.
    """
    # assign() returns a new frame, so the caller's DataFrame does not gain
    # 'Residuals' / 'StdResidual' columns as a side effect of plotting.
    plot_df = df_.assign(Residuals=actual - pd.Series(predicted))
    residuals = plot_df['Residuals']
    plot_df['StdResidual'] = (residuals - residuals.mean()) / residuals.std()

    # Create a Plotly scatter plot
    fig = px.scatter(plot_df, x=predicted, y='StdResidual', opacity=0.5)

    # Reference lines: zero and the +/-2 standard-deviation band.
    fig.add_hline(y=0, line_dash="dash", line_color="darkorange")
    fig.add_hline(y=2, line_color="red")
    fig.add_hline(y=-2, line_color="red")

    fig.update_xaxes(title='Predicted')
    fig.update_yaxes(title='Standardized Residuals (Actual - Predicted)')

    # Fixed size so the figure matches the other residual charts.
    fig.update_layout(title='Residuals over Predicted values', autosize=False, width=600, height=400)
    return fig
def residual_distribution(actual, predicted):
    """Return a histogram of the residuals (``actual - predicted``).

    Args:
        actual: actual target values.
        predicted: predicted values; wrapped in a ``pd.Series`` and
            subtracted from ``actual`` with pandas index alignment.

    Returns:
        Plotly figure with one probability-normalised histogram trace.
    """
    residuals = actual - pd.Series(predicted)

    fig = go.Figure()
    # One trace only: the original added the identical histogram a second
    # time, which drew every bar twice.
    fig.add_trace(go.Histogram(x=residuals, name='Residuals', histnorm='probability',
                               marker_color="#11B6BD"))
    # showlegend keeps the 'Residuals' legend entry visible with a single trace.
    fig.update_layout(title='Distribution of Residuals', title_x=0.5, autosize=False,
                      width=600, height=400, showlegend=True)
    return fig
def qqplot(actual, predicted):
    """Return a QQ plot of the standardized residuals.

    Args:
        actual: actual target values.
        predicted: predicted values; wrapped in a ``pd.Series`` and
            subtracted from ``actual`` with pandas index alignment.

    Returns:
        Plotly figure with the sample-vs-theoretical quantile markers and a
        red 45-degree reference line.
    """
    residuals = pd.Series(actual - pd.Series(predicted))
    standardized = (residuals - residuals.mean()) / residuals.std()

    # Build the probability plot once and reuse it for both axes.
    prob_plot = sm.ProbPlot(standardized)
    theoretical = prob_plot.theoretical_quantiles
    sample = prob_plot.sample_quantiles

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=theoretical,
                             y=sample,
                             mode='markers',
                             marker=dict(size=5, color="#11B6BD"),
                             name='QQ Plot'))

    # 45-degree reference line. It always covers [-2, 2] as before and is
    # extended when the theoretical quantiles reach further out.
    low = min(-2.0, float(np.min(theoretical)))
    high = max(2.0, float(np.max(theoretical)))
    fig.add_trace(go.Scatter(
        x=[low, high],
        y=[low, high],
        mode='lines',
        line=dict(color='red'),
        name=' '
    ))

    fig.update_layout(title='QQ Plot of Residuals', title_x=0.5, autosize=False, width=600, height=400,
                      xaxis_title='Theoretical Quantiles', yaxis_title='Sample Quantiles')
    return fig
# NOTE(review): SOURCE lost its indentation; these statements use `model` and
# `X`, so they presumably belong inside the selected-row branch -- confirm
# when re-indenting.
st.markdown('## Residual Analysis')

# Predict once and reuse the result for all three residual charts.
actual_values = df[target_column]
predictions = model.predict(X)

columns = st.columns(2)
with columns[0]:
    fig = plot_residual_predicted(actual_values, predictions, df)
    st.plotly_chart(fig)
with columns[1]:
    st.empty()
    fig = qqplot(actual_values, predictions)
    st.plotly_chart(fig)
with columns[0]:
    fig = residual_distribution(actual_values, predictions)
    st.plotly_chart(fig)

# Variance inflation factors. They are computed on the design matrix the
# model was fitted with (constant column included) so each auxiliary
# regression has an intercept; the constant itself is not reported.
design_values = X.values
feature_positions = [
    (position, name) for position, name in enumerate(X.columns) if name != 'const'
]
vif_data = pd.DataFrame({
    "Variable": [name for _, name in feature_positions],
    "VIF": [variance_inflation_factor(design_values, position) for position, _ in feature_positions],
})
# As before, X is left without its constant column afterwards; a missing
# 'const' column is tolerated instead of raising KeyError.
X = X.drop(columns='const', errors='ignore')
vif_data.sort_values(by=['VIF'], ascending=False, inplace=True)
st.dataframe(vif_data)