# Mastercard / Model_Results_Pretrained_copy.py
# Hosting-page metadata captured with this file: uploaded by BlendMMM
# ("Upload 81 files"), commit 94bbd2b (verified), size 15.6 kB.
import plotly.express as px
import numpy as np
import plotly.graph_objects as go
import streamlit as st
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import mean_absolute_percentage_error
import sys
import os
from utilities import (set_header,
load_local_css,
load_authenticator)
import seaborn as sns
import matplotlib.pyplot as plt
import sweetviz as sv
import tempfile
from sklearn.preprocessing import MinMaxScaler
from st_aggrid import AgGrid
from st_aggrid import GridOptionsBuilder,GridUpdateMode
from st_aggrid import GridOptionsBuilder
import sys
# --- Page bootstrap -------------------------------------------------------
# Raise the interpreter recursion limit to 10**6.
sys.setrecursionlimit(10 ** 6)

# stdout is pointed at a scratch file, the file is closed straight away and
# the original stream is restored; the net effect is an empty
# 'temp_stdout.txt' next to the script.
original_stdout = sys.stdout
scratch_stream = open('temp_stdout.txt', 'w')
sys.stdout = scratch_stream
scratch_stream.close()
sys.stdout = original_stdout

# Streamlit page chrome (helpers come from the project's `utilities` module).
st.set_page_config(layout='wide')
load_local_css('styles.css')
set_header()

# Write every session-state entry back onto itself, skipping the auth
# widgets' keys and anything created by a form submit button.
_SKIPPED_STATE_KEYS = ('logout', 'login', 'config')
for state_key, state_value in st.session_state.items():
    if state_key in _SKIPPED_STATE_KEYS or state_key.startswith('FormSubmitter'):
        continue
    st.session_state[state_key] = state_value

# Reuse the authenticator stored in the session, building one only if absent.
authenticator = st.session_state.get('authenticator')
authenticator = load_authenticator() if authenticator is None else authenticator

# Render the login widget; the status used below is read from session state.
name, authentication_status, username = authenticator.login('Login', 'main')
auth_status = st.session_state.get('authentication_status')
# Authenticated branch of the page.
# NOTE(review): indentation was lost in this copy of the file. The
# `elif auth_status == False:` near the end of the file pairs with this `if`,
# which implies the page body in between is nested under it — confirm when
# restoring the indentation.
if auth_status == True:
# Reads the 'initialized' flag from session state (False when it is missing).
is_state_initiaized = st.session_state.get('initialized',False)
if not is_state_initiaized:
# Placeholder body: `a` is not read anywhere in the visible code.
a=1
def plot_residual_predicted(actual, predicted, df_):
    """Scatter standardized residuals against the predicted values.

    Args:
        actual: Observed target values (array-like).
        predicted: Model predictions, same length and order as ``actual``.
        df_: DataFrame with one row per observation. It is modified in
            place: the columns ``Residuals`` and ``StdResidual`` are added
            (or overwritten), so pass a copy if the original must stay intact.

    Returns:
        A Plotly figure with the residual scatter, a dashed zero line and
        solid lines at +/-2 standardized residuals.
    """
    # Subtract by position. Going through ``pd.Series(predicted)`` aligns on
    # index labels, which silently produces NaN residuals when ``predicted``
    # is a plain array/list and ``actual`` carries a non-default index.
    df_['Residuals'] = np.asarray(actual, dtype=float) - np.asarray(predicted, dtype=float)
    df_['StdResidual'] = (df_['Residuals'] - df_['Residuals'].mean()) / df_['Residuals'].std()

    fig = px.scatter(
        df_,
        x=predicted,
        y='StdResidual',
        opacity=0.5,
        color_discrete_sequence=["#11B6BD"],
    )

    # Reference lines: zero residual and the +/-2 band.
    fig.add_hline(y=0, line_dash="dash", line_color="darkorange")
    fig.add_hline(y=2, line_color="red")
    fig.add_hline(y=-2, line_color="red")

    fig.update_xaxes(title='Predicted')
    fig.update_yaxes(title='Standardized Residuals (Actual - Predicted)')

    # Fixed size so this figure matches the QQ plot rendered beside it.
    fig.update_layout(title='Residuals over Predicted Values', autosize=False, width=600, height=400)
    return fig
def residual_distribution(actual, predicted):
    """Draw a histogram with a KDE overlay of the residuals.

    Args:
        actual: Observed target values (array-like).
        predicted: Model predictions, same length and order as ``actual``.

    Returns:
        The ``matplotlib.pyplot`` module with a new 6x4 figure as the current
        figure. The figure is not closed here; the caller owns its lifetime.
    """
    # Positional subtraction: label alignment via ``pd.Series(predicted)``
    # yields NaN when ``predicted`` is an array and ``actual`` has a
    # non-default index.
    residuals = np.asarray(actual, dtype=float) - np.asarray(predicted, dtype=float)

    sns.set(style="whitegrid")
    plt.figure(figsize=(6, 4))
    # ``histplot`` plots counts by default; request densities so the bars
    # agree with the 'Probability Density' axis label below.
    sns.histplot(residuals, kde=True, color="#11B6BD", stat="density")
    plt.title(' Distribution of Residuals')
    plt.xlabel('Residuals')
    plt.ylabel('Probability Density')
    return plt
def qqplot(actual, predicted):
    """Build a QQ plot of the standardized residuals.

    Args:
        actual: Observed target values (array-like).
        predicted: Model predictions, same length and order as ``actual``.

    Returns:
        A Plotly figure with the sample-vs-theoretical quantile markers and a
        red 45-degree reference line.
    """
    # Positional subtraction (see the other residual helpers): avoids NaN
    # from index-label misalignment between ``actual`` and ``predicted``.
    residuals = pd.Series(np.asarray(actual, dtype=float) - np.asarray(predicted, dtype=float))
    standardized = (residuals - residuals.mean()) / residuals.std()

    # Build the probability plot once and reuse it for both axes.
    prob_plot = sm.ProbPlot(standardized)
    theoretical = prob_plot.theoretical_quantiles
    sample = prob_plot.sample_quantiles

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=theoretical,
        y=sample,
        mode='markers',
        marker=dict(size=5, color="#11B6BD"),
        name='QQ Plot',
    ))

    # 45-degree reference line. It always covers [-2, 2] and is widened to the
    # range of the plotted theoretical quantiles so the tails are not left
    # without a reference.
    finite_q = np.asarray(theoretical, dtype=float)
    finite_q = finite_q[np.isfinite(finite_q)]
    line_lo = min(-2.0, float(finite_q.min())) if finite_q.size else -2.0
    line_hi = max(2.0, float(finite_q.max())) if finite_q.size else 2.0
    diagonal_line = go.Scatter(
        x=[line_lo, line_hi],
        y=[line_lo, line_hi],
        mode='lines',
        line=dict(color='red'),
        name=' ',
    )
    fig.add_trace(diagonal_line)

    fig.update_layout(title='QQ Plot of Residuals', title_x=0.5, autosize=False, width=600, height=400,
                      xaxis_title='Theoretical Quantiles', yaxis_title='Sample Quantiles')
    return fig
def plot_actual_vs_predicted(date, y, predicted_values, model):
    """Plot actual vs. predicted series and compute fit metrics.

    Args:
        date: X-axis values, one per observation.
        y: Observed target values.
        predicted_values: Model predictions, same length and order as ``y``.
        model: Fitted model exposing ``df_model`` (used as the number of
            predictors in the adjusted R-squared).

    Returns:
        ``(metrics_table, fig)`` where ``metrics_table`` is a DataFrame with
        columns ``Metric`` / ``Value`` holding MAPE (in percent), R-squared
        and adjusted R-squared, and ``fig`` is the Plotly line chart.
        R-squared is NaN when ``y`` has no variance; adjusted R-squared is NaN
        when there are not enough samples for the number of predictors.
    """
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=date, y=y, mode='lines', name='Actual', line=dict(color='blue')))
    fig.add_trace(go.Scatter(x=date, y=predicted_values, mode='lines', name='Predicted', line=dict(color='orange')))

    # MAPE, expressed in percent.
    mape = mean_absolute_percentage_error(y, predicted_values) * 100

    # Work on plain arrays so observations are paired by position rather
    # than by index label.
    observed = np.asarray(y, dtype=float)
    fitted = np.asarray(predicted_values, dtype=float)

    rss = np.sum((observed - fitted) ** 2)
    tss = np.sum((observed - np.mean(observed)) ** 2)
    # A constant target has zero total variance; R-squared is undefined then.
    r_squared = 1 - (rss / tss) if tss > 0 else np.nan

    num_predictors = model.df_model
    num_samples = len(observed)
    # Guard the adjusted R-squared denominator against zero/negative values.
    residual_dof = num_samples - num_predictors - 1
    if residual_dof > 0:
        adj_r_squared = 1 - ((1 - r_squared) * ((num_samples - 1) / residual_dof))
    else:
        adj_r_squared = np.nan

    metrics_table = pd.DataFrame({
        'Metric': ['MAPE', 'R-squared', 'AdjR-squared'],
        'Value': [mape, r_squared, adj_r_squared]})

    fig.update_layout(
        xaxis=dict(title='Date'),
        yaxis=dict(title='Value'),
        title=f'MAPE : {mape:.2f}%, AdjR2: {adj_r_squared:.2f}',
        xaxis_tickangle=-30
    )
    return metrics_table, fig
# Modelling dataset, read from the working directory. The code below selects
# feature/target columns from it by name and reads its 'date' column.
transformed_data = pd.read_csv(filepath_or_buffer='transformed_data.csv')
# Feature columns per response metric.
# TODO: hard coded for now; the feature set should be read from the saved
# model instead of being maintained by hand here.
_APP_INSTALLS_FEATURES = [
    'paid_search_clicks',
    'fb:_level_achieved_-_tier_1_impressions_lag2',
    'fb:_level_achieved_-_tier_2_clicks_lag2',
    'paid_social_others_impressions_adst.1',
    'ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag2',
    'digital_tactic_others_clicks',
    'kwai_clicks_adst.3',
    'programmaticclicks',
    'indicacao_clicks_adst.1',
    'infleux_clicks_adst.4',
    'influencer_clicks',
]

_ACCOUNT_REQUESTS_FEATURES = [
    'paid_search_impressions',
    'fb:_level_achieved_-_tier_1_clicks_adst.1',
    'fb:_level_achieved_-_tier_2_clicks_adst.1',
    'paid_social_others_clicks_lag2',
    'ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag5_adst.1',
    'digital_tactic_others_clicks_adst.1',
    'kwai_clicks_adst.2',
    'programmaticimpressions_lag4_adst.1',
    'indicacao_clicks',
    'infleux_clicks_adst.2',
    'influencer_clicks',
]

_APPROVED_ACCOUNTS_APPSFLYER_FEATURES = [
    'paid_search_clicks',
    'fb:_level_achieved_-_tier_1_impressions_lag2_adst.1',
    'fb:_level_achieved_-_tier_2_impressions_lag2',
    'paid_social_others_clicks_lag2_adst.2',
    'ga_app:_will_and_cid_pequena_baixo_risco_impressions_lag4',
    'digital_tactic_others_clicks',
    'kwai_impressions_adst.2',
    'programmaticclicks_adst.5',
    'indicacao_clicks_adst.1',
    'infleux_clicks_adst.3',
    'influencer_clicks',
]

_APPROVED_ACCOUNTS_REVENUE_FEATURES = [
    'paid_search_impressions_adst.5',
    'kwai_impressions_lag2_adst.3',
    'indicacao_clicks_adst.3',
    'infleux_clicks_adst.3',
    'programmaticclicks_adst.4',
    'influencer_clicks_adst.3',
    'fb:_level_achieved_-_tier_1_impressions_adst.2',
    'fb:_level_achieved_-_tier_2_impressions_lag3_adst.5',
    'paid_social_others_impressions_adst.3',
    'ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag3_adst.5',
    'digital_tactic_others_clicks_adst.2',
]

# Maps each target column name to the list of feature columns of its model.
feature_set_dct = {
    'app_installs_-_appsflyer': _APP_INSTALLS_FEATURES,
    'account_requests_-_appsflyer': _ACCOUNT_REQUESTS_FEATURES,
    'total_approved_accounts_-_appsflyer': _APPROVED_ACCOUNTS_APPSFLYER_FEATURES,
    'total_approved_accounts_-_revenue': _APPROVED_ACCOUNTS_REVENUE_FEATURES,
}
def model_fit(features_set, target, train_size=150):
    """Fit an OLS model for one response metric and summarise its fit.

    Reads the module-level ``transformed_data`` frame. Features are min-max
    scaled, a constant column is added, the first ``train_size`` rows are
    used for training and the remaining rows for testing.

    Args:
        features_set: List of feature column names in ``transformed_data``.
        target: Name of the target column in ``transformed_data``.
        train_size: Number of leading rows used for training (default 150).

    Returns:
        A one-row DataFrame with the columns ``Model``, ``R2``, ``ADJr2``,
        ``Train Mape``, ``Test Mape``, ``Summary`` and ``Model_object``.
        The MAPE values are fractions (not multiplied by 100), rounded to two
        decimals; ``Test Mape`` is NaN when no rows are left for testing.
    """
    X = transformed_data[features_set]
    y = transformed_data[target]

    # NOTE: the scaler is fit on all rows before the split, so test rows
    # influence the scaling. The page code that rebuilds X for plotting
    # scales the same way; change both together or predictions will not match.
    scaler = MinMaxScaler()
    # Keep the source index so X and y stay aligned for any index type.
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
    X = sm.add_constant(X)

    X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
    y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

    model = sm.OLS(y_train, X_train).fit()
    r2 = model.rsquared
    adjr2 = model.rsquared_adj

    train_mape = mean_absolute_percentage_error(y_train, model.predict(X_train))
    # With no held-out rows there is nothing to score; report NaN rather
    # than failing inside the metric.
    if len(y_test) > 0:
        test_mape = mean_absolute_percentage_error(y_test, model.predict(X_test))
    else:
        test_mape = np.nan

    summary = model.summary()
    return pd.DataFrame({
        'Model': target,
        'R2': np.round(r2, 2),
        'ADJr2': np.round(adjr2, 2),
        'Train Mape': np.round(train_mape, 2),
        'Test Mape': np.round(test_mape, 2),
        'Summary': summary,
        'Model_object': model,
    }, index=[0])
# Fit one model per response metric and stack the one-row summaries.
# The rows are collected first and concatenated once: growing a frame with
# repeated pd.concat calls copies it on every iteration, and concatenating
# onto an initially empty DataFrame is deprecated in recent pandas.
fitted_rows = []
for target, feature_set in feature_set_dct.items():
    fitted_rows.append(model_fit(features_set=feature_set, target=target))

if fitted_rows:
    # ignore_index gives the same 0..n-1 index as reset_index(drop=True).
    metrics_table = pd.concat(fitted_rows, ignore_index=True)
else:
    metrics_table = pd.DataFrame()
# Three-column strip; the EDA button sits in the middle column.
eda_columns = st.columns(3)
with eda_columns[1]:
    eda = st.button('Generate EDA Report', help="Click to generate a bivariate report for the selected response metric from the table below.")

st.title('Analysis of Model Results')

# The grid shows every column except the last two ('Summary' and
# 'Model_object'), which hold Python objects rather than displayable values.
display_table = metrics_table.iloc[:, :-2]
gd = GridOptionsBuilder.from_dataframe(display_table)
gd.configure_pagination(enabled=True)
gd.configure_selection(use_checkbox=True)
gridoptions = gd.build()

# NOTE(review): columns_auto_size_mode is passed as a string here, not as a
# ColumnsAutoSizeMode enum member — confirm the installed st_aggrid accepts it.
table = AgGrid(display_table, gridOptions=gridoptions, update_mode=GridUpdateMode.SELECTION_CHANGED, fit_columns_on_grid_load=True,
               columns_auto_size_mode='ColumnsAutoSizeMode.FIT_ALL_COLUMNS_TO_VIEW')

# Depending on the st_aggrid version, `selected_rows` is a list of dicts, a
# DataFrame, or None when nothing is selected; handle all three.
selected_rows = table.selected_rows
if selected_rows is None or len(selected_rows) == 0:
    st.warning("Click on the checkbox to view comprehensive results of the selected model.")
    st.stop()  # halts this script run; nothing below executes

if isinstance(selected_rows, pd.DataFrame):
    first_selected = selected_rows.iloc[0]
else:
    first_selected = selected_rows[0]

target_column = first_selected['Model']
feature_set = feature_set_dct[target_column]
st.header('')
# EDA report for the selected response metric, rendered under the button.
with eda_columns[1]:
    if eda:
        def generate_report_with_target(channel_data, target_feature):
            """Build a Sweetviz report and return its HTML as bytes.

            The report is written into a temporary directory that is removed
            again before returning. Returns None when no report file was
            produced.
            """
            report = sv.analyze([channel_data, "Dataset"], target_feat=target_feature, verbose=False)
            with tempfile.TemporaryDirectory() as temp_dir:
                report_path = os.path.join(temp_dir, "report.html")
                # Write the report as an HTML file without opening a browser.
                report.show_html(filepath=report_path, open_browser=False)
                if not os.path.exists(report_path):
                    return None
                with open(report_path, 'rb') as f:
                    return f.read()

        # Copy before adding the target so the assignment never writes
        # through to (or warns about) a slice of `transformed_data`.
        report_data = transformed_data[feature_set].copy()
        report_data[target_column] = transformed_data[target_column]

        report_html = generate_report_with_target(report_data, target_column)
        if report_html is not None:
            st.download_button(
                label="Download EDA Report",
                data=report_html,
                file_name="report.html",
                mime="text/html"
            )
        else:
            st.warning("Report generation failed. Unable to find the report file.")
# Fitted model object for the selected response metric.
model = metrics_table.loc[metrics_table['Model'] == target_column, 'Model_object'].iloc[0]
st.header('Model Summary')
st.write(model.summary())

# Rebuild the design matrix the same way `model_fit` does (min-max scaling
# over all rows, constant column, first 150 rows for training) so that the
# stored model's predictions line up with these rows.
X = transformed_data[feature_set]
ss = MinMaxScaler()
X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
X = sm.add_constant(X)
y = transformed_data[target_column]
X_train = X.iloc[:150]
X_test = X.iloc[150:]
y_train = y.iloc[:150]
y_test = y.iloc[150:]

# Slice the dates explicitly for the x-axis. Assigning the date column as the
# index of X / y after the split does not change the already-created train
# and test slices, so their `.index` would still be integer positions.
dates = transformed_data['date']
train_dates = dates.iloc[:150]
test_dates = dates.iloc[150:]

metrics_table_train, fig_train = plot_actual_vs_predicted(train_dates, y_train, model.predict(X_train), model)
metrics_table_test, fig_test = plot_actual_vs_predicted(test_dates, y_test, model.predict(X_test), model)

# One row per data split, one column per metric.
metrics_table_train = metrics_table_train.set_index('Metric').transpose()
metrics_table_train.index = ['Train']
metrics_table_test = metrics_table_test.set_index('Metric').transpose()
metrics_table_test.index = ['test']

# From here on `metrics_table` holds the train/test overview instead of the
# per-model table built earlier.
metrics_table = np.round(pd.concat([metrics_table_train, metrics_table_test]), 2)

st.markdown('Result Overview')
st.dataframe(metrics_table, use_container_width=True)

st.subheader('Actual vs Predicted Plot Train')
st.plotly_chart(fig_train, use_container_width=True)
st.subheader('Actual vs Predicted Plot Test')
st.plotly_chart(fig_test, use_container_width=True)
# Residual diagnostics on the training rows, laid out in two columns:
# left = residuals vs. predicted, then residual distribution; right = QQ plot.
st.markdown('## Residual Analysis')
left_col, right_col = st.columns(2)

# `plot_residual_predicted` adds columns to the frame it receives, so it is
# handed a copy of the training design matrix.
Xtrain1 = X_train.copy()
train_predictions = model.predict(X_train)

with left_col:
    residual_scatter = plot_residual_predicted(y_train, model.predict(Xtrain1), Xtrain1)
    st.plotly_chart(residual_scatter)

with right_col:
    st.empty()
    qq_figure = qqplot(y_train, train_predictions)
    st.plotly_chart(qq_figure)

with left_col:
    distribution_plot = residual_distribution(y_train, train_predictions)
    st.pyplot(distribution_plot)
# Failed-login branch: pairs with the `if auth_status == True:` earlier in
# the file. Shows an error and offers the authenticator's forgot-password
# widget.
elif auth_status == False:
st.error('Username/Password is incorrect')
try:
# forgot_password returns the username, the e-mail and a newly generated
# password for the submitted form.
username_forgot_pw, email_forgot_password, random_password = authenticator.forgot_password('Forgot password')
if username_forgot_pw:
# NOTE(review): `random_password` is not used anywhere in the visible code,
# so no delivery step is shown here even though the message says the password
# was sent — confirm it is delivered elsewhere.
st.success('New password sent securely')
# Random password to be transferred to the user securely
elif username_forgot_pw == False:
st.error('Username not found')
except Exception as e:
# Any failure from the widget is surfaced to the user as an error box.
st.error(e)