Spaces:

BlendMMM
/

Mastercard

Sleeping

App Files Files Community

Mastercard / Data_prep_functions.py

BlendMMM

Upload Data_prep_functions.py

dfeb790 verified over 1 year ago

raw

history blame

8.22 kB

	import streamlit as st
	import pandas as pd
	import plotly.express as px
	import plotly.graph_objects as go
	import numpy as np
	import pickle
	import statsmodels.api as sm
	import numpy as np
	from sklearn.metrics import mean_absolute_error, r2_score,mean_absolute_percentage_error
	from sklearn.preprocessing import MinMaxScaler
	import matplotlib.pyplot as plt
	from statsmodels.stats.outliers_influence import variance_inflation_factor
	from plotly.subplots import make_subplots

	st.set_option('deprecation.showPyplotGlobalUse', False)
	from datetime import datetime
	import seaborn as sns

	def calculate_discount(promo_price_series, non_promo_price_series):
	# Calculate the 4-week moving average of non-promo price
	window_size = 4
	base_price = non_promo_price_series.rolling(window=window_size).mean()

	# Calculate discount_raw
	discount_raw_series = (1 - promo_price_series / base_price) * 100

	# Calculate discount_final
	discount_final_series = discount_raw_series.where(discount_raw_series >= 5, 0)

	return base_price, discount_raw_series, discount_final_series


	def create_dual_axis_line_chart(date_series, promo_price_series, non_promo_price_series, base_price_series, discount_series):
	# Create traces for the primary axis (price vars)
	trace1 = go.Scatter(
	x=date_series,
	y=promo_price_series,
	name='Promo Price',
	yaxis='y1'
	)

	trace2 = go.Scatter(
	x=date_series,
	y=non_promo_price_series,
	name='Non-Promo Price',
	yaxis='y1'
	)

	trace3 = go.Scatter(
	x=date_series,
	y=base_price_series,
	name='Base Price',
	yaxis='y1'
	)

	# Create a trace for the secondary axis (discount)
	trace4 = go.Scatter(
	x=date_series,
	y=discount_series,
	name='Discount',
	yaxis='y2'
	)

	# Create the layout with dual axes
	layout = go.Layout(
	title='Price and Discount Over Time',
	yaxis=dict(
	title='Price',
	side='left'
	),
	yaxis2=dict(
	title='Discount',
	side='right',
	overlaying='y',
	showgrid=False
	),
	xaxis=dict(title='Date'),
	)

	# Create the figure with the defined traces and layout
	fig = go.Figure(data=[trace1, trace2, trace3, trace4], layout=layout)

	return fig


	def to_percentage(value):
	return f'{value * 100:.1f}%'


	def plot_actual_vs_predicted(date, y, predicted_values, model, target_column=None, flag=None, repeat_all_years=False, is_panel=False):
	if flag is not None:
	fig = make_subplots(specs=[[{"secondary_y": True}]])
	else:
	fig = go.Figure()

	if is_panel:
	df = pd.DataFrame()
	df['date'] = date
	df['Actual'] = y
	df['Predicted'] = predicted_values
	df_agg = df.groupby('date').agg({'Actual': 'sum', 'Predicted': 'sum'}).reset_index()
	df_agg.columns = ['date', 'Actual', 'Predicted']
	assert len(df_agg) == pd.Series(date).nunique()

	fig.add_trace(go.Scatter(x=df_agg['date'], y=df_agg['Actual'], mode='lines', name='Actual', line=dict(color='#08083B')))
	fig.add_trace(go.Scatter(x=df_agg['date'], y=df_agg['Predicted'], mode='lines', name='Predicted', line=dict(color='#11B6BD')))

	else:
	fig.add_trace(go.Scatter(x=date, y=y, mode='lines', name='Actual', line=dict(color='#08083B')))
	fig.add_trace(go.Scatter(x=date, y=predicted_values, mode='lines', name='Predicted', line=dict(color='#11B6BD')))

	line_values = []
	if flag:
	min_date, max_date = flag[0], flag[1]
	min_week = datetime.strptime(str(min_date), "%Y-%m-%d").strftime("%U")
	max_week = datetime.strptime(str(max_date), "%Y-%m-%d").strftime("%U")
	month = pd.to_datetime(min_date).month
	day = pd.to_datetime(min_date).day

	if repeat_all_years:
	line_values = list(pd.Series(date).map(lambda x: 1 if (pd.Timestamp(x).week >= int(min_week)) & (pd.Timestamp(x).week <= int(max_week)) else 0))
	assert len(line_values) == len(date)
	fig.add_trace(go.Scatter(x=date, y=line_values, mode='lines', name='Flag', line=dict(color='#FF5733')), secondary_y=True)
	else:
	line_values = list(pd.Series(date).map(lambda x: 1 if (pd.Timestamp(x) >= pd.Timestamp(min_date)) and (pd.Timestamp(x) <= pd.Timestamp(max_date)) else 0))
	fig.add_trace(go.Scatter(x=date, y=line_values, mode='lines', name='Flag', line=dict(color='#FF5733')), secondary_y=True)

	mape = mean_absolute_percentage_error(y, predicted_values)
	r2 = r2_score(y, predicted_values)
	adjr2 = 1 - (1 - r2) * (len(y) - 1) / (len(y) - len(model.fe_params) - 1)

	metrics_table = pd.DataFrame({
	'Metric': ['MAPE', 'R-squared', 'AdjR-squared'],
	'Value': [mape, r2, adjr2]
	})

	# Convert date to datetime
	date = pd.to_datetime(date)

	# Calculate the number of days between each tick based on the date range
	date_range = (max(date) - min(date)).days
	#x_axis_tick_spacing = max(1, date_range // 50) # Divide the date range by 14 to get approximately 15 ticks

	fig.update_layout(
	xaxis=dict(title='Date', tickangle=-30),
	yaxis=dict(title=target_column),
	)

	fig.add_annotation(
	text=f"MAPE: {mape * 100:0.1f}%, Adjr2: {adjr2 * 100:.1f}%",
	xref="paper",
	yref="paper",
	x=0.95,
	y=1.2,
	showarrow=False,
	)

	return metrics_table, line_values, fig

	def plot_residual_predicted(actual, predicted, df):
	df_=df.copy()
	df_['Residuals'] = actual - pd.Series(predicted)
	df_['StdResidual'] = (df_['Residuals'] - df_['Residuals'].mean()) / df_['Residuals'].std()

	# Create a Plotly scatter plot
	fig = px.scatter(df_, x=predicted, y='StdResidual', opacity=0.5,color_discrete_sequence=["#11B6BD"])

	# Add horizontal lines
	fig.add_hline(y=0, line_dash="dash", line_color="darkorange")
	fig.add_hline(y=2, line_color="red")
	fig.add_hline(y=-2, line_color="red")

	fig.update_xaxes(title='Predicted')
	fig.update_yaxes(title='Standardized Residuals (Actual - Predicted)')

	# Set the same width and height for both figures
	fig.update_layout(title='2.3.1 Residuals over Predicted Values', autosize=False, width=600, height=400)

	return fig

	def residual_distribution(actual, predicted):
	Residuals = actual - pd.Series(predicted)

	# Create a Seaborn distribution plot
	sns.set(style="whitegrid")
	plt.figure(figsize=(6, 4))
	sns.histplot(Residuals, kde=True, color="#11B6BD")

	plt.title('2.3.3 Distribution of Residuals')
	plt.xlabel('Residuals')
	plt.ylabel('Probability Density')

	return plt


	def qqplot(actual, predicted):
	Residuals = actual - pd.Series(predicted)
	Residuals = pd.Series(Residuals)
	Resud_std = (Residuals - Residuals.mean()) / Residuals.std()

	# Create a QQ plot using Plotly with custom colors
	fig = go.Figure()
	fig.add_trace(go.Scatter(x=sm.ProbPlot(Resud_std).theoretical_quantiles,
	y=sm.ProbPlot(Resud_std).sample_quantiles,
	mode='markers',
	marker=dict(size=5, color="#11B6BD"),
	name='QQ Plot'))

	# Add the 45-degree reference line
	diagonal_line = go.Scatter(
	x=[-2, 2], # Adjust the x values as needed to fit the range of your data
	y=[-2, 2], # Adjust the y values accordingly
	mode='lines',
	line=dict(color='red'), # Customize the line color and style
	name=' '
	)
	fig.add_trace(diagonal_line)

	# Customize the layout
	fig.update_layout(title='2.3.2 QQ Plot of Residuals',title_x=0.5, autosize=False, width=600, height=400,
	xaxis_title='Theoretical Quantiles', yaxis_title='Sample Quantiles')

	return fig