Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| import statsmodels.api as sm | |
| from sklearn.metrics import mean_absolute_error, r2_score,mean_absolute_percentage_error | |
| from sklearn.preprocessing import MinMaxScaler | |
| import matplotlib.pyplot as plt | |
| from statsmodels.stats.outliers_influence import variance_inflation_factor | |
| from plotly.subplots import make_subplots | |
| st.set_option('deprecation.showPyplotGlobalUse', False) | |
| from datetime import datetime | |
| import seaborn as sns | |
def _flag_values(date, min_date, max_date, repeat_all_years):
    """Return a 0/1 list marking which entries of ``date`` fall inside the flag window."""
    stamps = [pd.Timestamp(d) for d in pd.Series(date)]
    start, end = pd.Timestamp(min_date), pd.Timestamp(max_date)

    if not repeat_all_years:
        return [1 if start <= ts <= end else 0 for ts in stamps]

    # Use the same week numbering (pandas ISO week) for the bounds and for the
    # data. Mixing strftime("%U") bounds with ISO weeks shifts the window.
    first_week, last_week = start.week, end.week
    if first_week <= last_week:
        return [1 if first_week <= ts.week <= last_week else 0 for ts in stamps]
    # The window wraps over New Year (e.g. week 51 -> week 2): flag both ends.
    return [
        1 if (ts.week >= first_week or ts.week <= last_week) else 0
        for ts in stamps
    ]


def plot_actual_vs_predicted(date, y, predicted_values, model, target_column=None, flag=None, repeat_all_years=False, is_panel=False):
    """
    Plots actual vs predicted values with optional flags and aggregation for panel data.

    Parameters:
        date (pd.Series): Series of dates for x-axis.
        y (pd.Series): Actual values.
        predicted_values (pd.Series): Predicted values from the model.
        model (object): Trained model object; only ``len(model.params)`` is used,
            for the adjusted R-squared.
        target_column (str, optional): Name of the target column (y-axis title).
        flag (tuple, optional): Start and end dates for flagging periods. Each
            bound may be anything ``pd.Timestamp`` accepts (date, datetime,
            ISO string).
        repeat_all_years (bool, optional): Whether to repeat flags for all years.
            When True, every date whose ISO week lies between the ISO weeks of
            the two bounds is flagged; a window whose start week is greater
            than its end week is treated as wrapping over New Year.
        is_panel (bool, optional): Whether the data is panel data requiring aggregation.

    Returns:
        metrics_table (pd.DataFrame): DataFrame containing MAPE, R-squared, and Adjusted R-squared.
        line_values (list): List of 0/1 flag values, one per entry of ``date``
            (empty when no flag is given).
        fig (go.Figure): Plotly figure object.
    """
    # One truthiness check decides both the figure type and the flag trace, so
    # an empty flag no longer yields a secondary-axis figure with no flag line.
    has_flag = bool(flag)
    if has_flag:
        fig = make_subplots(specs=[[{"secondary_y": True}]])
    else:
        fig = go.Figure()

    if is_panel:
        # Panel data has several rows per date: sum them to one point per date.
        df = pd.DataFrame()
        df['date'] = date
        df['Actual'] = y
        df['Predicted'] = predicted_values
        df_agg = df.groupby('date').agg({'Actual': 'sum', 'Predicted': 'sum'}).reset_index()
        df_agg.columns = ['date', 'Actual', 'Predicted']
        assert len(df_agg) == pd.Series(date).nunique()
        fig.add_trace(go.Scatter(x=df_agg['date'], y=df_agg['Actual'], mode='lines', name='Actual', line=dict(color='#08083B')))
        fig.add_trace(go.Scatter(x=df_agg['date'], y=df_agg['Predicted'], mode='lines', name='Predicted', line=dict(color='#11B6BD')))
    else:
        fig.add_trace(go.Scatter(x=date, y=y, mode='lines', name='Actual', line=dict(color='#08083B')))
        fig.add_trace(go.Scatter(x=date, y=predicted_values, mode='lines', name='Predicted', line=dict(color='#11B6BD')))

    line_values = []
    if has_flag:
        min_date, max_date = flag[0], flag[1]
        line_values = _flag_values(date, min_date, max_date, repeat_all_years)
        fig.add_trace(go.Scatter(x=date, y=line_values, mode='lines', name='Flag', line=dict(color='#FF5733')), secondary_y=True)

    # Metrics are computed on the raw (un-aggregated) inputs.
    mape = mean_absolute_percentage_error(y, predicted_values)
    r2 = r2_score(y, predicted_values)
    # NOTE(review): if model.params already counts the intercept, the extra
    # "- 1" below double-counts it -- confirm against the fitted model type.
    adjr2 = 1 - (1 - r2) * (len(y) - 1) / (len(y) - len(model.params) - 1)
    metrics_table = pd.DataFrame({
        'Metric': ['MAPE', 'R-squared', 'AdjR-squared'],
        'Value': [mape, r2, adjr2]
    })

    fig.update_layout(
        xaxis=dict(title='Date'),
        yaxis=dict(title=target_column),
        xaxis_tickangle=-30
    )
    fig.add_annotation(
        text=f"MAPE: {mape * 100:0.1f}%, Adj. R-squared: {adjr2 * 100:.1f}%",
        xref="paper",
        yref="paper",
        x=0.95,
        y=1.2,
        showarrow=False,
    )
    return metrics_table, line_values, fig
def plot_residual_predicted(actual, predicted, df):
    """
    Scatter-plots standardized residuals (actual - predicted) against predictions.

    Parameters:
        actual (pd.Series): Actual values.
        predicted (pd.Series): Predicted values; also used as the x-axis data.
        df (pd.DataFrame): Source data. It is copied, so the caller's frame is
            not modified.

    Returns:
        fig (go.Figure): Plotly figure with a dashed zero line and solid
            reference lines at +2 and -2 standardized residuals.
    """
    # NOTE(review): pandas aligns this subtraction (and the column assignment
    # below) by index label -- confirm callers pass matching indexes.
    raw_residuals = actual - pd.Series(predicted)

    frame = df.copy()
    frame['Residuals'] = raw_residuals
    resid_col = frame['Residuals']
    frame['StdResidual'] = (resid_col - resid_col.mean()) / resid_col.std()

    fig = px.scatter(
        frame,
        x=predicted,
        y='StdResidual',
        opacity=0.5,
        color_discrete_sequence=["#11B6BD"],
    )

    # Zero reference first, then the +/-2 bands, in the original draw order.
    reference_lines = (
        (0, {"line_dash": "dash", "line_color": "darkorange"}),
        (2, {"line_color": "red"}),
        (-2, {"line_color": "red"}),
    )
    for level, line_style in reference_lines:
        fig.add_hline(y=level, **line_style)

    fig.update_xaxes(title='Predicted')
    fig.update_yaxes(title='Standardized Residuals (Actual - Predicted)')
    fig.update_layout(
        title='2.3.1 Residuals over Predicted Values',
        autosize=False,
        width=600,
        height=400,
    )
    return fig
def residual_distribution(actual, predicted):
    """
    Draws a histogram with a KDE overlay of the residuals (actual - predicted).

    Parameters:
        actual (pd.Series): Actual values.
        predicted (pd.Series): Predicted values.

    Returns:
        plt (module): The ``matplotlib.pyplot`` module itself, with the new
            6x4 figure as its current figure.
    """
    resid = actual - pd.Series(predicted)

    sns.set(style="whitegrid")
    plt.figure(figsize=(6, 4))
    # histplot draws on the current axes and returns them.
    # NOTE(review): no `stat` is passed, so seaborn's default statistic is
    # plotted -- confirm the 'Probability Density' label matches it.
    axes = sns.histplot(resid, kde=True, color="#11B6BD")
    axes.set_title('2.3.3 Distribution of Residuals')
    axes.set_xlabel('Residuals')
    axes.set_ylabel('Probability Density')
    return plt
def qqplot(actual, predicted):
    """
    Creates a QQ plot of the standardized residuals (actual - predicted).

    Parameters:
        actual (pd.Series): Actual values.
        predicted (pd.Series): Predicted values.

    Returns:
        fig (go.Figure): Plotly figure with the sample-vs-theoretical quantile
            markers and a fixed red reference line from (-2, -2) to (2, 2).
    """
    residuals = pd.Series(actual - pd.Series(predicted))
    standardized = (residuals - residuals.mean()) / residuals.std()

    # Build the probability plot once and read both quantile sets from it,
    # rather than constructing an identical ProbPlot per axis.
    prob_plot = sm.ProbPlot(standardized)

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=prob_plot.theoretical_quantiles,
        y=prob_plot.sample_quantiles,
        mode='markers',
        marker=dict(size=5, color="#11B6BD"),
        name='QQ Plot',
    ))

    # y = x reference line; the name is a single space, as in the original.
    diagonal_line = go.Scatter(
        x=[-2, 2],
        y=[-2, 2],
        mode='lines',
        line=dict(color='red'),
        name=' '
    )
    fig.add_trace(diagonal_line)

    fig.update_layout(title='2.3.2 QQ Plot of Residuals', title_x=0.5, autosize=False, width=600, height=400,
                      xaxis_title='Theoretical Quantiles', yaxis_title='Sample Quantiles')
    return fig