import os
import streamlit as st
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import plotly.express as px
from utils.dataprocess import (load_data, df_col, numberOfDiff,create_features,
apply_moving_average, apply_exponential_average,
first_order_diff, second_order_diff, isStatinary,
inverse_first_order_diff)
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from utils.graphics import plotDecompse, plotTs, plotForcast, plotTransformation
from models.arima import auto_arima, sarimax_forecast
from utils.constants import DEFAULT_DATASETS_DIR, FREQ_DICT
import xgboost as xgb
from models.xgboost import xgboost
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, root_mean_squared_error
# ---- App header and data-source selection ------------------------------
st.title("Time Series Forcasting Web App")
# Sidebar handle used by every configuration widget below.
sider = st.sidebar

sider.write("Load Data")
default_data = sider.selectbox("Defaults", options=os.listdir(DEFAULT_DATASETS_DIR))
upload_file = sider.file_uploader("Uploader un fichier CSV", type=["csv"])
default_url = os.path.join(DEFAULT_DATASETS_DIR, default_data)

# An uploaded CSV takes precedence over the bundled default dataset.
data_url = default_url
if upload_file is not None:
    data_url = upload_file
    st.session_state.file_uploaded = True
else:
    st.session_state.file_uploaded = False
# NOTE: the previous code called st.experimental_rerun() whenever
# `file_uploaded` was True. Streamlit already reruns the script on upload,
# and because the flag was re-set to True on every pass, that produced an
# infinite rerun loop; the explicit rerun is removed.
# ---- Column selection, loading, and frequency normalisation ------------
cols = df_col(data_url)
date_col = sider.selectbox("Date", options=cols)
data_col = sider.selectbox("Data", options=[c for c in cols if c != date_col])
try:
    df = load_data(data_url, date_col, data_col)
except Exception:
    # The chosen columns may not exist in an uploaded file; fall back to
    # the bundled default dataset instead of crashing the app.
    cols = df_col(default_url)
    df = load_data(default_url, date_col, data_col)
    st.text("Failed")

# Fix the data frequency: infer it when possible, otherwise ask the user.
freq = pd.infer_freq(df.index)
if freq is None:
    st.info("Unable to infer the data freq, enter it manually")
    n = st.number_input(label="Periode of ", min_value=1, step=1, format='%u')
    freq_type = st.selectbox("Type", options=list(FREQ_DICT.keys()))
    # e.g. n=2, freq_type -> "D" gives "2D"; bfill plugs holes introduced
    # by the reindexing.
    df = df.asfreq(freq=str(n) + FREQ_DICT[freq_type], method='bfill')
else:
    df = df.asfreq(freq=freq, method='bfill')

# Keep an untouched copy of the raw series.
df_copy = df.copy()
# ---- Data transformation -----------------------------------------------
sider.divider()
transformations_function = [np.log, np.sqrt, first_order_diff, second_order_diff]


def transform_view(df, model="", transform_name="", key="transform"):
    """Let the user pick transformations and apply them in order to df["data"].

    The result is written to the column ``"Transformed" + model``.

    Parameters
    ----------
    df : DataFrame with a "data" column.
    model : suffix distinguishing the output column per model.
    transform_name : prefix for the returned chain name.
    key : kept for interface compatibility (currently unused).

    Returns
    -------
    (df, transform_name) where transform_name describes the applied chain,
    e.g. "log_diff-12".
    """
    transf_col = "Transformed" + model
    available = [t.__name__ for t in transformations_function]
    selected_transformations = sider.multiselect("Data Transfomations", available)

    # Start from the raw series and apply each selected step in order.
    df[transf_col] = df["data"]
    applied = []
    for transformation in selected_transformations:
        try:
            if transformation == np.log.__name__:
                df[transf_col] = np.log(df[transf_col])
                applied.append("log")
            elif transformation == np.sqrt.__name__:
                df[transf_col] = np.sqrt(df[transf_col])
                applied.append("sqrt")
            elif transformation == "first_order_diff":
                # first_order_diff also reports the lag it used.
                df[transf_col], lags = first_order_diff(df[transf_col])
                applied.append(f"diff-{lags}")
            elif transformation == "second_order_diff":
                df[transf_col] = second_order_diff(df[transf_col])
                applied.append("diff2")
        except Exception:
            st.error(f"Unable to apply {transformation}")
    # Join step names with "_" (the old code appended a trailing "_" and
    # stripped it, which also clipped a caller-supplied prefix's last
    # character when nothing was selected).
    transform_name = transform_name + "_".join(applied)
    return df, transform_name


df, transform_name = transform_view(df)
# ---- Graphics ----------------------------------------------------------
st.subheader("Visualize your Time Series")
original, decomp, transforms, smoothing, pacf_acf = st.tabs(
    ["ORIGINAL DATA", "DECOMPOSITION", "TRANSFORMATIONS", "SMOOTHING", "PACF AND ACF"])
d, df_diff = numberOfDiff(df.data)

with original:
    fig1 = plotTs(df["data"].to_frame())
    st.plotly_chart(fig1)

with decomp:
    fig2 = plotDecompse(df["data"].to_frame())
    st.plotly_chart(fig2)

with transforms:
    transf_fig = plotTransformation(df["data"], df["Transformed"].dropna(),
                                    transfom_name=transform_name)
    st.plotly_chart(transf_fig)
    try:
        decision = isStatinary(df["Transformed"].dropna())
        # The original markdown string was split over several lines and was
        # not valid Python; reconstructed as a single f-string.
        st.markdown(f"Stationarity : {decision}", unsafe_allow_html=True)
    except Exception:
        # Stationarity test can fail on short/degenerate series; the badge
        # is optional, so skip it silently.
        pass

with smoothing:
    smoothing_type = st.selectbox("Smoothing Type",
                                  options=["Moving Average", "Exponential Average"])
    if smoothing_type == "Exponential Average":
        st.plotly_chart(plotTs(apply_exponential_average(df["data"]).to_frame()))
    else:
        st.plotly_chart(plotTs(apply_moving_average(df["data"]).to_frame()))

with pacf_acf:
    choice = st.selectbox("Plot PACF an ACF of :", options=['Original', 'Transformed'])
    acf_fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1)
    # "Original" plots the raw series, otherwise the transformed column.
    series = df["data"] if choice == "Original" else df[choice].dropna()
    plot_acf(series, ax=ax1)
    plot_pacf(series, ax=ax2)
    st.pyplot(acf_fig)

# ---- Chronological 80/20 train/test split ------------------------------
train_size = int(0.8 * len(df))
train = df.iloc[:train_size, :]
test = df.iloc[train_size:, :]
# ---- Model selection and per-model configuration -----------------------
options = sider.multiselect('Models', options=["ARIMA", "SARIMA", "XGBoost"])
models = {}     # option -> (fitted model, confidence-interval frame or None)
model_inv = {}  # option -> inverse of the scaling transform, for error eval


def _scale_transform(label, key, target_col, model):
    """Sidebar log/sqrt/None scaling of df['data'] into df[target_col].

    Records the matching inverse function in model_inv[model]; falls back
    to the raw data (identity inverse) when the transform fails.
    """
    func = sider.selectbox(label, options=["None", "log", "sqrt"], index=0, key=key)
    # Default: untouched data with an identity inverse.
    df[target_col] = df["data"]
    model_inv[model] = lambda x: x
    if func == "log":
        try:
            df[target_col] = np.log(df["data"])
            model_inv[model] = np.exp
        except Exception:
            st.error("Unable to apply log, use the default data")
            df[target_col] = df["data"]
            model_inv[model] = lambda x: x
    elif func == "sqrt":
        try:
            df[target_col] = np.sqrt(df["data"])
            model_inv[model] = np.square
        except Exception:
            st.error("Unable to apply sqrt, use the default data")
            df[target_col] = df["data"]
            model_inv[model] = lambda x: x


for option in options:
    if option == "ARIMA":
        sider.divider()
        sider.subheader("ARIMA")
        _scale_transform("Apply ", "arima", "X_ARIMA", "ARIMA")
        # Re-split so the new X_ARIMA column is part of train/test.
        train_size = int(0.8 * len(df))
        train = df.iloc[:train_size, :]
        test = df.iloc[train_size:, :]
        result = None
        p_range_arima = sider.slider("P Range", min_value=0, max_value=30,
                                     value=[0, 0], key="arima_p_range")
        q_range_arima = sider.slider("Q Range", min_value=0, max_value=30,
                                     value=[0, 0], key="arima_q_range")
        arima_params_dict = {
            "start_p": p_range_arima[0],
            "start_q": q_range_arima[0],
            "max_p": p_range_arima[1],
            "max_q": q_range_arima[1],
        }
    elif option == "SARIMA":
        sider.subheader("SARIMA")
        _scale_transform("Apply ", "sarima", "X_SARIMA", "SARIMA")
        train_size = int(0.8 * len(df))
        train = df.iloc[:train_size, :]
        test = df.iloc[train_size:, :]
        result = None
        m = sider.slider("seasonal period m ", min_value=0, max_value=30, value=12)
        p_range = sider.slider("p Range", min_value=0, max_value=30,
                               value=[0, 0], key="sarima_p_range")
        q_range = sider.slider("q Range", min_value=0, max_value=30,
                               value=[0, 0], key="sarima_q_range")
        P_range = sider.slider("P Range", min_value=0, max_value=30, value=[0, 0])
        Q_range = sider.slider("Q Range", min_value=0, max_value=30, value=[0, 0])
        sarima_params_dict = {
            "seasonal": True,
            "m": m,
            "start_p": p_range[0],
            "start_q": q_range[0],
            "max_p": p_range[1],
            "max_q": q_range[1],
            "start_P": P_range[0],
            "start_Q": Q_range[0],
            "max_P": P_range[1],
            # BUG FIX: was q_range[1], which silently ignored the seasonal
            # Q slider.
            "max_Q": Q_range[1],
        }
    elif option == "XGBoost":
        sider.subheader(option)
        _scale_transform("Apply First Transformation", "xgboost_f1",
                         "X_XGBoost_1", "XGBoost")
        f2 = sider.selectbox("Apply Second Tronsformation",
                             options=["None", "first_order_diff"],
                             index=0, key="xgboost_2")
        if f2 == "None":
            df["X_XGBoost"] = df["X_XGBoost_1"]
            model_inv["XGBoost"] = lambda x: x
        elif f2 == "first_order_diff":
            try:
                # xg_lags is needed later to invert the differencing.
                df["X_XGBoost"], xg_lags = first_order_diff(df["X_XGBoost_1"].bfill())
            except Exception:
                st.error("Unable to apply first_order diff, use the default data")
        train_size = int(0.8 * len(df))
        train = df.iloc[:train_size, :]
        test = df.iloc[train_size:, :]
        max_depth = sider.slider("Max Depth", min_value=1, max_value=30, value=5)
        lags = sider.slider("Lags features", min_value=1, max_value=30, value=5)
        learning_rate = sider.number_input(label="Learning Rate ", min_value=0.0001,
                                           max_value=0.75, step=0.01, value=0.01)
        n_estimators = sider.number_input(label="n_estimator ", min_value=100,
                                          max_value=5000, step=100, value=1000)
        X_train, y_train = create_features(train, lags=lags, feature_col="X_XGBoost")
        # Prepend the last `lags` training rows so the first test rows have
        # complete lag features.
        X_test, y_test = create_features(
            pd.concat([train.iloc[-lags:, :], test["X_XGBoost"]]),
            lags=lags, feature_col="X_XGBoost")
# ---- Fit the selected models -------------------------------------------
fit = sider.button("Train Models")
if fit:
    for option in options:
        if option == "ARIMA":
            st.subheader("ARIMA")
            result_arima = auto_arima(train["X_ARIMA"],
                                      start_p=arima_params_dict["start_p"],
                                      start_q=arima_params_dict["start_q"],
                                      max_q=arima_params_dict["max_q"],
                                      max_p=arima_params_dict["max_p"],
                                      )
            arimax_pred, conf = sarimax_forecast(result_arima, steps=len(test))
            conf_int_arima = pd.DataFrame(conf, index=test.index,
                                          columns=['lower data', "upper data"])
            # BUG FIX: previously stored the stale `result` placeholder
            # (always None) instead of the fitted model.
            models[option] = (result_arima, conf_int_arima)
            test[option] = arimax_pred
        elif option == "SARIMA":
            st.subheader("SARIMA")
            result_sarima = auto_arima(train["X_SARIMA"], **sarima_params_dict)
            sarimax_pred, conf = sarimax_forecast(result_sarima, steps=len(test))
            conf_int_sarima = pd.DataFrame(conf, index=test.index,
                                           columns=['lower data', "upper data"])
            # BUG FIX: same stale-`result` issue as ARIMA above.
            models[option] = (result_sarima, conf_int_sarima)
            test[option] = sarimax_pred
        elif option == "XGBoost":
            # BUG FIX: header previously said "SARIMA".
            st.subheader("XGBoost")
            model_xgb = xgboost(X_train, y_train, max_depth=max_depth,
                                learning_rate=learning_rate,
                                n_estimators=n_estimators)
            test[option] = model_xgb.predict(X_test)
            xgb_fig, xgb_ax = plt.subplots()
            xgb.plot_importance(model_xgb, ax=xgb_ax, max_num_features=5)
            models[option] = (model_xgb, None)  # no confidence interval
            st.pyplot(xgb_fig)
# ---- Forecast plots, one tab per trained model -------------------------
st.divider()
st.subheader("Prediction")
if options:
    pred_tabs = st.tabs(options)
    for tab, option in zip(pred_tabs, options):
        # A column only exists in `test` once the model has been fitted.
        if option not in test.columns:
            continue
        # Only the (S)ARIMA models produce confidence intervals.
        confint = None
        if option == "ARIMA":
            confint = conf_int_arima
        elif option == "SARIMA":
            confint = conf_int_sarima
        with tab:
            forecast_fig = plotForcast(df[f"X_{option}"], test[option], confint=confint)
            st.plotly_chart(forecast_fig)
# ---- Per-model error metrics on the original scale ---------------------
metric_labels = ["MAE", "MAPE", "RMSE"]
errors = {"Model": [], "Type": [], "error": []}
for option in options:
    if option not in test.columns:
        continue
    # Undo the model-specific transforms so errors are comparable on the
    # original data scale.
    if option == "XGBoost" and f2 == "first_order_diff":
        seed = test["X_XGBoost_1"].iloc[:xg_lags].values
        inv = model_inv["XGBoost"](
            inverse_first_order_diff(test[option], xg_lags, seed))
    else:
        inv = model_inv[option](test[option])
    metric_values = [
        mean_absolute_error(test["data"], inv),
        mean_absolute_percentage_error(test["data"], inv),
        root_mean_squared_error(test["data"], inv),
    ]
    errors["Model"].extend([option] * len(metric_labels))
    errors["Type"].extend(metric_labels)
    errors["error"].extend(metric_values)
# ---- Error comparison charts -------------------------------------------
if fit:
    st.divider()
    st.subheader("Compare Models Errors")
    errors_df = pd.DataFrame(errors)
    # NOTE: removed an unused `plt.subplots(nrows=2)` call that leaked a
    # matplotlib figure on every rerun — only the plotly charts are shown.
    # MAE/RMSE share the data's scale; MAPE is a ratio, so plot it apart.
    scale_fig = px.bar(errors_df[errors_df["Type"].isin(["MAE", "RMSE"])],
                       y="error", x="Type", color="Model", barmode="group")
    ratio_fig = px.bar(errors_df[errors_df["Type"].isin(["MAPE"])],
                       y="error", x="Type", color="Model", barmode="group")
    st.plotly_chart(scale_fig)
    st.plotly_chart(ratio_fig)