"""Streamlit web app for time-series forecasting.

Loads a CSV (a bundled default or a user upload), lets the user pick the
date/data columns, apply transformations (log, sqrt, differencing),
visualize the series (decomposition, smoothing, ACF/PACF), then fit and
compare ARIMA / SARIMA / XGBoost forecasts with MAE / MAPE / RMSE.
"""
import os

import streamlit as st
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import plotly.express as px
from utils.dataprocess import (load_data, df_col, numberOfDiff, create_features,
                               apply_moving_average, apply_exponential_average,
                               first_order_diff, second_order_diff, isStatinary,
                               inverse_first_order_diff)
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from utils.graphics import plotDecompse, plotTs, plotForcast, plotTransformation
from models.arima import auto_arima, sarimax_forecast
from utils.constants import DEFAULT_DATASETS_DIR, FREQ_DICT
import xgboost as xgb
from models.xgboost import xgboost
import seaborn as sns
from sklearn.metrics import (mean_absolute_error, mean_absolute_percentage_error,
                             root_mean_squared_error)

# App title
st.title("Time Series Forcasting Web App")

# App sidebar
sider = st.sidebar

# --------------------------------------------------------------- Load data
sider.write("Load Data")
default_data = sider.selectbox("Defaults", options=os.listdir(DEFAULT_DATASETS_DIR))
upload_file = sider.file_uploader("Uploader un fichier CSV", type=["csv"])
default_url = os.path.join(DEFAULT_DATASETS_DIR, default_data)
data_url = default_url

# BUG FIX: the original called st.experimental_rerun() on *every* run while
# a file was uploaded, producing an endless rerun loop. Rerun only once,
# when the upload first appears (flag transitions False -> True).
was_uploaded = st.session_state.get("file_uploaded", False)
if upload_file is not None:
    data_url = upload_file
    st.session_state.file_uploaded = True
else:
    st.session_state.file_uploaded = False
if st.session_state.file_uploaded and not was_uploaded:
    st.experimental_rerun()

cols = df_col(data_url)
date_col = sider.selectbox("Date", options=cols)
data_col = sider.selectbox("Data", options=[c for c in cols if c != date_col])

try:
    df = load_data(data_url, date_col, data_col)
except Exception:
    # Best-effort fallback: if the chosen file cannot be parsed with the
    # selected columns, reload the bundled default dataset instead.
    cols = df_col(default_url)
    df = load_data(default_url, date_col, data_col)
    st.text("Failed")

# ------------------------------------------------------ Fix the data freq
freq = pd.infer_freq(df.index)
if freq is None:
    st.info("Unable to infer the data freq, enter it manually")
    n = st.number_input(label="Periode of ", min_value=1, step=1, format='%u')
    freq_type = st.selectbox("Type", options=list(FREQ_DICT.keys()))
    df = df.asfreq(freq=str(n) + FREQ_DICT[freq_type], method='bfill')
else:
    df = df.asfreq(freq=freq, method='bfill')

# Save a copy of the dataframe
df_copy = df.copy()

# ----------------------------------------------------- Data transformation
sider.divider()
transformations_function = [np.log, np.sqrt, first_order_diff, second_order_diff]


def transform_view(df, model="", transform_name="", key="transform"):
    """Apply the sidebar-selected chain of transformations to ``df["data"]``.

    The transformed series is written into column ``"Transformed" + model``.
    Each successfully applied step appends a token (``log_``, ``sqrt_``,
    ``diff-<lags>_``, ``diff2_``) to ``transform_name``; the trailing
    underscore is stripped before returning.

    Returns the (mutated) dataframe and the accumulated transform name.
    NOTE: ``key`` is currently unused — kept for interface compatibility.
    """
    transf_col = "Transformed" + model
    transformations = [t.__name__ for t in transformations_function]
    selected_transformations = sider.multiselect("Data Transfomations", transformations)
    # Start from the raw series; transformations are applied in the order
    # the user selected them.
    df[transf_col] = df["data"]
    for transformation in selected_transformations:
        if transformation == np.log.__name__:
            try:
                df[transf_col] = np.log(df[transf_col])
                transform_name = transform_name + "log_"
            except Exception:
                st.error(f"Unable to apply {np.log.__name__}")
        elif transformation == np.sqrt.__name__:
            try:
                df[transf_col] = np.sqrt(df[transf_col])
                transform_name = transform_name + "sqrt_"
            except Exception:
                st.error(f"Unable to apply {np.sqrt.__name__}")
        elif transformation == "first_order_diff":
            try:
                df[transf_col], lags = first_order_diff(df[transf_col])
                transform_name = transform_name + f"diff-{lags}_"
            except Exception:
                st.error(f"Unable to apply {transformation}")
        elif transformation == "second_order_diff":
            try:
                df[transf_col] = second_order_diff(df[transf_col])
                transform_name = transform_name + "diff2_"
            except Exception:
                st.error(f"Unable to apply {transformation}")
    # Drop the trailing "_" (no-op when no transformation was selected).
    transform_name = transform_name[:-1]
    return df, transform_name


df, transform_name = transform_view(df)

# ----------------------------------------------------------------- Graphics
st.subheader("Visualize your Time Series")
original, decomp, transforms, smoothing, pacf_acf = st.tabs(
    ["ORIGINAL DATA", "DECOMPOSITION", "TRANSFORMATIONS", "SMOOTHING", "PACF AND ACF"])

# Number of differencings needed for stationarity (values not used below).
d, df_diff = numberOfDiff(df.data)

with original:
    fig1 = plotTs(df["data"].to_frame())
    st.plotly_chart(fig1)

with decomp:
    fig2 = plotDecompse(df["data"].to_frame())
    st.plotly_chart(fig2)

with transforms:
    transf_fig = plotTransformation(df["data"], df["Transformed"].dropna(),
                                    transfom_name=transform_name)
    st.plotly_chart(transf_fig)
    try:
        decision = isStatinary(df["Transformed"].dropna())
        # NOTE(review): the original markup of this banner was mangled in
        # the source; reconstructed as a plain multi-line markdown string.
        st.markdown(
            f"""

Stationarity : {decision}

""",
            unsafe_allow_html=True,
        )
    except Exception:
        pass

with smoothing:
    smoothing_type = st.selectbox("Smoothing Type",
                                  options=["Moving Average", "Exponential Average"])
    if smoothing_type == "Exponential Average":
        ewm_fig = plotTs(apply_exponential_average(df["data"]).to_frame())
        st.plotly_chart(ewm_fig)
    else:
        ma_fig = plotTs(apply_moving_average(df["data"]).to_frame())
        st.plotly_chart(ma_fig)

with pacf_acf:
    choice = st.selectbox("Plot PACF an ACF of :", options=['Original', 'Transformed'])
    fig2, (ax1, ax2) = plt.subplots(nrows=2, ncols=1)
    if choice == "Original":
        plot_acf(df["data"], ax=ax1)
        plot_pacf(df["data"], ax=ax2)
    else:
        plot_acf(df[choice].dropna(), ax=ax1)
        plot_pacf(df[choice].dropna(), ax=ax2)
    st.pyplot(fig2)


def _train_test_split(frame):
    """Chronological 80/20 split (no shuffling — order matters for TS)."""
    cut = int(0.8 * len(frame))
    return frame.iloc[0:cut, :], frame.iloc[cut:, :]


def _select_pointwise_transform(target_col, inv_key, widget_key, label="Apply "):
    """Sidebar selectbox for a point-wise transform (None / log / sqrt).

    Writes the transformed series into ``df[target_col]`` and records the
    matching inverse in ``model_inv[inv_key]`` so error metrics can be
    computed back on the original scale. On failure, falls back to the raw
    data with an identity inverse.
    """
    choice = sider.selectbox(label, options=["None", "log", "sqrt"],
                             index=0, key=widget_key)
    if choice == "log":
        try:
            df[target_col] = np.log(df["data"])
            model_inv[inv_key] = np.exp
            return
        except Exception:
            st.error("Unable to apply log, use the default data")
    elif choice == "sqrt":
        try:
            df[target_col] = np.sqrt(df["data"])
            model_inv[inv_key] = np.square
            return
        except Exception:
            st.error("Unable to apply sqrt, use the default data")
    # "None" or fallback after a failed transform.
    df[target_col] = df["data"]
    model_inv[inv_key] = lambda x: x


# Split the df
train, test = _train_test_split(df)

# --------------------------------------------------------- Model selection
options = sider.multiselect('Models', options=["ARIMA", "SARIMA", "XGBoost"])
models = {}
model_inv = {}

for option in options:
    if option == "ARIMA":
        sider.divider()
        sider.subheader("ARIMA")
        _select_pointwise_transform("X_ARIMA", "ARIMA", "arima")
        # Re-split so train/test carry the freshly added X_ARIMA column.
        train, test = _train_test_split(df)
        p_range_arima = sider.slider("P Range", min_value=0, max_value=30,
                                     value=[0, 0], key="arima_p_range")
        q_range_arima = sider.slider("Q Range", min_value=0, max_value=30,
                                     value=[0, 0], key="arima_q_range")
        arima_params_dict = {
            "start_p": p_range_arima[0],
            "start_q": q_range_arima[0],
            "max_p": p_range_arima[1],
            "max_q": q_range_arima[1],
        }
    elif option == "SARIMA":
        sider.subheader("SARIMA")
        _select_pointwise_transform("X_SARIMA", "SARIMA", "sarima")
        train, test = _train_test_split(df)
        m = sider.slider("seasonal period m ", min_value=0, max_value=30, value=12)
        p_range = sider.slider("p Range", min_value=0, max_value=30,
                               value=[0, 0], key="sarima_p_range")
        q_range = sider.slider("q Range", min_value=0, max_value=30,
                               value=[0, 0], key="sarima_q_range")
        P_range = sider.slider("P Range", min_value=0, max_value=30, value=[0, 0])
        Q_range = sider.slider("Q Range", min_value=0, max_value=30, value=[0, 0])
        sarima_params_dict = {
            "seasonal": True,
            "m": m,
            "start_p": p_range[0],
            "start_q": q_range[0],
            "max_p": p_range[1],
            "max_q": q_range[1],
            "start_P": P_range[0],
            "start_Q": Q_range[0],
            "max_P": P_range[1],
            # BUG FIX: the original used q_range[1] (non-seasonal q) here.
            "max_Q": Q_range[1],
        }
    elif option == "XGBoost":
        sider.subheader(option)
        _select_pointwise_transform("X_XGBoost_1", "XGBoost", "xgboost_f1",
                                    label="Apply First Transformation")
        f2 = sider.selectbox("Apply Second Tronsformation",
                             options=["None", "first_order_diff"],
                             index=0, key="xgboost_2")
        if f2 == "None":
            # BUG FIX: the original reset model_inv["XGBoost"] to identity
            # here, discarding the log/sqrt inverse chosen above.
            df["X_XGBoost"] = df["X_XGBoost_1"]
        elif f2 == "first_order_diff":
            try:
                df["X_XGBoost"], xg_lags = first_order_diff(df["X_XGBoost_1"].bfill())
            except Exception:
                st.error("Unable to apply first_order diff, use the default data")
        train, test = _train_test_split(df)
        max_depth = sider.slider("Max Depth", min_value=1, max_value=30, value=5)
        lags = sider.slider("Lags features", min_value=1, max_value=30, value=5)
        learning_rate = sider.number_input(label="Learning Rate ", min_value=0.0001,
                                           max_value=0.75, step=0.01, value=0.01)
        n_estimators = sider.number_input(label="n_estimator ", min_value=100,
                                          max_value=5000, step=100, value=1000)
        X_train, y_train = create_features(train, lags=lags, feature_col="X_XGBoost")
        # Prepend the last `lags` training rows so the first test rows have
        # complete lag features.
        # NOTE(review): this concatenates a DataFrame with a Series, as in
        # the original — verify create_features handles the mixed result.
        X_test, y_test = create_features(
            pd.concat([train.iloc[-lags:, :], test["X_XGBoost"]]),
            lags=lags, feature_col="X_XGBoost")

# ----------------------------------------------------------------- Training
fit = sider.button("Train Models")
if fit:
    for option in options:
        if option == "ARIMA":
            st.subheader("ARIMA")
            result_arima = auto_arima(train["X_ARIMA"],
                                      start_p=arima_params_dict["start_p"],
                                      start_q=arima_params_dict["start_q"],
                                      max_q=arima_params_dict["max_q"],
                                      max_p=arima_params_dict["max_p"])
            arimax_pred, conf = sarimax_forecast(result_arima, steps=len(test))
            conf_int_arima = pd.DataFrame(conf, index=test.index,
                                          columns=['lower data', "upper data"])
            # BUG FIX: the original stored a stale `result` placeholder
            # (always None) instead of the fitted model.
            models[option] = (result_arima, conf_int_arima)
            test[option] = arimax_pred
        elif option == "SARIMA":
            st.subheader("SARIMA")
            result_sarima = auto_arima(train["X_SARIMA"], **sarima_params_dict)
            sarimax_pred, conf = sarimax_forecast(result_sarima, steps=len(test))
            conf_int_sarima = pd.DataFrame(conf, index=test.index,
                                           columns=['lower data', "upper data"])
            # BUG FIX: same stale-placeholder issue as ARIMA above.
            models[option] = (result_sarima, conf_int_sarima)
            test[option] = sarimax_pred
        elif option == "XGBoost":
            # BUG FIX: the original header wrongly said "SARIMA".
            st.subheader("XGBoost")
            model_xgb = xgboost(X_train, y_train, max_depth=max_depth,
                                learning_rate=learning_rate,
                                n_estimators=n_estimators)
            test[option] = model_xgb.predict(X_test)
            xgb_fig, xgb_ax = plt.subplots()
            xgb.plot_importance(model_xgb, ax=xgb_ax, max_num_features=5)
            models[option] = (model_xgb, None)
            st.pyplot(xgb_fig)

# --------------------------------------------------------------- Prediction
st.divider()
st.subheader("Prediction")
if options:
    pred_tabs = st.tabs(options)
    for idx, option in enumerate(options):
        if option in test.columns:
            # Confidence intervals exist only for the statistical models.
            c = None
            if option == "ARIMA":
                c = conf_int_arima
            if option == "SARIMA":
                c = conf_int_sarima
            with pred_tabs[idx]:
                fig = plotForcast(df[f"X_{option}"], test[option], confint=c)
                st.plotly_chart(fig)

# -------------------------------------------------------------- Model error
errors = {"Model": [], "Type": [], "error": []}
metric_labels = ["MAE", "MAPE", "RMSE"]
for option in options:
    if option in test.columns:
        if option == "XGBoost" and f2 == "first_order_diff":
            # Undo the differencing first, then the point-wise transform,
            # so errors are measured on the original scale.
            first = test["X_XGBoost_1"].iloc[:xg_lags].values
            inv = model_inv["XGBoost"](
                inverse_first_order_diff(test[option], xg_lags, first))
        else:
            inv = model_inv[option](test[option])
        mae = mean_absolute_error(test["data"], inv)
        mape = mean_absolute_percentage_error(test["data"], inv)
        rmse = root_mean_squared_error(test["data"], inv)
        errors["Model"].extend([option] * len(metric_labels))
        errors["Type"].extend(metric_labels)
        errors["error"].extend([mae, mape, rmse])

if fit:
    st.divider()
    st.subheader("Compare Models Errors")
    errors_df = pd.DataFrame(errors)
    # MAPE is plotted separately because its scale (a ratio) differs from
    # MAE/RMSE (data units).
    fig = px.bar(errors_df[errors_df["Type"].isin(["MAE", "RMSE"])],
                 y="error", x="Type", color="Model", barmode="group")
    fig2 = px.bar(errors_df[errors_df["Type"].isin(["MAPE"])],
                  y="error", x="Type", color="Model", barmode="group")
    st.plotly_chart(fig)
    st.plotly_chart(fig2)