autodidacte228 committed on
Commit
91839e1
1 Parent(s): 82f2020

SARIMA and XGBoost bugs fixed

Browse files
Files changed (4) hide show
  1. app.py +208 -85
  2. models/arima.py +17 -12
  3. utils/dataprocess.py +19 -5
  4. utils/graphics.py +1 -1
app.py CHANGED
@@ -6,10 +6,11 @@ import statsmodels.api as sm
6
  import matplotlib.pyplot as plt
7
  from utils.dataprocess import (load_data, df_col, numberOfDiff,create_features,
8
  apply_moving_average, apply_exponential_average,
9
- first_order_diff, second_order_diff, isStatinary)
 
10
  from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
11
  from utils.graphics import plotDecompse, plotTs, plotForcast, plotTransformation
12
- from models.arima import SARIMAXGridSearch, valid_model, sarimax_forecast
13
  from utils.constants import DEFAULT_DATASETS_DIR, FREQ_DICT
14
  import xgboost as xgb
15
  from models.xgboost import xgboost
@@ -63,42 +64,46 @@ df_copy = df.copy()
63
  # Data Transformation
64
  # Sélection des transformations dans l'ordre
65
  sider.divider()
 
66
  transformations_function = [np.log,np.sqrt, first_order_diff, second_order_diff]
67
- transformations = [t.__name__ for t in transformations_function]
68
- selected_transformations = sider.multiselect("Data Transfomations", transformations)
69
-
70
- # Apply transformations
71
-
72
- df["Transformed"] = df["data"]
73
- transform_name = ""
74
- for transformation in selected_transformations:
75
- if transformation == np.log.__name__:
76
- try:
77
- df["Transformed"] = np.log(df["Transformed"])
78
- transform_name = transform_name + "log_"
79
- except:
80
- st.error(f"Unable to apply {np.log.__name__}")
81
- elif transformation == np.sqrt.__name__:
82
- try:
83
- df["Transformed"] = np.sqrt(df["Transformed"])
84
- transform_name = transform_name + "sqrt_"
85
- except:
86
- st.error(f"Unable to apply {np.sqrt.__name__}")
87
-
88
- elif transformation == "first_order_diff":
89
- try:
90
- df["Transformed"] = first_order_diff(df["Transformed"])
91
- transform_name = transform_name + "diff1_"
92
- except:
93
- st.error(f"Unable to apply {transformation}")
94
- elif transformation == "second_order_diff":
95
- try:
96
- df["Transformed"] = second_order_diff(df["Transformed"])
97
- transform_name = transform_name + "diff2_"
98
- except:
99
- st.error(f"Unable to apply {transformation}")
100
- transform_name = transform_name[:-1]
101
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  # Graphics
103
  st.subheader("Visualize your Time Series")
104
 
@@ -136,12 +141,10 @@ with pacf_acf:
136
  plot_acf(df["data"], ax=ax1)
137
  plot_pacf(df["data"], ax=ax2)
138
  else:
139
- plot_acf(df[choice], ax=ax1)
140
- plot_pacf(df[choice], ax=ax2)
141
  st.pyplot(fig2)
142
-
143
- sider.divider()
144
-
145
  # Split the df
146
  train_size = int(0.8*len(df))
147
  train = df.iloc[0:train_size,:]
@@ -152,55 +155,165 @@ test = df.iloc[train_size:, :]
152
  options = sider.multiselect(
153
  'Models',options=["ARIMA", "SARIMA", "XGBoost"])
154
  models = {}
 
155
  for option in options:
156
  if option == "ARIMA":
157
- st.divider()
158
  sider.divider()
159
  sider.subheader("ARIMA")
160
- st.subheader("ARIMA")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  result = None
162
- p_range = sider.slider("P Range",min_value=0, max_value=30, value=[0, 0])
163
- q_range = sider.slider("Q Range",min_value=0, max_value=30, value=[0, 0])
164
- ps = range(p_range[0], p_range[1]+1)
165
- ds = range(d, d+1)
166
- qs = range(q_range[0], q_range[1]+1)
167
- if sider.toggle("Train ARIMA"):
168
- result, best_score, best_param = SARIMAXGridSearch.search(train, ps, ds, qs)
169
- st.write(result, best_score, best_param)
170
- sarimax_pred, conf_int = sarimax_forecast(result, steps=len(test))
171
- models[option] = (result, conf_int)
172
- test[option] = sarimax_pred
173
- if result:
174
- with st.expander("Model Diagnostics"):
175
- st.write(result.plot_diagnostics())
176
- with st.expander("Model Validation"):
177
- lb = float(result.summary().tables[2].data[1][1])
178
- jb = float(result.summary().tables[2].data[1][3])
179
- st.markdown(f'<h3>Ljung-Box Test : {lb}</h3>', unsafe_allow_html=True)
180
- st.markdown(f'<h3>Jarque-Box Test : {jb}</h3>', unsafe_allow_html=True)
181
- color, label = ("green", "Validate") if valid_model(lb, jb) else ("red", "Reject")
182
- st.markdown(f'<h3 style="color:{color};">Decision : {label}</h3>', unsafe_allow_html=True)
183
-
184
- if option == "SARIMA":
185
- continue
186
- if option == "XGBoost":
187
- st.divider()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
  sider.subheader(option)
189
- st.subheader(option)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
  max_depth= sider.slider("Max Depth",min_value=1, max_value=30, value=5)
191
  lags= sider.slider("Lags features",min_value=1, max_value=30, value=5)
192
  learning_rate= sider.number_input(label="Learning Rate ", min_value=0.0001, max_value=0.75, step=0.01, value=0.01)
193
  n_estimators= sider.number_input(label="n_estimator ", min_value=100, max_value=5000, step=100, value=1000)
194
- X_train, y_train = create_features(train,lags=lags)
195
- X_test, y_test = create_features(pd.concat([train.iloc[-lags:,:], test["data"]]), lags=lags)
196
- if sider.toggle("Train XGBoost"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  model_xgb = xgboost(X_train, y_train, max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators)
198
  test[option] = model_xgb.predict(X_test)
199
  xgb_fig, xgb_ax = plt.subplots()
200
- xgb.plot_importance(model_xgb, ax = xgb_ax)
201
  models[option] = (model_xgb, None)
202
  st.pyplot(xgb_fig)
203
 
 
204
  st.divider()
205
  st.subheader("Prediction")
206
 
@@ -211,33 +324,43 @@ for idx, option in enumerate(options):
211
  if option in test.columns:
212
  c = None
213
  if option=="ARIMA":
214
- c = conf_int
 
 
215
  with pred_tabs[idx]:
216
- fig = plotForcast(df, test[option], confint=c)
217
  st.plotly_chart(fig)
218
 
219
 
220
  # Model error
221
  errors = {}
222
- metric_labels = ["MAE", "MAPE", "RMSE"]
223
  errors["Model"] = []
224
  errors["Type"] = []
225
  errors["error"] = []
226
 
227
  for option in options:
228
  if option in test.columns:
229
- mae = mean_absolute_error(test["data"], test[option])
230
- mape = mean_absolute_percentage_error(test["data"], test[option])
231
- rmse = root_mean_squared_error(test["data"], test[option])
 
 
 
 
 
 
232
  errors["Model"].extend([option]*len(metric_labels))
233
  errors["Type"].extend(metric_labels)
234
- errors["error"].extend([mae, mape, rmse])
235
- sider.divider()
236
 
237
- if sider.toggle("Compare Models"):
238
  st.divider()
239
  st.subheader("Compare Models Errors")
240
  errors_df = pd.DataFrame(errors)
241
- erro_fig, ax = plt.subplots()
242
- sns.barplot(data = errors_df, y="error", x = "Type",hue="Model")
 
 
 
243
  st.pyplot(erro_fig)
 
6
  import matplotlib.pyplot as plt
7
  from utils.dataprocess import (load_data, df_col, numberOfDiff,create_features,
8
  apply_moving_average, apply_exponential_average,
9
+ first_order_diff, second_order_diff, isStatinary,
10
+ inverse_first_order_diff)
11
  from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
12
  from utils.graphics import plotDecompse, plotTs, plotForcast, plotTransformation
13
+ from models.arima import auto_arima, sarimax_forecast
14
  from utils.constants import DEFAULT_DATASETS_DIR, FREQ_DICT
15
  import xgboost as xgb
16
  from models.xgboost import xgboost
 
64
  # Data Transformation
65
  # Sélection des transformations dans l'ordre
66
  sider.divider()
67
+
68
  transformations_function = [np.log,np.sqrt, first_order_diff, second_order_diff]
69
def transform_view(df, model="", transform_name="", key="transform"):
    """Render the transformation picker in the sidebar and apply the choices.

    The selected transformations are applied, in the order returned by the
    multiselect, to a copy of ``df["data"]`` stored in the column
    ``"Transformed" + model``.

    Args:
        df: DataFrame holding the raw series in its ``"data"`` column. It is
            modified in place (the transformed column is added or replaced).
        model: Suffix appended to the name of the transformed column.
        transform_name: Prefix of the label describing the applied
            transformations.
        key: Streamlit widget key for the multiselect, so that several
            transformation pickers can coexist on the same page.

    Returns:
        A ``(df, transform_name)`` tuple, where ``transform_name`` is the
        prefix followed by one ``_``-separated tag per transformation that
        was applied successfully.
    """
    transf_col = "Transformed" + model

    transformations = [t.__name__ for t in transformations_function]
    # The key was previously accepted but never forwarded to the widget.
    selected_transformations = sider.multiselect(
        "Data Transfomations", transformations, key=key
    )

    # Start from the raw data and stack the transformations on top of it.
    df[transf_col] = df["data"]
    for transformation in selected_transformations:
        if transformation == np.log.__name__:
            try:
                df[transf_col] = np.log(df[transf_col])
                transform_name = transform_name + "log_"
            except Exception:
                st.error(f"Unable to apply {np.log.__name__}")
        elif transformation == np.sqrt.__name__:
            try:
                df[transf_col] = np.sqrt(df[transf_col])
                transform_name = transform_name + "sqrt_"
            except Exception:
                st.error(f"Unable to apply {np.sqrt.__name__}")
        elif transformation == "first_order_diff":
            try:
                # first_order_diff returns the differenced series and the lag used.
                df[transf_col], lags = first_order_diff(df[transf_col])
                transform_name = transform_name + f"diff-{lags}_"
            except Exception:
                st.error(f"Unable to apply {transformation}")
        elif transformation == "second_order_diff":
            try:
                df[transf_col] = second_order_diff(df[transf_col])
                transform_name = transform_name + "diff2_"
            except Exception:
                st.error(f"Unable to apply {transformation}")

    # Drop only the trailing separator; a bare prefix must be left untouched.
    if transform_name.endswith("_"):
        transform_name = transform_name[:-1]
    return df, transform_name
106
+ df, transform_name = transform_view(df)
107
  # Graphics
108
  st.subheader("Visualize your Time Series")
109
 
 
141
  plot_acf(df["data"], ax=ax1)
142
  plot_pacf(df["data"], ax=ax2)
143
  else:
144
+ plot_acf(df[choice].dropna(), ax=ax1)
145
+ plot_pacf(df[choice].dropna(), ax=ax2)
146
  st.pyplot(fig2)
147
+
 
 
148
  # Split the df
149
  train_size = int(0.8*len(df))
150
  train = df.iloc[0:train_size,:]
 
155
  options = sider.multiselect(
156
  'Models',options=["ARIMA", "SARIMA", "XGBoost"])
157
  models = {}
158
+ model_inv = {}
159
  for option in options:
160
  if option == "ARIMA":
 
161
  sider.divider()
162
  sider.subheader("ARIMA")
163
+ func = sider.selectbox("Apply ", options=["None", "log", "sqrt"], index= 0, key="arima")
164
+ if func == "None":
165
+ df["X_ARIMA"]= df["data"]
166
+ model_inv["ARIMA"] = lambda x :x
167
+ elif func == "log":
168
+ try:
169
+ df["X_ARIMA"] = np.log(df["data"])
170
+ model_inv["ARIMA"] = np.exp
171
+ except:
172
+ st.error("Unable to apply log, use the default data")
173
+ df["X_ARIMA"]= df["data"]
174
+ model_inv["ARIMA"] = lambda x :x
175
+ elif func == "sqrt":
176
+ try:
177
+ df["X_ARIMA"] = np.sqrt(df["data"])
178
+ model_inv["ARIMA"] = np.square
179
+ except:
180
+ st.error("Unable to apply sqrt, use the default data")
181
+ df["X_ARIMA"]= df["data"]
182
+ model_inv["ARIMA"] = lambda x :x
183
+ train_size = int(0.8*len(df))
184
+ train = df.iloc[0:train_size,:]
185
+ test = df.iloc[train_size:, :]
186
  result = None
187
+ p_range_arima = sider.slider("P Range",min_value=0, max_value=30, value=[0, 0], key="arima_p_range")
188
+ q_range_arima = sider.slider("Q Range",min_value=0, max_value=30, value=[0, 0], key="arima_q_range")
189
+ arima_params_dict ={
190
+ "start_p" : p_range_arima[0],
191
+ "start_q" : q_range_arima[0],
192
+ "max_p" : p_range_arima[1],
193
+ "max_q" : q_range_arima[1]
194
+ }
195
+ elif option == "SARIMA" :
196
+ sider.subheader("SARIMA")
197
+ func = sider.selectbox("Apply ", options=["None", "log", "sqrt"], index= 0, key="sarima")
198
+ if func == "None":
199
+ df["X_SARIMA"]= df["data"]
200
+ model_inv["SARIMA"] = lambda x :x
201
+ elif func == "log":
202
+ try:
203
+ df["X_SARIMA"] = np.log(df["data"])
204
+ model_inv["SARIMA"] = np.exp
205
+ except:
206
+ st.error("Unable to apply log, use the default data")
207
+ df["X_SARIMA"]= df["data"]
208
+ model_inv["SARIMA"] = lambda x :x
209
+ elif func == "sqrt":
210
+ try:
211
+ df["X_SARIMA"] = np.sqrt(df["data"])
212
+ model_inv["SARIMA"] = np.square
213
+ except:
214
+ st.error("Unable to apply sqrt, use the default data")
215
+ df["X_SARIMA"]= df["data"]
216
+ model_inv["SARIMA"] = lambda x :x
217
+ train_size = int(0.8*len(df))
218
+ train = df.iloc[0:train_size,:]
219
+ test = df.iloc[train_size:, :]
220
+ result = None
221
+ m = sider.slider("seasonal period m ",min_value=0, max_value=30, value=12)
222
+ p_range = sider.slider("p Range",min_value=0, max_value=30, value=[0, 0], key="sarima_p_range")
223
+ q_range = sider.slider("q Range",min_value=0, max_value=30, value=[0, 0], key="sarima_q_range")
224
+ P_range = sider.slider("P Range",min_value=0, max_value=30, value=[0, 0])
225
+ Q_range = sider.slider("Q Range",min_value=0, max_value=30, value=[0, 0])
226
# Keyword arguments forwarded to auto_arima for the seasonal model.
# Lower-case p/q bounds come from the non-seasonal sliders, upper-case P/Q
# bounds from the seasonal sliders.
sarima_params_dict = {
    "seasonal": True,
    "m": m,
    "start_p": p_range[0],
    "start_q": q_range[0],
    "max_p": p_range[1],
    "max_q": q_range[1],
    "start_P": P_range[0],
    "start_Q": Q_range[0],
    "max_P": P_range[1],
    # Seasonal bound: must come from Q_range, not the non-seasonal q_range.
    "max_Q": Q_range[1],
}
238
+ elif option == "XGBoost":
239
+
240
  sider.subheader(option)
241
+ f1 = sider.selectbox("Apply First Transformation", options=["None", "log", "sqrt"], index= 0, key="xgboost_f1")
242
+ if f1 == "None":
243
+ df["X_XGBoost_1"]= df["data"]
244
+ model_inv["XGBoost"] = lambda x :x
245
+ elif f1 == "log":
246
+ try:
247
+ df["X_XGBoost_1"] = np.log(df["data"])
248
+ model_inv["XGBoost"] = np.exp
249
+ except:
250
+ st.error("Unable to apply log, use the default data")
251
+ df["X_XGBoost_1"]= df["data"]
252
+ model_inv["XGBoost"] = lambda x :x
253
+ elif f1 == "sqrt":
254
+ try:
255
+ df["X_XGBoost_1"] = np.sqrt(df["data"])
256
+ model_inv["XGBoost"] = np.square
257
+ except:
258
+ st.error("Unable to apply sqrt, use the default data")
259
+ df["X_XGBoost_1"]= df["data"]
260
+ model_inv["XGBoost"] = lambda x :x
261
# Optional second transformation for XGBoost: lag differencing applied on top
# of the first transformation stored in df["X_XGBoost_1"].
f2 = sider.selectbox("Apply Second Tronsformation", options=["None","first_order_diff"], index= 0, key="xgboost_2")
if f2 == "first_order_diff":
    try:
        # bfill() fills NaNs from the next valid value before differencing.
        df["X_XGBoost"], xg_lags = first_order_diff(df["X_XGBoost_1"].bfill())
    except Exception:
        st.error("Unable to apply first_order diff, use the default data")
        # Really fall back to the undifferenced data: the later error
        # computation checks f2 and needs xg_lags only when differencing
        # was actually applied.
        f2 = "None"
if f2 == "None":
    # Keep the inverse registered for the first transformation in model_inv;
    # resetting it to identity here would score log/sqrt predictions on the
    # wrong scale.
    df["X_XGBoost"] = df["X_XGBoost_1"]
273
+ train_size = int(0.8*len(df))
274
+ train = df.iloc[0:train_size,:]
275
+ test = df.iloc[train_size:, :]
276
  max_depth= sider.slider("Max Depth",min_value=1, max_value=30, value=5)
277
  lags= sider.slider("Lags features",min_value=1, max_value=30, value=5)
278
  learning_rate= sider.number_input(label="Learning Rate ", min_value=0.0001, max_value=0.75, step=0.01, value=0.01)
279
  n_estimators= sider.number_input(label="n_estimator ", min_value=100, max_value=5000, step=100, value=1000)
280
+
281
+ X_train, y_train = create_features(train,lags=lags, feature_col="X_XGBoost")
282
+
283
+ X_test, y_test = create_features(pd.concat([train.iloc[-lags:,:], test["X_XGBoost"]]), lags=lags, feature_col="X_XGBoost")
284
+
285
+ fit = sider.button("Train Models")
286
if fit:
    # Fit every selected model on the training split, forecast over the test
    # horizon, and keep the fitted model with its confidence interval (None
    # for XGBoost) in `models`.
    for option in options:
        if option == "ARIMA":
            st.subheader("ARIMA")
            result_arima = auto_arima(
                train["X_ARIMA"],
                start_p=arima_params_dict["start_p"],
                start_q=arima_params_dict["start_q"],
                max_q=arima_params_dict["max_q"],
                max_p=arima_params_dict["max_p"],
            )
            arimax_pred, conf = sarimax_forecast(result_arima, steps=len(test))
            conf_int_arima = pd.DataFrame(conf, index=test.index, columns=['lower data', "upper data"])
            # Store the fitted model, not the `result` placeholder (always None).
            models[option] = (result_arima, conf_int_arima)
            test[option] = arimax_pred
        elif option == "SARIMA":
            st.subheader("SARIMA")
            result_sarima = auto_arima(train["X_SARIMA"], **sarima_params_dict)
            sarimax_pred, conf = sarimax_forecast(result_sarima, steps=len(test))
            conf_int_sarima = pd.DataFrame(conf, index=test.index, columns=['lower data', "upper data"])
            # Store the fitted model, not the `result` placeholder (always None).
            models[option] = (result_sarima, conf_int_sarima)
            test[option] = sarimax_pred
        elif option == "XGBoost":
            # This section was previously mislabelled "SARIMA".
            st.subheader("XGBoost")
            model_xgb = xgboost(X_train, y_train, max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators)
            test[option] = model_xgb.predict(X_test)
            xgb_fig, xgb_ax = plt.subplots()
            # Show only the five most important features.
            xgb.plot_importance(model_xgb, ax=xgb_ax, max_num_features=5)
            models[option] = (model_xgb, None)
            st.pyplot(xgb_fig)
315
 
316
+
317
  st.divider()
318
  st.subheader("Prediction")
319
 
 
324
  if option in test.columns:
325
  c = None
326
  if option=="ARIMA":
327
+ c = conf_int_arima
328
+ if option=="SARIMA":
329
+ c = conf_int_sarima
330
  with pred_tabs[idx]:
331
+ fig = plotForcast(df[f"X_{option}"], test[option], confint=c)
332
  st.plotly_chart(fig)
333
 
334
 
335
  # Model error
336
  errors = {}
337
+ metric_labels = ["MAE","MAPE", "RMSE"]
338
  errors["Model"] = []
339
  errors["Type"] = []
340
  errors["error"] = []
341
 
342
  for option in options:
343
  if option in test.columns:
344
+ if option == "XGBoost" and f2 == "first_order_diff":
345
+ first = test["X_XGBoost_1"].iloc[:xg_lags].values
346
+ inv = model_inv["XGBoost"](
347
+ inverse_first_order_diff(test[option], xg_lags,first))
348
+ else:
349
+ inv = model_inv[option](test[option])
350
+ mae = mean_absolute_error(test["data"],inv )
351
+ mape = mean_absolute_percentage_error(test["data"], inv)
352
+ rmse = root_mean_squared_error(test["data"], inv)
353
  errors["Model"].extend([option]*len(metric_labels))
354
  errors["Type"].extend(metric_labels)
355
+ errors["error"].extend([mae,mape, rmse])
 
356
 
357
+ if fit:
358
  st.divider()
359
  st.subheader("Compare Models Errors")
360
  errors_df = pd.DataFrame(errors)
361
+ erro_fig, ax= plt.subplots(nrows=2)
362
+ sns.barplot(data = errors_df[errors_df["Type"].isin(["MAE", "RMSE"])],
363
+ y="error", x = "Type",hue="Model", ax=ax[0], width=0.4)
364
+ sns.barplot(data = errors_df[errors_df["Type"].isin(["MAPE"])],
365
+ y="error", x = "Type",hue="Model", width=0.2,ax=ax[1])
366
  st.pyplot(erro_fig)
models/arima.py CHANGED
@@ -4,6 +4,7 @@ import statsmodels.api as sm
4
  from itertools import product
5
  from scipy import stats
6
  from stqdm import stqdm
 
7
  """
8
  @st.cache_data
9
  def gridSearch(endog, order_ls:list, d= 0):
@@ -60,22 +61,26 @@ def valid_model(lb, jb):
60
 
61
  #@st.cache_data
62
  def sarimax_forecast(model, steps):
63
- forecat = model.get_forecast(steps=steps)
64
- predicted = forecat.predicted_mean
65
- confint = forecat.conf_int()
66
- return predicted, confint
67
 
68
  @st.cache_data
69
- def auto_arima():
70
- model = pm.auto_arima(data_actual,
71
- m=12, # frequency of series
 
72
  seasonal=seasonal, # TRUE if seasonal series
73
- d=None, # let model determine 'd'
74
  test='adf', # use adftest to find optimal 'd'
75
- start_p=0, start_q=0, # minimum p and q
76
- max_p=12, max_q=12, # maximum p and q
77
- D=None, # let model determine 'D'
 
 
 
 
78
  trace=True,
79
  error_action='ignore',
80
- suppress_warnings=True,
81
  stepwise=True)
 
4
  from itertools import product
5
  from scipy import stats
6
  from stqdm import stqdm
7
+ import pmdarima as pm
8
  """
9
  @st.cache_data
10
  def gridSearch(endog, order_ls:list, d= 0):
 
61
 
62
  #@st.cache_data
63
  def sarimax_forecast(model, steps):
64
+ forecat, confint = model.predict(n_periods=steps, return_conf_int=True)
65
+
66
+ return forecat, confint
 
67
 
68
@st.cache_data
def auto_arima(endog, m=0, seasonal=False, d=None,D=None, start_p=0, start_q=0, start_P=0, start_Q=0, max_p=12, max_q=12, max_P=0, max_Q=0):
    """Run pmdarima's stepwise ``auto_arima`` search on ``endog``.

    Thin cached wrapper around ``pm.auto_arima``. The search always uses the
    ADF test (``test='adf'``), a stepwise search, ``trace=True``,
    ``error_action='ignore'`` and ``suppress_warnings=True``.

    Args:
        endog: Series to fit.
        m: Seasonal period, forwarded as ``m``.
        seasonal: Whether to search seasonal models.
        d: Differencing order, forwarded as ``d``.
        D: Seasonal differencing order, forwarded as ``D``.
        start_p, start_q: Lower bounds of the non-seasonal AR / MA orders.
        start_P, start_Q: Lower bounds of the seasonal AR / MA orders.
        max_p, max_q: Upper bounds of the non-seasonal AR / MA orders.
        max_P, max_Q: Upper bounds of the seasonal AR / MA orders.

    Returns:
        Whatever ``pm.auto_arima`` returns for the search.
    """
    return pm.auto_arima(
        endog,
        m=m,  # frequency of series
        seasonal=seasonal,  # True if seasonal series
        d=d,  # differencing order as given by the caller
        test='adf',  # use adftest to find optimal 'd'
        start_p=start_p, start_q=start_q,  # minimum p and q
        max_p=max_p, max_q=max_q,  # maximum p and q
        start_P=start_P,
        start_Q=start_Q,
        max_P=max_P,
        max_Q=max_Q,
        D=D,  # seasonal differencing order as given by the caller
        trace=True,
        error_action='ignore',
        suppress_warnings=True,
        stepwise=True,
    )
utils/dataprocess.py CHANGED
@@ -32,11 +32,12 @@ def numberOfDiff(y):
32
  d = d+1
33
  return d, diff
34
 
35
- @st.cache_data
36
- def create_features(df, lags = 1):
37
  """
38
  Creates time series features from datetime index
39
  """
 
40
  df['date'] = df.index
41
  df['hour'] = df['date'].dt.hour
42
  df['dayofweek'] = df['date'].dt.dayofweek
@@ -46,8 +47,8 @@ def create_features(df, lags = 1):
46
  df['dayofyear'] = df['date'].dt.dayofyear
47
  df['dayofmonth'] = df['date'].dt.day
48
  for i in range(1, lags):
49
- df[f'lag_{i}'] = df["data"].shift(i)
50
- df["target"] = df["data"].shift(lags)
51
  df.dropna(inplace=True)
52
  y = df[["target"]]
53
  X = df.drop(columns=["target", "date"])
@@ -72,6 +73,19 @@ def apply_exponential_average(data):
72
 
73
  def first_order_diff(df):
74
  lags = st.sidebar.slider("lags", min_value=1, max_value=30, value=1)
75
- return df.diff(lags)
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  def second_order_diff(df):
77
  return df.diff().diff()
 
32
  d = d+1
33
  return d, diff
34
 
35
+
36
+ def create_features(df, lags = 1, feature_col="data"):
37
  """
38
  Creates time series features from datetime index
39
  """
40
+ df =df[feature_col].to_frame()
41
  df['date'] = df.index
42
  df['hour'] = df['date'].dt.hour
43
  df['dayofweek'] = df['date'].dt.dayofweek
 
47
  df['dayofyear'] = df['date'].dt.dayofyear
48
  df['dayofmonth'] = df['date'].dt.day
49
  for i in range(1, lags):
50
+ df[f'lag_{i}'] = df[feature_col].shift(i)
51
+ df["target"] = df[feature_col].shift(lags)
52
  df.dropna(inplace=True)
53
  y = df[["target"]]
54
  X = df.drop(columns=["target", "date"])
 
73
 
74
  def first_order_diff(df):
75
  lags = st.sidebar.slider("lags", min_value=1, max_value=30, value=1)
76
+ return df.diff(lags), lags
77
def inverse_first_order_diff(df, lags, first):
    """Rebuild a series from its lag-``lags`` differences.

    Inverts ``x.diff(lags)``: each value is recovered as
    ``diff[i] + restored[i - lags]``, seeded with the first ``lags`` original
    values.

    Args:
        df: Differenced values, either an object exposing ``.values`` (e.g. a
            pandas Series) or a plain sequence. Entries before index ``lags``
            are never read.
        lags: Lag that was used for the differencing; must be >= 1.
        first: The first ``lags`` values of the original series. Extra values
            are ignored.

    Returns:
        list: The reconstructed values. The list has ``len(df)`` elements
        when ``len(df) >= lags``, otherwise just the ``lags`` seed values.

    Raises:
        ValueError: If ``lags`` is smaller than 1 or ``first`` provides fewer
            than ``lags`` values.
    """
    if lags < 1:
        raise ValueError("lags must be >= 1")
    seeds = list(first)
    if len(seeds) < lags:
        raise ValueError(
            f"first must provide at least {lags} value(s), got {len(seeds)}"
        )

    # Accept pandas objects as well as plain sequences.
    diffs = getattr(df, "values", df)

    # Exactly `lags` seeds keep restored[i - lags] aligned with diffs[i].
    restored = seeds[:lags]
    for i in range(lags, len(diffs)):
        restored.append(diffs[i] + restored[i - lags])
    return restored
87
+
88
+
89
+
90
  def second_order_diff(df):
91
  return df.diff().diff()
utils/graphics.py CHANGED
@@ -57,7 +57,7 @@ def plotForcast(df, pred, confint):
57
  fig.add_trace(
58
  go.Line(
59
  x = df.index,
60
- y = df.data,
61
  name= "Observed"
62
  )
63
  )
 
57
  fig.add_trace(
58
  go.Line(
59
  x = df.index,
60
+ y = df,
61
  name= "Observed"
62
  )
63
  )