Commit 91839e1 (1 parent: 82f2020)
SARIMA and XGBoost bugs fixed

Files changed:
- app.py +208 -85
- models/arima.py +17 -12
- utils/dataprocess.py +19 -5
- utils/graphics.py +1 -1
app.py
CHANGED
```diff
@@ -6,10 +6,11 @@ import statsmodels.api as sm
 import matplotlib.pyplot as plt
 from utils.dataprocess import (load_data, df_col, numberOfDiff,create_features,
                                apply_moving_average, apply_exponential_average,
-                               first_order_diff, second_order_diff, isStatinary
+                               first_order_diff, second_order_diff, isStatinary,
+                               inverse_first_order_diff)
 from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
 from utils.graphics import plotDecompse, plotTs, plotForcast, plotTransformation
-from models.arima import
+from models.arima import auto_arima, sarimax_forecast
 from utils.constants import DEFAULT_DATASETS_DIR, FREQ_DICT
 import xgboost as xgb
 from models.xgboost import xgboost
@@ -63,42 +64,46 @@ df_copy = df.copy()
 # Data Transformation
 # Sélection des transformations dans l'ordre
 sider.divider()
+
 transformations_function = [np.log,np.sqrt, first_order_diff, second_order_diff]
-
-
-
-
-
-
-
-for transformation in selected_transformations:
-    if transformation == np.log.__name__:
-        try:
-            df["Transformed"] = np.log(df["Transformed"])
-            transform_name = transform_name + "log_"
-        except:
-            st.error(f"Unable to apply {np.log.__name__}")
-    elif transformation == np.sqrt.__name__:
-        try:
-            df["Transformed"] = np.sqrt(df["Transformed"])
-            transform_name = transform_name + "sqrt_"
-        except:
-            st.error(f"Unable to apply {np.sqrt.__name__}")
-
-    elif transformation == "first_order_diff":
-        try:
-            df["Transformed"] = first_order_diff(df["Transformed"])
-            transform_name = transform_name + "diff1_"
-        except:
-            st.error(f"Unable to apply {transformation}")
-    elif transformation == "second_order_diff":
-        try:
-            df["Transformed"] = second_order_diff(df["Transformed"])
-            transform_name = transform_name + "diff2_"
-        except:
-            st.error(f"Unable to apply {transformation}")
-transform_name = transform_name[:-1]
+def transform_view(df, model="",transform_name="", key="transform"):
+    transf_col = "Transformed"+model
+
+    transformations = [t.__name__ for t in transformations_function]
+    selected_transformations = sider.multiselect("Data Transfomations", transformations)
+
+    # Apply transformations

+    df[transf_col] = df["data"]
+    for transformation in selected_transformations:
+        if transformation == np.log.__name__:
+            try:
+                df[transf_col] = np.log(df[transf_col])
+                transform_name = transform_name + "log_"
+            except:
+                st.error(f"Unable to apply {np.log.__name__}")
+        elif transformation == np.sqrt.__name__:
+            try:
+                df[transf_col] = np.sqrt(df[transf_col])
+                transform_name = transform_name + "sqrt_"
+            except:
+                st.error(f"Unable to apply {np.sqrt.__name__}")
+
+        elif transformation == "first_order_diff":
+            try:
+                df[transf_col], lags = first_order_diff(df[transf_col])
+                transform_name = transform_name + f"diff-{lags}_"
+            except:
+                st.error(f"Unable to apply {transformation}")
+        elif transformation == "second_order_diff":
+            try:
+                df[transf_col] = second_order_diff(df[transf_col])
+                transform_name = transform_name + "diff2_"
+            except:
+                st.error(f"Unable to apply {transformation}")
+    transform_name = transform_name[:-1]
+    return df,transform_name
+df, transform_name = transform_view(df)
 # Graphics
 st.subheader("Visualize your Time Series")

@@ -136,12 +141,10 @@ with pacf_acf:
         plot_acf(df["data"], ax=ax1)
         plot_pacf(df["data"], ax=ax2)
     else:
-        plot_acf(df[choice], ax=ax1)
-        plot_pacf(df[choice], ax=ax2)
+        plot_acf(df[choice].dropna(), ax=ax1)
+        plot_pacf(df[choice].dropna(), ax=ax2)
     st.pyplot(fig2)
-
-sider.divider()
-
+
 # Split the df
 train_size = int(0.8*len(df))
 train = df.iloc[0:train_size,:]
@@ -152,55 +155,165 @@ test = df.iloc[train_size:, :]
 options = sider.multiselect(
     'Models',options=["ARIMA", "SARIMA", "XGBoost"])
 models = {}
+model_inv = {}
 for option in options:
     if option == "ARIMA":
-        st.divider()
         sider.divider()
         sider.subheader("ARIMA")
-
+        func = sider.selectbox("Apply ", options=["None", "log", "sqrt"], index= 0, key="arima")
+        if func == "None":
+            df["X_ARIMA"]= df["data"]
+            model_inv["ARIMA"] = lambda x :x
+        elif func == "log":
+            try:
+                df["X_ARIMA"] = np.log(df["data"])
+                model_inv["ARIMA"] = np.exp
+            except:
+                st.error("Unable to apply log, use the default data")
+                df["X_ARIMA"]= df["data"]
+                model_inv["ARIMA"] = lambda x :x
+        elif func == "sqrt":
+            try:
+                df["X_ARIMA"] = np.sqrt(df["data"])
+                model_inv["ARIMA"] = np.square
+            except:
+                st.error("Unable to apply sqrt, use the default data")
+                df["X_ARIMA"]= df["data"]
+                model_inv["ARIMA"] = lambda x :x
+        train_size = int(0.8*len(df))
+        train = df.iloc[0:train_size,:]
+        test = df.iloc[train_size:, :]
         result = None
-
-
-
-
-
-
-
-
-
-
-
-        if
-
-
-
-
-
-
-
-
-
-
-
-
-
+        p_range_arima = sider.slider("P Range",min_value=0, max_value=30, value=[0, 0], key="arima_p_range")
+        q_range_arima = sider.slider("Q Range",min_value=0, max_value=30, value=[0, 0], key="arima_q_range")
+        arima_params_dict ={
+            "start_p" : p_range_arima[0],
+            "start_q" : q_range_arima[0],
+            "max_p" : p_range_arima[1],
+            "max_q" : q_range_arima[1]
+        }
+    elif option == "SARIMA" :
+        sider.subheader("SARIMA")
+        func = sider.selectbox("Apply ", options=["None", "log", "sqrt"], index= 0, key="sarima")
+        if func == "None":
+            df["X_SARIMA"]= df["data"]
+            model_inv["SARIMA"] = lambda x :x
+        elif func == "log":
+            try:
+                df["X_SARIMA"] = np.log(df["data"])
+                model_inv["SARIMA"] = np.exp
+            except:
+                st.error("Unable to apply log, use the default data")
+                df["X_SARIMA"]= df["data"]
+                model_inv["SARIMA"] = lambda x :x
+        elif func == "sqrt":
+            try:
+                df["X_SARIMA"] = np.sqrt(df["data"])
+                model_inv["SARIMA"] = np.square
+            except:
+                st.error("Unable to apply sqrt, use the default data")
+                df["X_SARIMA"]= df["data"]
+                model_inv["SARIMA"] = lambda x :x
+        train_size = int(0.8*len(df))
+        train = df.iloc[0:train_size,:]
+        test = df.iloc[train_size:, :]
+        result = None
+        m = sider.slider("seasonal period m ",min_value=0, max_value=30, value=12)
+        p_range = sider.slider("p Range",min_value=0, max_value=30, value=[0, 0], key="sarima_p_range")
+        q_range = sider.slider("q Range",min_value=0, max_value=30, value=[0, 0], key="sarima_q_range")
+        P_range = sider.slider("P Range",min_value=0, max_value=30, value=[0, 0])
+        Q_range = sider.slider("Q Range",min_value=0, max_value=30, value=[0, 0])
+        sarima_params_dict ={
+            "seasonal":True,
+            "m":m,
+            "start_p" : p_range[0],
+            "start_q" : q_range[0],
+            "max_p" : p_range[1],
+            "max_q" : q_range[1],
+            "start_P" : P_range[0],
+            "start_Q" : Q_range[0],
+            "max_P" : P_range[1],
+            "max_Q" : q_range[1],
+        }
+    elif option == "XGBoost":
+
         sider.subheader(option)
-
+        f1 = sider.selectbox("Apply First Transformation", options=["None", "log", "sqrt"], index= 0, key="xgboost_f1")
+        if f1 == "None":
+            df["X_XGBoost_1"]= df["data"]
+            model_inv["XGBoost"] = lambda x :x
+        elif f1 == "log":
+            try:
+                df["X_XGBoost_1"] = np.log(df["data"])
+                model_inv["XGBoost"] = np.exp
+            except:
+                st.error("Unable to apply log, use the default data")
+                df["X_XGBoost_1"]= df["data"]
+                model_inv["XGBoost"] = lambda x :x
+        elif f1 == "sqrt":
+            try:
+                df["X_XGBoost_1"] = np.sqrt(df["data"])
+                model_inv["XGBoost"] = np.square
+            except:
+                st.error("Unable to apply sqrt, use the default data")
+                df["X_XGBoost_1"]= df["data"]
+                model_inv["XGBoost"] = lambda x :x
+        f2 = sider.selectbox("Apply Second Tronsformation", options=["None","first_order_diff"], index= 0, key="xgboost_2")
+        if f2 == "None":
+            df["X_XGBoost"]= df["X_XGBoost_1"]
+            model_inv["XGBoost"] = lambda x :x
+        elif f2 == "first_order_diff":
+
+            try:
+
+                df["X_XGBoost"],xg_lags= first_order_diff(df["X_XGBoost_1"].bfill())
+
+            except:
+                st.error("Unable to apply first_order diff, use the default data")
+        train_size = int(0.8*len(df))
+        train = df.iloc[0:train_size,:]
+        test = df.iloc[train_size:, :]
        max_depth= sider.slider("Max Depth",min_value=1, max_value=30, value=5)
        lags= sider.slider("Lags features",min_value=1, max_value=30, value=5)
        learning_rate= sider.number_input(label="Learning Rate ", min_value=0.0001, max_value=0.75, step=0.01, value=0.01)
        n_estimators= sider.number_input(label="n_estimator ", min_value=100, max_value=5000, step=100, value=1000)
-
-
-
+
+        X_train, y_train = create_features(train,lags=lags, feature_col="X_XGBoost")
+
+        X_test, y_test = create_features(pd.concat([train.iloc[-lags:,:], test["X_XGBoost"]]), lags=lags, feature_col="X_XGBoost")
+
+fit = sider.button("Train Models")
+if fit:
+    for option in options:
+        if option == "ARIMA":
+            st.subheader("ARIMA")
+            result_arima = auto_arima(train["X_ARIMA"],
+                                      start_p=arima_params_dict["start_p"],
+                                      start_q= arima_params_dict["start_q"],
+                                      max_q = arima_params_dict["max_q"],
+                                      max_p = arima_params_dict["max_p"],
+                                      )
+            arimax_pred, conf = sarimax_forecast(result_arima, steps=len(test))
+            conf_int_arima = pd.DataFrame(conf, index = test.index, columns=['lower data', "upper data"])
+            models[option] = (result, conf_int_arima)
+            test[option] = arimax_pred
+        elif option == "SARIMA":
+            st.subheader("SARIMA")
+            result_sarima = auto_arima(train["X_SARIMA"], **sarima_params_dict)
+            sarimax_pred, conf = sarimax_forecast(result_sarima, steps=len(test))
+            conf_int_sarima = pd.DataFrame(conf, index = test.index, columns=['lower data', "upper data"])
+            models[option] = (result, conf_int_sarima)
+            test[option] = sarimax_pred
+        elif option =="XGBoost":
+            st.subheader("SARIMA")
            model_xgb = xgboost(X_train, y_train, max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators)
            test[option] = model_xgb.predict(X_test)
            xgb_fig, xgb_ax = plt.subplots()
-           xgb.plot_importance(model_xgb, ax = xgb_ax)
+           xgb.plot_importance(model_xgb, ax = xgb_ax, max_num_features=5)
            models[option] = (model_xgb, None)
            st.pyplot(xgb_fig)

+
 st.divider()
 st.subheader("Prediction")

@@ -211,33 +324,43 @@ for idx, option in enumerate(options):
     if option in test.columns:
         c = None
         if option=="ARIMA":
-            c =
+            c = conf_int_arima
+        if option=="SARIMA":
+            c = conf_int_sarima
         with pred_tabs[idx]:
-            fig = plotForcast(df, test[option], confint=c)
+            fig = plotForcast(df[f"X_{option}"], test[option], confint=c)
            st.plotly_chart(fig)


 # Model error
 errors = {}
-metric_labels = ["MAE",
+metric_labels = ["MAE","MAPE", "RMSE"]
 errors["Model"] = []
 errors["Type"] = []
 errors["error"] = []

 for option in options:
     if option in test.columns:
-
-
-
+        if option == "XGBoost" and f2 == "first_order_diff":
+            first = test["X_XGBoost_1"].iloc[:xg_lags].values
+            inv = model_inv["XGBoost"](
+                inverse_first_order_diff(test[option], xg_lags,first))
+        else:
+            inv = model_inv[option](test[option])
+        mae = mean_absolute_error(test["data"],inv )
+        mape = mean_absolute_percentage_error(test["data"], inv)
+        rmse = root_mean_squared_error(test["data"], inv)
         errors["Model"].extend([option]*len(metric_labels))
         errors["Type"].extend(metric_labels)
-        errors["error"].extend([mae,
-sider.divider()
+        errors["error"].extend([mae,mape, rmse])

-if
+if fit:
     st.divider()
     st.subheader("Compare Models Errors")
     errors_df = pd.DataFrame(errors)
-    erro_fig, ax
-    sns.barplot(data = errors_df
+    erro_fig, ax= plt.subplots(nrows=2)
+    sns.barplot(data = errors_df[errors_df["Type"].isin(["MAE", "RMSE"])],
+                y="error", x = "Type",hue="Model", ax=ax[0], width=0.4)
+    sns.barplot(data = errors_df[errors_df["Type"].isin(["MAPE"])],
+                y="error", x = "Type",hue="Model", width=0.2,ax=ax[1])
     st.pyplot(erro_fig)
```
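The error section above now maps each model's predictions back to the original scale with the stored `model_inv` function before scoring. A minimal standalone sketch of that pattern with made-up numbers (not the app's real data; `root_mean_squared_error` requires scikit-learn >= 1.4):

```python
import numpy as np
from sklearn.metrics import (mean_absolute_error,
                             mean_absolute_percentage_error,
                             root_mean_squared_error)

rng = np.random.default_rng(0)
actual = np.array([12.0, 15.0, 14.0, 18.0, 20.0])         # held-out values, original scale
pred_log = np.log(actual) + rng.normal(0, 0.05, size=5)   # pretend forecasts made on the log scale

model_inv = {"SARIMA": np.exp}         # same idea as the model_inv dict in app.py
inv = model_inv["SARIMA"](pred_log)    # map predictions back before scoring

mae = mean_absolute_error(actual, inv)
mape = mean_absolute_percentage_error(actual, inv)
rmse = root_mean_squared_error(actual, inv)
print(f"MAE={mae:.3f}  MAPE={mape:.3%}  RMSE={rmse:.3f}")
```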
models/arima.py
CHANGED
```diff
@@ -4,6 +4,7 @@ import statsmodels.api as sm
 from itertools import product
 from scipy import stats
 from stqdm import stqdm
+import pmdarima as pm
 """
 @st.cache_data
 def gridSearch(endog, order_ls:list, d= 0):
@@ -60,22 +61,26 @@ def valid_model(lb, jb):

 #@st.cache_data
 def sarimax_forecast(model, steps):
-    forecat = model.
-
-
-    return predicted, confint
+    forecat, confint = model.predict(n_periods=steps, return_conf_int=True)
+
+    return forecat, confint

 @st.cache_data
-def auto_arima():
-
-
+def auto_arima(endog, m=0, seasonal=False, d=None,D=None, start_p=0, start_q=0, start_P=0, start_Q=0, max_p=12, max_q=12, max_P=0, max_Q=0):
+    print(m, seasonal, d, D, start_p,start_q, max_p, max_q)
+    return pm.auto_arima(endog,
+                         m=m, # frequency of series
                          seasonal=seasonal, # TRUE if seasonal series
-                         d=
+                         d=d, # let model determine 'd'
                          test='adf', # use adftest to find optimal 'd'
-                         start_p=
-                         max_p=
-
+                         start_p=start_p, start_q=start_q, # minimum p and q
+                         max_p=max_p, max_q=max_q, # maximum p and q
+                         start_P=start_P,
+                         start_Q= start_Q,
+                         max_P=max_P,
+                         max_Q = max_Q,
+                         D=D, # let model determine 'D'
                          trace=True,
                          error_action='ignore',
-                         suppress_warnings=True,
+                         suppress_warnings=True,
                          stepwise=True)
```
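The rewritten `auto_arima` and `sarimax_forecast` are thin wrappers around pmdarima's stepwise search and its `predict(..., return_conf_int=True)` call. A minimal sketch of those underlying calls on a synthetic series (illustrative only, run outside Streamlit so the cached wrapper is bypassed):

```python
import numpy as np
import pandas as pd
import pmdarima as pm

# Synthetic monthly series with a trend and yearly seasonality (illustrative only)
idx = pd.date_range("2015-01-01", periods=120, freq="MS")
y = pd.Series(np.linspace(10.0, 40.0, 120)
              + 5.0 * np.sin(2.0 * np.pi * np.arange(120) / 12.0), index=idx)
train, test = y.iloc[:96], y.iloc[96:]

# Stepwise (p, q)(P, Q) search on the training split, as the auto_arima() wrapper above does
model = pm.auto_arima(train, seasonal=True, m=12, test="adf",
                      max_p=3, max_q=3, max_P=1, max_Q=1,
                      suppress_warnings=True, stepwise=True, trace=False)

# The call sarimax_forecast() now makes: point forecast plus confidence bounds
pred, confint = model.predict(n_periods=len(test), return_conf_int=True)
print(pred[:3])
print(confint[:3])   # shape (steps, 2): lower / upper bounds
```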
utils/dataprocess.py
CHANGED
```diff
@@ -32,11 +32,12 @@ def numberOfDiff(y):
         d = d+1
     return d, diff

-
-def create_features(df, lags = 1):
+
+def create_features(df, lags = 1, feature_col="data"):
     """
     Creates time series features from datetime index
     """
+    df =df[feature_col].to_frame()
     df['date'] = df.index
     df['hour'] = df['date'].dt.hour
     df['dayofweek'] = df['date'].dt.dayofweek
@@ -46,8 +47,8 @@ def create_features(df, lags = 1):
     df['dayofyear'] = df['date'].dt.dayofyear
     df['dayofmonth'] = df['date'].dt.day
     for i in range(1, lags):
-        df[f'lag_{i}'] = df[
-    df["target"] = df[
+        df[f'lag_{i}'] = df[feature_col].shift(i)
+    df["target"] = df[feature_col].shift(lags)
     df.dropna(inplace=True)
     y = df[["target"]]
     X = df.drop(columns=["target", "date"])
@@ -72,6 +73,19 @@ def apply_exponential_average(data):

 def first_order_diff(df):
     lags = st.sidebar.slider("lags", min_value=1, max_value=30, value=1)
-    return df.diff(lags)
+    return df.diff(lags), lags
+def inverse_first_order_diff(df, lags, first):
+    df =df
+    l = []
+    l.extend(first)
+    ls = df.values
+
+    for i in range(lags, len(ls)):
+        l.append(ls[i]+l[i-lags])
+    df = l
+    return df
+
+
+
 def second_order_diff(df):
     return df.diff().diff()
```
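The new `inverse_first_order_diff` undoes a lag-`k` difference by seeding with the first `k` original values and adding each differenced value onto the reconstruction `k` steps back. A quick standalone check of that recurrence with plain pandas and toy numbers (assumed behaviour of the helper above, written without the Streamlit slider):

```python
import pandas as pd

s = pd.Series([3.0, 5.0, 4.0, 8.0, 7.0, 9.0, 12.0])
lags = 2
diffed = s.diff(lags)              # what first_order_diff() returns (along with the lag count)

# Rebuild: seed with the first `lags` originals, then x[i] = d[i] + x[i - lags]
rebuilt = list(s.iloc[:lags].values)
vals = diffed.values
for i in range(lags, len(vals)):
    rebuilt.append(vals[i] + rebuilt[i - lags])

assert rebuilt == list(s.values)   # the round trip recovers the original series
```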
utils/graphics.py
CHANGED
```diff
@@ -57,7 +57,7 @@ def plotForcast(df, pred, confint):
     fig.add_trace(
         go.Line(
             x = df.index,
-            y = df
+            y = df,
             name= "Observed"
         )
     )
```