Abi224 committed on
Commit
b4bf53e
1 Parent(s): 75de786

Upload Final_1st_phase.py

Browse files
Files changed (1) hide show
  1. Final_1st_phase.py +333 -0
Final_1st_phase.py ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Wed Jun 1 00:59:49 2022
4
+
5
+ @author: Abinash.m
6
+ """
7
+ import datetime as dt
8
+ import streamlit as st
9
+ import numpy as np
10
+ import pandas as pd
11
+ from ThymeBoost import ThymeBoost as tb
12
+ import pmdarima as pm
13
+ import numpy as np
14
+ from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score,mean_absolute_percentage_error
15
+ from statsmodels.tsa.stattools import adfuller,acf, pacf
16
+ from math import sqrt
17
+ import base64
18
+ from prophet import Prophet
19
+ from prophet.diagnostics import performance_metrics
20
+ from prophet.diagnostics import cross_validation
21
+ from prophet.plot import plot_cross_validation_metric
22
+ from statsmodels.tsa.stattools import kpss
23
+ from pmdarima import pipeline
24
+ from pmdarima import model_selection
25
+ from pmdarima import preprocessing as ppc
26
+ from pmdarima import arima
27
+ import matplotlib.pyplot as plt
28
+ from pmdarima.arima import auto_arima
29
+ import statsmodels.api as sm
30
+ import itertools
31
+ #from ThymeBoost import ThymeBoost as tb
32
+ from pmdarima.arima import AutoARIMA
33
+ from datetime import datetime
34
+ from math import sqrt
35
+ #----------------------------------------GETTING DATA------------------------------------------------
36
def get_df(data):
    """Load an uploaded file into a pandas object.

    Parameters
    ----------
    data : file-like
        Uploaded file object exposing a ``name`` attribute; the text after
        the last ``.`` in that name selects the reader.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        CSV: the first column becomes the index and is parsed strictly as
        ``%Y-%m-%d`` dates; if exactly one data column remains it is
        returned as a Series. XLSX / PICKLE: whatever the pandas reader
        returns.

    Raises
    ------
    ValueError
        If the file name has no extension, the extension is not one of
        csv / xlsx / pickle, or the CSV index does not match ``%Y-%m-%d``.
    """
    # Use the LAST dot so names such as "sales.2022.csv" resolve to "csv".
    _, dot, extension = data.name.rpartition('.')
    extension = extension.upper() if dot else ''

    if extension == 'CSV':
        # `squeeze=` / `date_parser=` are not accepted by current pandas
        # releases, so the index is parsed and the frame squeezed explicitly.
        df = pd.read_csv(data, index_col=0)
        df.index = pd.to_datetime(df.index, format="%Y-%m-%d")
        return df.squeeze("columns")
    if extension == 'XLSX':
        return pd.read_excel(data, engine='openpyxl')
    if extension == 'PICKLE':
        # SECURITY: unpickling runs arbitrary code embedded in the file.
        # Only expose this branch to trusted users.
        return pd.read_pickle(data)

    raise ValueError(
        f"Unsupported file type for {data.name!r}: expected csv, xlsx or pickle"
    )
48
+
49
+ #-----------------------------------MODEL1-------------------------------------------------------
50
+
51
+
52
def arima(df, train, test):
    """Fit a seasonal auto-ARIMA on ``train``, score it on ``test``, forecast 25 periods.

    Parameters
    ----------
    df : pandas.DataFrame
        Full series; only its last index value is used, as the start of the
        forecast date range.
    train, test : pandas.DataFrame
        Chronological split of ``df``; the observed values are read from the
        first column. Neither frame is modified.

    Returns
    -------
    tuple
        ``(rmse, r2_scor, mae, mape, pred)`` where ``mape`` is a percentage,
        ``r2_scor`` is the plain R² fraction (the same scale ``sarima`` and
        ``enhanced_auto_ml_model`` return) and ``pred`` is a one-column
        DataFrame of 25 forecasts indexed by month-end dates.
    """
    model = auto_arima(train, start_p=0, d=1, start_q=0,
                       max_p=5, max_d=5, max_q=5, start_P=0,
                       D=1, start_Q=0, max_P=5, max_D=5,
                       max_Q=5, m=12, seasonal=True,
                       error_action='warn', trace=True,
                       suppress_warnings=True, stepwise=True,
                       random_state=20, n_fits=50)

    # Compare plain arrays: predict() may return a pandas object carrying its
    # own index, and wrapping that in a frame with a different index would
    # realign every value to NaN.
    actual = np.asarray(test.iloc[:, 0], dtype=float)
    predicted = np.asarray(model.predict(n_periods=len(test)), dtype=float)

    rmse = sqrt(np.mean(np.square(actual - predicted)))
    r2_scor = r2_score(actual, predicted)
    mae = mean_absolute_error(actual, predicted)
    mape = mean_absolute_percentage_error(actual, predicted) * 100

    # The forecast is labelled with dates after the end of `df`, so the model
    # must first see the hold-out observations; otherwise the 25 steps would
    # describe the period right after `train` instead.
    model.update(test.iloc[:, 0])

    x = df.index[-1]
    rng = pd.date_range(x, periods=25, freq='M')
    pred = pd.DataFrame(np.asarray(model.predict(n_periods=25)), index=rng)

    return rmse, r2_scor, mae, mape, pred
85
+
86
+
87
+ #-----------------------------------MODEL2---------------------------------------------------------
88
+
89
+
90
+
91
def sarima(df, order=(0, 0, 1), seasonal_order=(1, 1, 1, 12)):
    """Fit a SARIMAX model on ``df``, score the last 25 % of it, forecast 25 periods.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Series to model (a single value column).
    order : tuple, optional
        Non-seasonal ``(p, d, q)`` order; defaults to the previously
        hard-coded ``(0, 0, 1)``.
    seasonal_order : tuple, optional
        Seasonal ``(P, D, Q, s)`` order; defaults to the previously
        hard-coded ``(1, 1, 1, 12)``.

    Returns
    -------
    tuple
        ``(rmse, r2_scor, mae, mape, pred)`` where ``mape`` is a percentage
        and ``pred`` is a DataFrame with columns ``'Date'`` and ``'ARIMA'``
        holding the 25 forecast values.
    """
    split = int(len(df) * .75)
    test = df[split:]

    # The former AIC grid search was removed: it iterated over range(0, 1)
    # (a single combination) and its winner was never used.
    fitted = sm.tsa.statespace.SARIMAX(df,
                                       order=order,
                                       seasonal_order=seasonal_order,
                                       enforce_stationarity=False,
                                       enforce_invertibility=False).fit()

    # NOTE(review): the model is fitted on the whole of `df`, so these are
    # in-sample one-step predictions over the last quarter, not a true
    # hold-out evaluation.
    y_pred = fitted.get_prediction(start=split, dynamic=False).predicted_mean

    # Flatten to 1-D arrays so the arithmetic is positional; subtracting two
    # frames with different column names aligns on labels and yields NaN.
    actual = np.asarray(test, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    rmse = sqrt(np.mean(np.square(actual - predicted)))
    r2_scor = r2_score(actual, predicted)
    mae = mean_absolute_error(actual, predicted)
    mape = mean_absolute_percentage_error(actual, predicted) * 100

    x = df.index[-1]
    rng = pd.date_range(x, periods=25, freq='M')
    forecast = fitted.get_forecast(steps=25).predicted_mean
    pred = pd.DataFrame({'Date': rng, 'ARIMA': forecast})

    return rmse, r2_scor, mae, mape, pred
150
+ #---------------------------------------MODEL 3---------------------------------------------------------
151
def auto_model(df):
    """Forecast 25 periods with a Fourier-featurizer + AutoARIMA pipeline.

    Parameters
    ----------
    df : pandas.DataFrame
        Series to model; the pipeline is fitted on all of it and its last
        index value starts the forecast date range.

    Returns
    -------
    pandas.DataFrame
        One column, ``'Forecast value'``, indexed by ``'Date'`` (25
        month-end dates). No evaluation metrics are computed here.
    """
    pipe = pipeline.Pipeline([
        ("fourier", ppc.FourierFeaturizer(m=12, k=4)),
        ("arima", AutoARIMA(stepwise=True, trace=1, error_action="ignore",
                            seasonal=False,  # seasonality comes from the Fourier terms
                            suppress_warnings=True)),
    ])
    pipe.fit(df)

    x = df.index[-1]
    rng = pd.date_range(x, periods=25, freq='M')
    preds = pipe.predict(n_periods=25)

    # Use the raw values so the forecasts pair with `rng` by position,
    # whatever index the pipeline attaches to its predictions.
    pred = pd.DataFrame({'Date': rng, 'Forecast value': np.asarray(preds)})
    return pred.set_index('Date')
178
+
179
+
180
+
181
+ #---------------------------------------MODEL 4----------------------------------------------------------
182
def enhanced_auto_ml_model(df):
    """Fit Prophet on a (date, value) frame, score the fit, forecast 25 periods.

    Parameters
    ----------
    df : pandas.DataFrame
        Exactly two columns: dates first, values second. The caller's frame
        is left untouched; a renamed copy (``ds`` / ``y``) is used.

    Returns
    -------
    tuple
        ``(rmse, r2_scor, mae, mape, pred)`` where ``mape`` is a percentage
        and ``pred`` holds ``yhat``, ``yhat_lower`` and ``yhat_upper`` for
        the forecast rows dated after the last observed date, indexed by
        ``ds``.

    Raises
    ------
    ValueError
        If ``df`` is ``None``.
    """
    if df is None:
        raise ValueError("df must be a two-column (date, value) DataFrame, got None")

    periods_input = 25

    frame = df.copy()
    frame.columns = ['ds', 'y']
    frame['ds'] = pd.to_datetime(frame['ds'], errors='coerce')
    max_date = frame['ds'].max()

    m = Prophet()
    m.fit(frame)

    future = m.make_future_dataframe(periods=periods_input, freq='M')
    forecast = m.predict(future)
    fcst = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]

    # Metrics use only the dates that have an observed value, i.e. the
    # history the model was fitted on (in-sample fit quality).
    metric_df = fcst.set_index('ds')[['yhat']].join(frame.set_index('ds').y).reset_index()
    metric_df = metric_df.dropna()

    rmse = sqrt(np.square(np.subtract(metric_df.y, metric_df.yhat)).mean())
    r2_scor = r2_score(metric_df.y, metric_df.yhat)
    mae = mean_absolute_error(metric_df.y, metric_df.yhat)
    mape = mean_absolute_percentage_error(metric_df.y, metric_df.yhat) * 100

    # Keep only rows strictly after the last observed date.
    pred = pd.DataFrame(fcst[fcst['ds'] > max_date]).set_index('ds')

    return rmse, r2_scor, mae, mape, pred
224
+ #---------------------------------------MODEL 5-------------------------------------------------------
225
def enhanced_arima_model(df, train, test):
    """Score a ThymeBoost autofit on the hold-out and forecast 25 periods.

    Parameters
    ----------
    df : pandas.DataFrame
        Full series (first column is modelled) used for the final forecast.
    train, test : pandas.DataFrame
        Chronological split of ``df``; values are read from the first column.

    Returns
    -------
    tuple
        ``(rmse, r2_scor, mae, mape, predicted_output1)`` where ``mape`` is a
        percentage and ``predicted_output1`` is the ThymeBoost prediction
        output for 25 periods after fitting on all of ``df``.
    """
    actual = test.iloc[:, 0]

    boosted_model = tb.ThymeBoost(verbose=0)
    # NOTE(review): evaluation uses seasonal_period=0 while the final model
    # below uses 12 — confirm this difference is intended.
    model = boosted_model.autofit(train.iloc[:, 0],
                                  seasonal_period=0)

    # Forecast exactly the hold-out span and score against the hold-out; the
    # previous code compared a forecast of the future with the training data.
    predicted_output = boosted_model.predict(model, forecast_horizon=len(actual))
    predicted = np.asarray(predicted_output['predictions'])

    mse = mean_squared_error(actual, predicted)
    rmse = sqrt(mse)
    r2_scor = r2_score(actual, predicted)
    mae = mean_absolute_error(actual, predicted)
    mape = mean_absolute_percentage_error(actual, predicted) * 100

    boosted_model1 = tb.ThymeBoost(verbose=0)
    model1 = boosted_model1.autofit(df.iloc[:, 0],
                                    seasonal_period=12)
    x = df.index[-1]
    rng = pd.date_range(x, periods=25, freq='M')
    predicted_output1 = boosted_model1.predict(model1, forecast_horizon=len(rng))

    return rmse, r2_scor, mae, mape, predicted_output1
245
+ #---------------------------------------DOWNLOAD THE FILE----------------------------------------
246
def download(df):
    """Render a link in the Streamlit page that downloads ``df`` as CSV.

    The frame is serialised with its index, base64-encoded and embedded in a
    ``data:`` URI, so no file is written on the server.
    """
    # to_csv() without a path returns the CSV text; base64 needs bytes in
    # and the href needs text out, hence the encode/decode pair.
    payload = base64.b64encode(df.to_csv(index=True).encode()).decode()
    link_html = f'<a href="data:file/csv;base64,{payload}">Download CSV File</a> (right-click and save as ** &lt;forecast_name&gt;.csv**)'
    st.markdown(link_html, unsafe_allow_html=True)
252
+ #st.table(df)
253
+ #---------------------------------------MAIN BLOCK-------------------------------------------------------
254
def main():
    """Streamlit entry point: upload a series, compare models, offer a forecast.

    Reads the uploaded file, lets the user pick one column, runs the four
    forecasting models, shows RMSE / R² / MAE / MAPE for the three models
    that report metrics, and publishes the forecast of the first model (in
    the order Prophet, SARIMA, ARIMA) whose MAPE is below 20 %. If none
    qualifies, the pipeline forecast is published instead.
    """
    st.header('Upload the data with date column')
    data = st.file_uploader("Upload file", type=['csv', 'xlsx', 'pickle'])
    if not data:
        st.write("Upload a .csv or .xlsx file to get started")
        return

    df = pd.DataFrame(get_df(data))
    cols = st.selectbox(
        'Please select a column', df.columns.tolist())
    df = pd.DataFrame(df[cols])

    # Prophet needs the dates as a regular column, not as the index.
    pred = df.reset_index()

    # Pass independent copies so a model cannot alter `df` through a slice.
    split = int(len(df) * .75)
    train = df[:split].copy()
    test = df[split:].copy()

    model1 = arima(df, train, test)
    model2 = sarima(df)
    model3 = enhanced_auto_ml_model(pred)
    model4 = auto_model(df)

    my_dict = {'RMSE': [model1[0], model2[0], model3[0]],
               'R2_SCORE': [model1[1], model2[1], model3[1]],
               'MAE': [model1[2], model2[2], model3[2]],
               'MAPE': [model1[3], model2[3], model3[3]]
               }
    my_df = pd.DataFrame(my_dict, index=['ARIMA', 'SARIMA', 'ENHANCED AUTO ML MODEL'])
    st.subheader('EVALUATION METRICS')
    st.table(my_df)

    if model3[3] < float(20):
        st.write('ENHANCED AUTO ML MODEL is the best model for the dataset ')
        download(model3[4])
        st.line_chart(model3[4].iloc[:, 0])
        st.balloons()
    elif model2[3] < float(20):
        st.write('SARIMA MODEL is the best model for the dataset ')
        download(model2[4])
        # Column 0 of the SARIMA frame is 'Date'; the values are in column 1.
        st.line_chart(model2[4].iloc[:, 1])
        st.balloons()
    elif model1[3] < float(20):
        st.write('ARIMA MODEL is the best model for the dataset ')
        download(model1[4])
        st.line_chart(model1[4].iloc[:, 0])
        st.balloons()
    else:
        st.write('ARIMA MODEL WITH PIPELINE is the best model for the dataset ')
        # auto_model returns the forecast frame itself, not a metrics tuple,
        # so it is passed directly (indexing it with [4] raised KeyError).
        download(model4)
        st.line_chart(model4.iloc[:, 0])
        st.balloons()
331
+
332
+
333
# Run the app only when executed as a script (as `streamlit run` does), so
# importing this module does not start the whole pipeline.
if __name__ == "__main__":
    main()