Circhastic committed on
Commit e2cb6ae
1 Parent(s): 39440ed

Fix app and removed modules

app.py CHANGED
@@ -1,6 +1,195 @@
  import streamlit as st
  import pandas as pd
- from modules import *
+ from datetime import datetime
+
+ import numpy as np
+ import pmdarima as pm
+ from pmdarima import auto_arima
+
+ import torch
+ from transformers import pipeline, TapasTokenizer, TapasForQuestionAnswering
+
+ # Preprocessing
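+ # merge() below is the merge step of a hand-rolled merge sort keyed on
+ # 'Date'. It assumes B, C, and A all carry 'Date' and 'Sales' columns and
+ # writes the merged ordering back into A in place via .iloc;
+ # merge_sort() recursively splits the frame by position before merging.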
+ def merge(B, C, A):
+     i = j = k = 0
+
+     # Convert 'Date' columns to datetime.date objects
+     B['Date'] = pd.to_datetime(B['Date']).dt.date
+     C['Date'] = pd.to_datetime(C['Date']).dt.date
+     A['Date'] = pd.to_datetime(A['Date']).dt.date
+
+     while i < len(B) and j < len(C):
+         if B['Date'].iloc[i] <= C['Date'].iloc[j]:
+             A['Date'].iloc[k] = B['Date'].iloc[i]
+             A['Sales'].iloc[k] = B['Sales'].iloc[i]
+             i += 1
+
+         else:
+             A['Date'].iloc[k] = C['Date'].iloc[j]
+             A['Sales'].iloc[k] = C['Sales'].iloc[j]
+             j += 1
+         k += 1
+
+     while i < len(B):
+         A['Date'].iloc[k] = B['Date'].iloc[i]
+         A['Sales'].iloc[k] = B['Sales'].iloc[i]
+         i += 1
+         k += 1
+
+     while j < len(C):
+         A['Date'].iloc[k] = C['Date'].iloc[j]
+         A['Sales'].iloc[k] = C['Sales'].iloc[j]
+         j += 1
+         k += 1
+
+     return A
+
+ def merge_sort(dataframe):
+     if len(dataframe) > 1:
+         center = len(dataframe) // 2
+         left = dataframe.iloc[:center]
+         right = dataframe.iloc[center:]
+         merge_sort(left)
+         merge_sort(right)
+
+         return merge(left, right, dataframe)
+
+     else:
+         return dataframe
+
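+ # drop() keeps only columns whose names contain "date" or "sale"
+ # (case-insensitive) and discards rows with missing values.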
+ def drop(dataframe):
+     def get_columns_containing(dataframe, substrings):
+         return [col for col in dataframe.columns if any(substring.lower() in col.lower() for substring in substrings)]
+
+     columns_to_keep = get_columns_containing(dataframe, ["date", "sale"])
+     dataframe = dataframe.drop(columns=dataframe.columns.difference(columns_to_keep))
+     dataframe = dataframe.dropna()
+
+     return dataframe
+
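+ # date_format() unpacks itertuples() as (index, Date, Sales), so it assumes
+ # exactly two columns with dates given as "%m/%d/%Y" strings, e.g. "01/31/2023".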
+ def date_format(dataframe):
+     for i, d, s in dataframe.itertuples():
+         dataframe['Date'][i] = dataframe['Date'][i].strip()
+
+     for i, d, s in dataframe.itertuples():
+         new_date = datetime.strptime(dataframe['Date'][i], "%m/%d/%Y").date()
+         dataframe['Date'][i] = new_date
+
+     return dataframe
+
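+ # group_to_three() buckets the series into 3-day bins via pd.Grouper and
+ # keeps the mean sales per bin, dropping empty (zero) bins.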
+ def group_to_three(dataframe):
+     dataframe['Date'] = pd.to_datetime(dataframe['Date'])
+     dataframe = dataframe.groupby([pd.Grouper(key='Date', freq='3D')])['Sales'].mean().round(2)
+     dataframe = dataframe.replace(0, np.nan).dropna()  # pd.np was removed in newer pandas; use np.nan directly
+
+     return dataframe
+
+ # SARIMAX Model
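+ # train_test() holds out the last n rows as the test set; column 0 is the
+ # target ('Sales') and any remaining columns are treated as exogenous X.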
+ def train_test(dataframe, n):
+     training_y = dataframe.iloc[:-n,0]
+     test_y = dataframe.iloc[-n:,0]
+     test_y_series = pd.Series(test_y, index=dataframe.iloc[-n:, 0].index)
+     training_X = dataframe.iloc[:-n,1:]
+     test_X = dataframe.iloc[-n:,1:]
+     future_X = dataframe.iloc[0:,1:]
+     return (training_y, test_y, test_y_series, training_X, test_X, future_X)
+
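+ # model_fitting() runs a stepwise auto_arima search (p, q up to 3, monthly
+ # seasonality m=12, one seasonal difference) over the full series;
+ # test_fitting() repeats the same search on the training split only.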
+ def model_fitting(dataframe, Exo):
+     futureModel = pm.auto_arima(dataframe['Sales'], X=Exo, start_p=1, start_q=1,
+                                 test='adf', min_p=1, min_q=1,
+                                 max_p=3, max_q=3, m=12,
+                                 start_P=0, seasonal=True,
+                                 d=None, D=1, trace=True,
+                                 error_action='ignore',
+                                 suppress_warnings=True,
+                                 stepwise=True)
+     model = futureModel
+     return model
+
+ def test_fitting(dataframe, Exo, trainY):
+     trainTestModel = auto_arima(X=Exo, y=trainY, start_p=1, start_q=1,
+                                 test='adf', min_p=1, min_q=1,
+                                 max_p=3, max_q=3, m=12,
+                                 start_P=0, seasonal=True,
+                                 d=None, D=1, trace=True,
+                                 error_action='ignore',
+                                 suppress_warnings=True,
+                                 stepwise=True)
+     model = trainTestModel
+     return model
+
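+ # forecast_accuracy() reports MAPE, RMSE, correlation, and a min-max error
+ # (1 - mean of element-wise min/max); forecast and actual are assumed to be
+ # 1-D numpy arrays of equal length.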
+ def forecast_accuracy(forecast, actual):
+     mape = np.mean(np.abs(forecast - actual)/np.abs(actual)).round(4)  # MAPE
+     rmse = (np.mean((forecast - actual)**2)**.5).round(2)  # RMSE
+     corr = np.corrcoef(forecast, actual)[0,1]  # correlation
+     mins = np.amin(np.hstack([forecast[:,None],
+                               actual[:,None]]), axis=1)
+     maxs = np.amax(np.hstack([forecast[:,None],
+                               actual[:,None]]), axis=1)
+     minmax = 1 - np.mean(mins/maxs)  # min-max error
+     return({'mape':mape, 'rmse':rmse, 'corr':corr, 'min-max':minmax})
+
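+ # sales_growth() tabulates the forecast as first differences and percent
+ # growth per period, seeding the first row from the last two observed values
+ # of the original dataframe.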
+ def sales_growth(dataframe, fittedValues):
+     sales_growth = fittedValues.to_frame()
+     sales_growth = sales_growth.reset_index()
+     sales_growth.columns = ("Date", "Sales")
+     sales_growth = sales_growth.set_index('Date')
+
+     sales_growth['Sales'] = (sales_growth['Sales']).round(2)
+
+     # Calculate and create the column for sales difference and growth
+     sales_growth['Forecasted Sales First Difference'] = (sales_growth['Sales'] - sales_growth['Sales'].shift(1)).round(2)
+     sales_growth['Forecasted Sales Growth'] = (((sales_growth['Sales'] - sales_growth['Sales'].shift(1)) / sales_growth['Sales'].shift(1)) * 100).round(2)
+
+     # Calculate and create the first row for sales difference and growth
+     sales_growth['Forecasted Sales First Difference'].iloc[0] = (dataframe['Sales'].iloc[-1] - dataframe['Sales'].iloc[-2]).round(2)
+     sales_growth['Forecasted Sales Growth'].iloc[0] = (((dataframe['Sales'].iloc[-1] - dataframe['Sales'].iloc[-2]) / dataframe['Sales'].iloc[-1]) * 100).round(2)
+
+     return sales_growth
+
+ # TAPAS Model
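+ # Loading TAPAS is expensive, so the tokenizer, model, and pipeline are
+ # built once and cached by Streamlit (st.cache here; newer Streamlit
+ # releases would use st.cache_resource instead).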
+ model_name = "google/tapas-large-finetuned-wtq"
+ @st.cache
+ def load_tapas_model(model_name):
+     tokenizer = TapasTokenizer.from_pretrained(model_name)
+     model = TapasForQuestionAnswering.from_pretrained(model_name, local_files_only=False)
+     pipe = pipeline("table-question-answering", model=model, tokenizer=tokenizer)
+     return pipe
+
+ pipe = load_tapas_model(model_name)
+
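+ # get_answer() returns the raw answer dict from the table-QA pipeline;
+ # convert_answer() reduces its cells to a single number for the SUM,
+ # AVERAGE, and COUNT aggregators and returns the dict unchanged otherwise.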
+ def get_answer(table, query):
+     answers = pipe(table=table, query=query)
+     print(answers['coordinates'])  # FOR DEBUGGING PURPOSES
+     return answers
+
+ def convert_answer(answer):
+     if answer['aggregator'] == 'SUM':
+         print(answer['answer'])  # FOR DEBUGGING
+         cells = answer['cells']
+         converted = sum(float(value.replace(',', '')) for value in cells)
+         return converted
+
+     if answer['aggregator'] == 'AVERAGE':
+         print(answer['answer'])  # FOR DEBUGGING
+         cells = answer['cells']
+         values = [float(value.replace(',', '')) for value in cells]
+         converted = sum(values) / len(values)
+         return converted
+
+     if answer['aggregator'] == 'COUNT':
+         print(answer['answer'])  # FOR DEBUGGING
+         cells = answer['cells']
+         converted = sum(int(value.replace(',', '')) for value in cells)
+         return converted
+
+     else:
+         return answer
+
+ def get_converted_answer(table, query):
+     converted_answer = convert_answer(get_answer(table, query))
+     return converted_answer
 
  st.title("Sales Forecasting Dashboard")
  st.write("📈 Welcome User, start using the application by uploading your file in the sidebar!")
@@ -10,10 +199,10 @@ st.set_page_config(
      page_icon="📈",
      layout="wide",
      initial_sidebar_state="expanded",
- )
+ )

- # if 'uploaded' not in st.session_state:
- # st.session_state.uploaded = 'uploaded'
+ if 'uploaded' not in st.session_state:
+     st.session_state.uploaded = 'uploaded'

  # Sidebar Menu
  with st.sidebar:
@@ -28,11 +217,27 @@ with st.sidebar:
      df = pd.read_csv(uploaded_file, parse_dates=True)
      st.write("Your uploaded data:")
      st.write(df)
+
      # Data pre-processing
- # df = preprocessor.drop(df)
- # df = preprocessor.date_format(df)
- # preprocessor.merge_sort(df)
- # df = preprocessor.group_to_three(df)
- # st.session_state.uploaded = True
+     df = drop(df)
+     df = date_format(df)
+     merge_sort(df)
+     df = group_to_three(df)
+     st.session_state.uploaded = True
+
      with open('sample.csv', 'rb') as f:
- st.download_button("Download our sample CSV", f, file_name='sample.csv')
+         st.download_button("Download our sample CSV", f, file_name='sample.csv')
+
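+ # The chart and forecast button below assume an upload has happened. Note
+ # that disabled=st.session_state.uploaded disables the button once data is
+ # present; disabled=not st.session_state.uploaded may be the intent, but it
+ # is left unchanged here since the forecasting call is still a TODO.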
+ if (st.session_state.uploaded):
+     st.line_chart(df)
+
+     forecast_button_clicked = st.button(
+         'Start Forecasting',
+         key='forecast_button',
+         type="primary",
+         disabled=st.session_state.uploaded,
+     )
+
+     if (forecast_button_clicked):
+         # TODO call arima here
+         pass
modules/__init__.py DELETED
@@ -1 +0,0 @@
- __all__ = ["preprocessor", "arima", "tapas"]

modules/__pycache__/__init__.cpython-311.pyc DELETED
Binary file (236 Bytes)
 
modules/__pycache__/arima.cpython-311.pyc DELETED
Binary file (5.32 kB)
 
modules/__pycache__/preprocessor.cpython-311.pyc DELETED
Binary file (5.09 kB)
 
modules/__pycache__/tapas.cpython-311.pyc DELETED
Binary file (2.91 kB)
 
modules/arima.py DELETED
@@ -1,68 +0,0 @@
- import numpy as np
- import pandas as pd
- from datetime import datetime
- import pmdarima as pm
- from pmdarima import auto_arima
-
- def train_test(dataframe, n):
-     training_y = dataframe.iloc[:-n,0]
-     test_y = dataframe.iloc[-n:,0]
-     test_y_series = pd.Series(test_y, index=dataframe.iloc[-n:, 0].index)
-     training_X = dataframe.iloc[:-n,1:]
-     test_X = dataframe.iloc[-n:,1:]
-     future_X = dataframe.iloc[0:,1:]
-     return (training_y, test_y, test_y_series, training_X, test_X, future_X)
-
- def model_fitting(dataframe, Exo):
-     futureModel = pm.auto_arima(dataframe['Sales'], X=Exo, start_p=1, start_q=1,
-                                 test='adf', min_p=1, min_q=1,
-                                 max_p=3, max_q=3, m=12,
-                                 start_P=0, seasonal=True,
-                                 d=None, D=1, trace=True,
-                                 error_action='ignore',
-                                 suppress_warnings=True,
-                                 stepwise=True)
-     model = futureModel
-     return model
-
- def test_fitting(dataframe, Exo, trainY):
-     trainTestModel = auto_arima(X=Exo, y=trainY, start_p=1, start_q=1,
-                                 test='adf', min_p=1, min_q=1,
-                                 max_p=3, max_q=3, m=12,
-                                 start_P=0, seasonal=True,
-                                 d=None, D=1, trace=True,
-                                 error_action='ignore',
-                                 suppress_warnings=True,
-                                 stepwise=True)
-     model = trainTestModel
-     return model
-
- def forecast_accuracy(forecast, actual):
-     mape = np.mean(np.abs(forecast - actual)/np.abs(actual)).round(4)  # MAPE
-     rmse = (np.mean((forecast - actual)**2)**.5).round(2)  # RMSE
-     corr = np.corrcoef(forecast, actual)[0,1]  # correlation
-     mins = np.amin(np.hstack([forecast[:,None],
-                               actual[:,None]]), axis=1)
-     maxs = np.amax(np.hstack([forecast[:,None],
-                               actual[:,None]]), axis=1)
-     minmax = 1 - np.mean(mins/maxs)  # min-max error
-     return({'mape':mape, 'rmse':rmse, 'corr':corr, 'min-max':minmax})
-
- def sales_growth(dataframe, fittedValues):
-     sales_growth = fittedValues.to_frame()
-     sales_growth = sales_growth.reset_index()
-     sales_growth.columns = ("Date", "Sales")
-     sales_growth = sales_growth.set_index('Date')
-
-     sales_growth['Sales'] = (sales_growth['Sales']).round(2)
-
-     # Calculate and create the column for sales difference and growth
-     sales_growth['Forecasted Sales First Difference'] = (sales_growth['Sales'] - sales_growth['Sales'].shift(1)).round(2)
-     sales_growth['Forecasted Sales Growth'] = (((sales_growth['Sales'] - sales_growth['Sales'].shift(1)) / sales_growth['Sales'].shift(1)) * 100).round(2)
-
-     # Calculate and create the first row for sales difference and growth
-     sales_growth['Forecasted Sales First Difference'].iloc[0] = (dataframe['Sales'].iloc[-1] - dataframe['Sales'].iloc[-2]).round(2)
-     sales_growth['Forecasted Sales Growth'].iloc[0] = (((dataframe['Sales'].iloc[-1] - dataframe['Sales'].iloc[-2]) / dataframe['Sales'].iloc[-1]) * 100).round(2)
-
-     return sales_growth

modules/preprocessor.py DELETED
@@ -1,76 +0,0 @@
- import pandas as pd
- from datetime import datetime
-
- def merge(B, C, A):
-     i = j = k = 0
-
-     # Convert 'Date' columns to datetime.date objects
-     B['Date'] = pd.to_datetime(B['Date']).dt.date
-     C['Date'] = pd.to_datetime(C['Date']).dt.date
-     A['Date'] = pd.to_datetime(A['Date']).dt.date
-
-     while i < len(B) and j < len(C):
-         if B['Date'].iloc[i] <= C['Date'].iloc[j]:
-             A['Date'].iloc[k] = B['Date'].iloc[i]
-             A['Sales'].iloc[k] = B['Sales'].iloc[i]
-             i += 1
-         else:
-             A['Date'].iloc[k] = C['Date'].iloc[j]
-             A['Sales'].iloc[k] = C['Sales'].iloc[j]
-             j += 1
-         k += 1
-
-     while i < len(B):
-         A['Date'].iloc[k] = B['Date'].iloc[i]
-         A['Sales'].iloc[k] = B['Sales'].iloc[i]
-         i += 1
-         k += 1
-
-     while j < len(C):
-         A['Date'].iloc[k] = C['Date'].iloc[j]
-         A['Sales'].iloc[k] = C['Sales'].iloc[j]
-         j += 1
-         k += 1
-
-     return A
-
- def merge_sort(dataframe):
-     if len(dataframe) > 1:
-         center = len(dataframe) // 2
-         left = dataframe.iloc[:center]
-         right = dataframe.iloc[center:]
-         merge_sort(left)
-         merge_sort(right)
-
-         return merge(left, right, dataframe)
-     else:
-         return dataframe
-
- def drop(dataframe):
-     def get_columns_containing(dataframe, substrings):
-         return [col for col in dataframe.columns if any(substring.lower() in col.lower() for substring in substrings)]
-
-     columns_to_keep = get_columns_containing(dataframe, ["date", "sale"])
-     dataframe = dataframe.drop(columns=dataframe.columns.difference(columns_to_keep))
-     dataframe = dataframe.dropna()
-
-     return dataframe
-
- def date_format(dataframe):
-     for i, d, s in dataframe.itertuples():
-         dataframe['Date'][i] = dataframe['Date'][i].strip()
-
-     for i, d, s in dataframe.itertuples():
-         new_date = datetime.strptime(dataframe['Date'][i], "%m/%d/%Y").date()
-         dataframe['Date'][i] = new_date
-
-     return dataframe
-
- def group_to_three(dataframe):
-     dataframe['Date'] = pd.to_datetime(dataframe['Date'])
-     dataframe = dataframe.groupby([pd.Grouper(key='Date', freq='3D')])['Sales'].mean().round(2)
-     dataframe = dataframe.replace(0, pd.np.nan).dropna()
-
-     return dataframe

modules/tapas.py DELETED
@@ -1,43 +0,0 @@
- import torch
- from transformers import pipeline, TapasTokenizer, TapasForQuestionAnswering
-
- model_name = "google/tapas-large-finetuned-wtq"
-
- # load the tokenizer and the model from huggingface model hub
- tokenizer = TapasTokenizer.from_pretrained(model_name)
- model = TapasForQuestionAnswering.from_pretrained(model_name, local_files_only=False)
-
- # load the model and tokenizer into a question-answering pipeline
- pipe = pipeline("table-question-answering", model=model, tokenizer=tokenizer)
-
- def get_answer(table, query):
-     answers = pipe(table=table, query=query)
-     print(answers['coordinates'])  # FOR DEBUGGING PURPOSES
-     return answers
-
- def convert_answer(answer):
-     if answer['aggregator'] == 'SUM':
-         print(answer['answer'])  # FOR DEBUGGING
-         cells = answer['cells']
-         converted = sum(float(value.replace(',', '')) for value in cells)
-         return converted
-
-     if answer['aggregator'] == 'AVERAGE':
-         print(answer['answer'])  # FOR DEBUGGING
-         cells = answer['cells']
-         values = [float(value.replace(',', '')) for value in cells]
-         converted = sum(values) / len(values)
-         return converted
-
-     if answer['aggregator'] == 'COUNT':
-         print(answer['answer'])  # FOR DEBUGGING
-         cells = answer['cells']
-         converted = sum(int(value.replace(',', '')) for value in cells)
-         return converted
-
-     else:
-         return answer
-
- def get_converted_answer(table, query):
-     converted_answer = convert_answer(get_answer(table, query))
-     return converted_answer