YaakovY committed on
Commit
e0d5ca2
1 Parent(s): cbb5bad
Files changed (1) hide show
  1. main.py +220 -2
main.py CHANGED
@@ -21,7 +21,225 @@ def read_root():
21
  }
22
 
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  @app.get("/ticker/{ticker}")
25
- def read_item(ticker: str):
 
 
 
 
 
 
 
26
 
27
- return ticker
 
21
  }
22
 
23
 
24
def get_data(ticker, days_period=300):
    """Download recent hourly price history for ``ticker`` through yfinance.

    Args:
        ticker: Ticker symbol handed to ``yf.Ticker``.
        days_period: Length of the look-back window in days, ending at the
            moment of the call. Defaults to 300.

    Returns:
        The DataFrame produced by ``Ticker.history(..., interval="1h")`` with
        the ``Dividends``, ``Stock Splits`` and ``Volume`` columns removed
        when they are present.
    """
    # Sample "now" once so both ends of the window derive from the same instant.
    end_date = dt.datetime.today()
    start_date = end_date - dt.timedelta(days=days_period)

    # NOTE(review): the data provider may cap how far back hourly bars are
    # available -- confirm before raising days_period substantially.
    df_all = yf.Ticker(ticker).history(
        start=start_date, end=end_date, interval="1h"
    )

    # errors="ignore" keeps the call from raising KeyError when the returned
    # frame lacks one of these columns (e.g. nothing came back for the symbol).
    return df_all.drop(
        columns=["Dividends", "Stock Splits", "Volume"], errors="ignore"
    )
38
+
39
+
40
def get_last_date_missing_hours(df):
    """Pad the most recent day to a full hourly grid and forward-fill it.

    The index is converted to a timezone-naive ``DatetimeIndex``. For the
    latest calendar date present, every hourly timestamp between 09:30 and
    16:00 that is absent is inserted as a row of NaN, after which the rows of
    that date only are forward-filled. Earlier dates are left untouched.

    Args:
        df: DataFrame indexed by datetimes (timezone-aware or naive). It is
            modified in place.

    Returns:
        The same DataFrame, sorted by index. An empty frame is returned
        unchanged apart from the index normalisation.
    """
    # Dropping the timezone keeps the wall-clock time and lets the index be
    # compared against the naive expected range built below.
    df.index = pd.to_datetime(df.index).tz_localize(None)

    # With no rows, index.max() is NaT and has no usable date.
    if df.index.empty:
        return df

    latest_date = df.index.max().date()

    # A Timedelta frequency avoids the string alias "H", which recent pandas
    # versions flag as deprecated.
    expected_hours = pd.date_range(
        start=f"{latest_date} 09:30:00",
        end=f"{latest_date} 16:00:00",
        freq=pd.Timedelta(hours=1),
    )

    on_latest = df.index.date == latest_date
    missing_hours = expected_hours.difference(df.index[on_latest])

    # Insert float NaN rather than pd.NA so numeric columns keep a numeric
    # dtype instead of degrading to object.
    for hour in missing_hours:
        df.loc[hour] = float("nan")

    df.sort_index(inplace=True)

    # Recompute the mask: rows were added and reordered above.
    on_latest = df.index.date == latest_date
    df.loc[on_latest] = df.loc[on_latest].ffill()
    return df
95
+
96
+
97
def prepare_df_for_model(df):
    """Reshape hourly OHLC rows into one feature row per day plus a target.

    Rows stamped 09:30 through 14:30 become the features: their Open, High,
    Low and Close values are pivoted into columns named ``<Feature>_<HH:MM>``.
    The Close of the 15:30 row is attached as ``Close_target``. Days with any
    missing value are dropped.

    Args:
        df: DataFrame with a datetime-convertible index and ``Open``,
            ``High``, ``Low`` and ``Close`` columns. ``Date`` and ``Time``
            helper columns are added to it in place.

    Returns:
        A new DataFrame indexed by day (as a DatetimeIndex).
    """
    df.index = pd.to_datetime(df.index)

    # Helper columns used for filtering and as pivot keys.
    df["Date"] = df.index.date
    df["Time"] = df.index.time

    feature_times = [
        pd.to_datetime(f"{hour}:30:00").time()
        for hour in ("09", "10", "11", "12", "13", "14")
    ]
    target_time = pd.to_datetime("15:30:00").time()

    feature_rows = df[df["Time"].isin(feature_times)]
    target = df.loc[df["Time"] == target_time, ["Date", "Close"]].rename(
        columns={"Close": "Close_target"}
    )

    # One row per day, one column per (feature, time) pair.
    wide = feature_rows.pivot(
        index="Date", columns="Time", values=["Open", "High", "Low", "Close"]
    )
    # Collapse the two-level column labels into flat "Feature_HH:MM" names.
    wide.columns = [
        f"{feature}_{clock.strftime('%H:%M')}" for feature, clock in wide.columns
    ]

    merged = wide.join(target.set_index("Date"))
    # The pivot leaves plain date objects in the index; restore datetimes.
    merged.index = pd.to_datetime(merged.index)

    return merged.dropna()
141
+
142
+
143
def high_low_columns(df_final):
    """Add per-row extremes across the ``High_*`` and ``Low_*`` columns.

    Args:
        df_final: DataFrame whose column names contain ``"High_"`` and
            ``"Low_"`` for the values to aggregate. Modified in place.

    Returns:
        The same DataFrame with ``MAX_high`` (row-wise maximum of the high
        columns) and ``MIN_low`` (row-wise minimum of the low columns) added.
    """
    # Select both groups before adding any new column to the frame.
    highs = df_final[[name for name in df_final.columns if "High_" in name]]
    lows = df_final[[name for name in df_final.columns if "Low_" in name]]

    df_final["MAX_high"] = highs.max(axis=1)
    df_final["MIN_low"] = lows.min(axis=1)
    return df_final
153
+
154
+
155
def calc_percentage_change(df):
    """Express every column as a percentage change from ``Open_09:30``.

    Args:
        df: DataFrame containing an ``Open_09:30`` column. Modified in place;
            its index is converted to datetimes.

    Returns:
        The same DataFrame. ``Open_09:30`` keeps its original values; every
        other column holds ``(value - open) / open * 100``.
    """
    df.index = pd.to_datetime(df.index)

    # The reference column itself is skipped, so it stays constant while the
    # other columns are overwritten one by one.
    for name in [label for label in df.columns if label != "Open_09:30"]:
        opening = df["Open_09:30"]
        df[name] = (df[name] - opening) / opening * 100
    return df
164
+
165
+
166
def create_features(df):
    """Return a copy of ``df`` with calendar features derived from its index.

    Args:
        df: DataFrame with a DatetimeIndex. It is not modified.

    Returns:
        A copy with ``dayofweek``, ``quarter``, ``month``, ``year``,
        ``dayofyear``, ``dayofmonth`` and ``weekofyear`` (ISO week, stored as
        nullable ``Int32``) columns appended in that order.
    """
    enriched = df.copy()
    stamps = enriched.index

    # (new column name, DatetimeIndex attribute it is read from)
    calendar_fields = (
        ("dayofweek", "dayofweek"),
        ("quarter", "quarter"),
        ("month", "month"),
        ("year", "year"),
        ("dayofyear", "dayofyear"),
        ("dayofmonth", "day"),
    )
    for column, attribute in calendar_fields:
        enriched[column] = getattr(stamps, attribute)

    enriched["weekofyear"] = stamps.isocalendar().week.astype("Int32")
    return enriched
180
+
181
+
182
def train_test_split(df, num_test=30):
    """Split ``df`` chronologically into training and test sets.

    The last ``num_test`` rows form the test set and everything before them
    the training set; rows are taken in their existing order, not shuffled.

    Args:
        df: DataFrame containing a ``Close_target`` column. Its index is
            converted to datetimes in place.
        num_test: Number of trailing rows reserved for testing. Defaults to 30.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)`` where the ``X`` parts
        hold every column except ``Close_target`` and the ``y`` parts hold
        ``Close_target``.

    Raises:
        ValueError: If ``num_test`` is smaller than 1, or if ``df`` does not
            have more than ``num_test`` rows (the training set would be empty).
    """
    if num_test < 1:
        raise ValueError(f"num_test must be at least 1, got {num_test}")
    if len(df) <= num_test:
        raise ValueError(
            f"need more than {num_test} rows to split, got {len(df)}"
        )

    df.index = pd.to_datetime(df.index)

    X = df.drop(columns=["Close_target"])
    y = df["Close_target"]

    # Positional slicing: the tail of the frame is the test window.
    return (
        X.iloc[:-num_test],
        y.iloc[:-num_test],
        X.iloc[-num_test:],
        y.iloc[-num_test:],
    )
199
+
200
+
201
def run_xgboost(df):
    """Fit an XGBoost regressor on ``df`` and report its latest prediction.

    The frame is split with ``train_test_split``; the model is trained on the
    training part and evaluated on the test part.

    Args:
        df: Feature DataFrame that includes the ``Close_target`` column.

    Returns:
        A dict with ``"latest_prediction"`` (the prediction for the last test
        row) and ``"RMSE"`` (root mean squared error over the test set), both
        as built-in floats.
    """
    X_train, y_train, X_test, y_test = train_test_split(df)

    # eval_metric and early_stopping_rounds are set on the estimator: passing
    # them to fit() is deprecated and is rejected by xgboost 2.x.
    model = xgb.XGBRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="reg:squarederror",
        eval_metric="rmse",
        early_stopping_rounds=10,
    )

    # NOTE(review): the test set doubles as the early-stopping validation set,
    # so the RMSE reported below is likely optimistic -- consider a separate
    # validation split.
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        verbose=True,
    )

    predictions = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))

    # Convert numpy scalars to built-in floats; the standard JSON encoder
    # cannot serialise numpy scalar types such as float32.
    return {"latest_prediction": float(predictions[-1]), "RMSE": float(rmse)}
233
+
234
+
235
  @app.get("/ticker/{ticker}")
236
def prcess_ticker(ticker: str):
    """Run the full download-to-prediction pipeline for one ticker.

    The price history is fetched with ``get_data`` and then passed through
    each preparation stage in order before ``run_xgboost`` is called.

    Args:
        ticker: Ticker symbol taken from the request path.

    Returns:
        The dict produced by ``run_xgboost``.
    """
    frame = get_data(ticker)

    # Each stage takes the frame produced by the previous one.
    stages = (
        get_last_date_missing_hours,
        prepare_df_for_model,
        high_low_columns,
        calc_percentage_change,
        create_features,
    )
    for stage in stages:
        frame = stage(frame)

    return run_xgboost(frame)