mtzeve commited on
Commit
1f66d66
·
1 Parent(s): 92a94bb
__pycache__/news_preprocessing.cpython-311.pyc ADDED
Binary file (1.86 kB). View file
 
feature_engineering.py DELETED
@@ -1,32 +0,0 @@
1
- # %%
2
- import requests
3
- import pandas as pd
4
- import json
5
- import datetime
6
- import numpy as np
7
- from datetime import timedelta
8
-
9
- # %%
10
- def getNews(api_key,endpoint,ticker,from_date,to_date,num=1000):
11
- # Set the parameters for the request
12
- params = {
13
- "api_token": api_key,
14
- "s": ticker,
15
- "from": from_date,
16
- "to": to_date,
17
- "limit": num,
18
- }
19
-
20
- # Make the request to the API
21
- response = requests.get(endpoint, params=params)
22
-
23
- # Print the response from the API
24
- print(response.json())
25
-
26
- #Return a Pandas dataframe from the response
27
- return pd.DataFrame(response.json())
28
-
29
- # %%
30
-
31
-
32
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
feature_pipeline.ipynb CHANGED
@@ -2,7 +2,7 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 17,
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
@@ -12,20 +12,20 @@
12
  },
13
  {
14
  "cell_type": "code",
15
- "execution_count": 18,
16
  "metadata": {},
17
  "outputs": [
18
  {
19
  "name": "stdout",
20
  "output_type": "stream",
21
  "text": [
22
- " 1. open 2. high 3. low 4. close 5. volume\n",
23
- "date \n",
24
- "2024-04-26 168.85 172.12 166.3700 168.29 109815725.0\n",
25
- "2024-04-25 158.96 170.88 158.3600 170.18 126427521.0\n",
26
- "2024-04-24 162.84 167.97 157.5100 162.13 181178020.0\n",
27
- "2024-04-23 143.33 147.26 141.1100 144.68 124545104.0\n",
28
- "2024-04-22 140.56 144.44 138.8025 142.05 107097564.0\n"
29
  ]
30
  }
31
  ],
@@ -48,53 +48,12 @@
48
  "cell_type": "code",
49
  "execution_count": null,
50
  "metadata": {},
51
- "outputs": [
52
- {
53
- "name": "stdout",
54
- "output_type": "stream",
55
- "text": [
56
- "Feature Group created successfully, explore it at \n",
57
- "https://c.app.hopsworks.ai:443/p/549016/fs/544838/fg/752979\n"
58
- ]
59
- },
60
- {
61
- "data": {
62
- "application/vnd.jupyter.widget-view+json": {
63
- "model_id": "394c6ab7da624ed388df0b9b8bff469a",
64
- "version_major": 2,
65
- "version_minor": 0
66
- },
67
- "text/plain": [
68
- "Uploading Dataframe: 0.00% | | Rows 0/3479 | Elapsed Time: 00:00 | Remaining Time: ?"
69
- ]
70
- },
71
- "metadata": {},
72
- "output_type": "display_data"
73
- },
74
- {
75
- "name": "stdout",
76
- "output_type": "stream",
77
- "text": [
78
- "Launching job: tsla_stock_1_offline_fg_materialization\n",
79
- "Job started successfully, you can follow the progress at \n",
80
- "https://c.app.hopsworks.ai/p/549016/jobs/named/tsla_stock_1_offline_fg_materialization/executions\n"
81
- ]
82
- },
83
- {
84
- "data": {
85
- "text/plain": [
86
- "(<hsfs.core.job.Job at 0x158c80fd0>, None)"
87
- ]
88
- },
89
- "metadata": {},
90
- "output_type": "display_data"
91
- }
92
- ],
93
  "source": []
94
  },
95
  {
96
  "cell_type": "code",
97
- "execution_count": 3,
98
  "metadata": {},
99
  "outputs": [
100
  {
@@ -102,17 +61,17 @@
102
  "output_type": "stream",
103
  "text": [
104
  "<class 'pandas.core.frame.DataFrame'>\n",
105
- "DatetimeIndex: 3479 entries, 2024-04-24 to 2010-06-29\n",
106
  "Data columns (total 5 columns):\n",
107
  " # Column Non-Null Count Dtype \n",
108
  "--- ------ -------------- ----- \n",
109
- " 0 1. open 3479 non-null float64\n",
110
- " 1 2. high 3479 non-null float64\n",
111
- " 2 3. low 3479 non-null float64\n",
112
- " 3 4. close 3479 non-null float64\n",
113
- " 4 5. volume 3479 non-null float64\n",
114
  "dtypes: float64(5)\n",
115
- "memory usage: 163.1 KB\n"
116
  ]
117
  }
118
  ],
@@ -122,7 +81,7 @@
122
  },
123
  {
124
  "cell_type": "code",
125
- "execution_count": 4,
126
  "metadata": {},
127
  "outputs": [
128
  {
@@ -130,12 +89,12 @@
130
  "text/plain": [
131
  "{'1. Information': 'Daily Prices (open, high, low, close) and Volumes',\n",
132
  " '2. Symbol': 'TSLA',\n",
133
- " '3. Last Refreshed': '2024-04-24',\n",
134
  " '4. Output Size': 'Full size',\n",
135
  " '5. Time Zone': 'US/Eastern'}"
136
  ]
137
  },
138
- "execution_count": 4,
139
  "metadata": {},
140
  "output_type": "execute_result"
141
  }
@@ -146,7 +105,7 @@
146
  },
147
  {
148
  "cell_type": "code",
149
- "execution_count": 5,
150
  "metadata": {},
151
  "outputs": [
152
  {
@@ -169,19 +128,19 @@
169
  },
170
  {
171
  "cell_type": "code",
172
- "execution_count": 6,
173
  "metadata": {},
174
  "outputs": [
175
  {
176
  "name": "stdout",
177
  "output_type": "stream",
178
  "text": [
179
- " date 1. open 2. high 3. low 4. close 5. volume\n",
180
- "0 2024-04-24 162.84 167.97 157.5100 162.13 181178020.0\n",
181
- "1 2024-04-23 143.33 147.26 141.1100 144.68 124545104.0\n",
182
- "2 2024-04-22 140.56 144.44 138.8025 142.05 107097564.0\n",
183
- "3 2024-04-19 148.97 150.94 146.2200 147.05 87074500.0\n",
184
- "4 2024-04-18 151.25 152.20 148.7000 149.93 96098830.0\n"
185
  ]
186
  }
187
  ],
@@ -193,7 +152,7 @@
193
  },
194
  {
195
  "cell_type": "code",
196
- "execution_count": 7,
197
  "metadata": {},
198
  "outputs": [
199
  {
@@ -232,7 +191,7 @@
232
  },
233
  {
234
  "cell_type": "code",
235
- "execution_count": 8,
236
  "metadata": {},
237
  "outputs": [],
238
  "source": [
@@ -241,7 +200,7 @@
241
  },
242
  {
243
  "cell_type": "code",
244
- "execution_count": 9,
245
  "metadata": {},
246
  "outputs": [],
247
  "source": [
@@ -253,7 +212,7 @@
253
  },
254
  {
255
  "cell_type": "code",
256
- "execution_count": 10,
257
  "metadata": {},
258
  "outputs": [
259
  {
@@ -288,48 +247,48 @@
288
  " <tbody>\n",
289
  " <tr>\n",
290
  " <th>0</th>\n",
291
- " <td>2024-04-24</td>\n",
292
- " <td>162.84</td>\n",
293
- " <td>167.9700</td>\n",
294
- " <td>157.5100</td>\n",
295
- " <td>162.13</td>\n",
296
- " <td>181178020.0</td>\n",
297
  " </tr>\n",
298
  " <tr>\n",
299
  " <th>1</th>\n",
300
- " <td>2024-04-23</td>\n",
301
- " <td>143.33</td>\n",
302
- " <td>147.2600</td>\n",
303
- " <td>141.1100</td>\n",
304
- " <td>144.68</td>\n",
305
- " <td>124545104.0</td>\n",
306
  " </tr>\n",
307
  " <tr>\n",
308
  " <th>2</th>\n",
309
- " <td>2024-04-22</td>\n",
310
- " <td>140.56</td>\n",
311
- " <td>144.4400</td>\n",
312
- " <td>138.8025</td>\n",
313
- " <td>142.05</td>\n",
314
- " <td>107097564.0</td>\n",
315
  " </tr>\n",
316
  " <tr>\n",
317
  " <th>3</th>\n",
318
- " <td>2024-04-19</td>\n",
319
- " <td>148.97</td>\n",
320
- " <td>150.9400</td>\n",
321
- " <td>146.2200</td>\n",
322
- " <td>147.05</td>\n",
323
- " <td>87074500.0</td>\n",
324
  " </tr>\n",
325
  " <tr>\n",
326
  " <th>4</th>\n",
327
- " <td>2024-04-18</td>\n",
328
- " <td>151.25</td>\n",
329
- " <td>152.2000</td>\n",
330
- " <td>148.7000</td>\n",
331
- " <td>149.93</td>\n",
332
- " <td>96098830.0</td>\n",
333
  " </tr>\n",
334
  " <tr>\n",
335
  " <th>...</th>\n",
@@ -341,73 +300,73 @@
341
  " <td>...</td>\n",
342
  " </tr>\n",
343
  " <tr>\n",
344
- " <th>3474</th>\n",
345
  " <td>2010-07-06</td>\n",
346
  " <td>20.00</td>\n",
347
  " <td>20.0000</td>\n",
348
- " <td>15.8300</td>\n",
349
  " <td>16.11</td>\n",
350
  " <td>6866900.0</td>\n",
351
  " </tr>\n",
352
  " <tr>\n",
353
- " <th>3475</th>\n",
354
  " <td>2010-07-02</td>\n",
355
  " <td>23.00</td>\n",
356
  " <td>23.1000</td>\n",
357
- " <td>18.7100</td>\n",
358
  " <td>19.20</td>\n",
359
  " <td>5139800.0</td>\n",
360
  " </tr>\n",
361
  " <tr>\n",
362
- " <th>3476</th>\n",
363
  " <td>2010-07-01</td>\n",
364
  " <td>25.00</td>\n",
365
  " <td>25.9200</td>\n",
366
- " <td>20.2700</td>\n",
367
  " <td>21.96</td>\n",
368
  " <td>8218800.0</td>\n",
369
  " </tr>\n",
370
  " <tr>\n",
371
- " <th>3477</th>\n",
372
  " <td>2010-06-30</td>\n",
373
  " <td>25.79</td>\n",
374
  " <td>30.4192</td>\n",
375
- " <td>23.3000</td>\n",
376
  " <td>23.83</td>\n",
377
  " <td>17187100.0</td>\n",
378
  " </tr>\n",
379
  " <tr>\n",
380
- " <th>3478</th>\n",
381
  " <td>2010-06-29</td>\n",
382
  " <td>19.00</td>\n",
383
  " <td>25.0000</td>\n",
384
- " <td>17.5400</td>\n",
385
  " <td>23.89</td>\n",
386
  " <td>18766300.0</td>\n",
387
  " </tr>\n",
388
  " </tbody>\n",
389
  "</table>\n",
390
- "<p>3479 rows × 6 columns</p>\n",
391
  "</div>"
392
  ],
393
  "text/plain": [
394
- " date 1. open 2. high 3. low 4. close 5. volume\n",
395
- "0 2024-04-24 162.84 167.9700 157.5100 162.13 181178020.0\n",
396
- "1 2024-04-23 143.33 147.2600 141.1100 144.68 124545104.0\n",
397
- "2 2024-04-22 140.56 144.4400 138.8025 142.05 107097564.0\n",
398
- "3 2024-04-19 148.97 150.9400 146.2200 147.05 87074500.0\n",
399
- "4 2024-04-18 151.25 152.2000 148.7000 149.93 96098830.0\n",
400
- "... ... ... ... ... ... ...\n",
401
- "3474 2010-07-06 20.00 20.0000 15.8300 16.11 6866900.0\n",
402
- "3475 2010-07-02 23.00 23.1000 18.7100 19.20 5139800.0\n",
403
- "3476 2010-07-01 25.00 25.9200 20.2700 21.96 8218800.0\n",
404
- "3477 2010-06-30 25.79 30.4192 23.3000 23.83 17187100.0\n",
405
- "3478 2010-06-29 19.00 25.0000 17.5400 23.89 18766300.0\n",
406
  "\n",
407
- "[3479 rows x 6 columns]"
408
  ]
409
  },
410
- "execution_count": 10,
411
  "metadata": {},
412
  "output_type": "execute_result"
413
  }
@@ -418,7 +377,7 @@
418
  },
419
  {
420
  "cell_type": "code",
421
- "execution_count": 11,
422
  "metadata": {},
423
  "outputs": [],
424
  "source": [
@@ -428,7 +387,7 @@
428
  },
429
  {
430
  "cell_type": "code",
431
- "execution_count": 12,
432
  "metadata": {},
433
  "outputs": [
434
  {
@@ -445,7 +404,7 @@
445
  },
446
  {
447
  "cell_type": "code",
448
- "execution_count": 13,
449
  "metadata": {},
450
  "outputs": [],
451
  "source": [
@@ -461,18 +420,18 @@
461
  },
462
  {
463
  "cell_type": "code",
464
- "execution_count": 14,
465
  "metadata": {},
466
  "outputs": [
467
  {
468
  "data": {
469
  "application/vnd.jupyter.widget-view+json": {
470
- "model_id": "91ef74ded4714a1492bdc24b176c4f1e",
471
  "version_major": 2,
472
  "version_minor": 0
473
  },
474
  "text/plain": [
475
- "Uploading Dataframe: 0.00% | | Rows 0/3479 | Elapsed Time: 00:00 | Remaining Time: ?"
476
  ]
477
  },
478
  "metadata": {},
@@ -490,10 +449,10 @@
490
  {
491
  "data": {
492
  "text/plain": [
493
- "(<hsfs.core.job.Job at 0x177b01510>, None)"
494
  ]
495
  },
496
- "execution_count": 14,
497
  "metadata": {},
498
  "output_type": "execute_result"
499
  }
@@ -504,10 +463,65 @@
504
  },
505
  {
506
  "cell_type": "code",
507
- "execution_count": null,
508
  "metadata": {},
509
- "outputs": [],
510
- "source": []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
511
  }
512
  ],
513
  "metadata": {
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 2,
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
 
12
  },
13
  {
14
  "cell_type": "code",
15
+ "execution_count": 3,
16
  "metadata": {},
17
  "outputs": [
18
  {
19
  "name": "stdout",
20
  "output_type": "stream",
21
  "text": [
22
+ " 1. open 2. high 3. low 4. close 5. volume\n",
23
+ "date \n",
24
+ "2024-04-29 188.42 198.87 184.54 194.05 243869678.0\n",
25
+ "2024-04-26 168.85 172.12 166.37 168.29 109815725.0\n",
26
+ "2024-04-25 158.96 170.88 158.36 170.18 126427521.0\n",
27
+ "2024-04-24 162.84 167.97 157.51 162.13 181178020.0\n",
28
+ "2024-04-23 143.33 147.26 141.11 144.68 124545104.0\n"
29
  ]
30
  }
31
  ],
 
48
  "cell_type": "code",
49
  "execution_count": null,
50
  "metadata": {},
51
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  "source": []
53
  },
54
  {
55
  "cell_type": "code",
56
+ "execution_count": 4,
57
  "metadata": {},
58
  "outputs": [
59
  {
 
61
  "output_type": "stream",
62
  "text": [
63
  "<class 'pandas.core.frame.DataFrame'>\n",
64
+ "DatetimeIndex: 3482 entries, 2024-04-29 to 2010-06-29\n",
65
  "Data columns (total 5 columns):\n",
66
  " # Column Non-Null Count Dtype \n",
67
  "--- ------ -------------- ----- \n",
68
+ " 0 1. open 3482 non-null float64\n",
69
+ " 1 2. high 3482 non-null float64\n",
70
+ " 2 3. low 3482 non-null float64\n",
71
+ " 3 4. close 3482 non-null float64\n",
72
+ " 4 5. volume 3482 non-null float64\n",
73
  "dtypes: float64(5)\n",
74
+ "memory usage: 163.2 KB\n"
75
  ]
76
  }
77
  ],
 
81
  },
82
  {
83
  "cell_type": "code",
84
+ "execution_count": 5,
85
  "metadata": {},
86
  "outputs": [
87
  {
 
89
  "text/plain": [
90
  "{'1. Information': 'Daily Prices (open, high, low, close) and Volumes',\n",
91
  " '2. Symbol': 'TSLA',\n",
92
+ " '3. Last Refreshed': '2024-04-29',\n",
93
  " '4. Output Size': 'Full size',\n",
94
  " '5. Time Zone': 'US/Eastern'}"
95
  ]
96
  },
97
+ "execution_count": 5,
98
  "metadata": {},
99
  "output_type": "execute_result"
100
  }
 
105
  },
106
  {
107
  "cell_type": "code",
108
+ "execution_count": 6,
109
  "metadata": {},
110
  "outputs": [
111
  {
 
128
  },
129
  {
130
  "cell_type": "code",
131
+ "execution_count": 7,
132
  "metadata": {},
133
  "outputs": [
134
  {
135
  "name": "stdout",
136
  "output_type": "stream",
137
  "text": [
138
+ " date 1. open 2. high 3. low 4. close 5. volume\n",
139
+ "0 2024-04-29 188.42 198.87 184.54 194.05 243869678.0\n",
140
+ "1 2024-04-26 168.85 172.12 166.37 168.29 109815725.0\n",
141
+ "2 2024-04-25 158.96 170.88 158.36 170.18 126427521.0\n",
142
+ "3 2024-04-24 162.84 167.97 157.51 162.13 181178020.0\n",
143
+ "4 2024-04-23 143.33 147.26 141.11 144.68 124545104.0\n"
144
  ]
145
  }
146
  ],
 
152
  },
153
  {
154
  "cell_type": "code",
155
+ "execution_count": 8,
156
  "metadata": {},
157
  "outputs": [
158
  {
 
191
  },
192
  {
193
  "cell_type": "code",
194
+ "execution_count": 9,
195
  "metadata": {},
196
  "outputs": [],
197
  "source": [
 
200
  },
201
  {
202
  "cell_type": "code",
203
+ "execution_count": 10,
204
  "metadata": {},
205
  "outputs": [],
206
  "source": [
 
212
  },
213
  {
214
  "cell_type": "code",
215
+ "execution_count": 11,
216
  "metadata": {},
217
  "outputs": [
218
  {
 
247
  " <tbody>\n",
248
  " <tr>\n",
249
  " <th>0</th>\n",
250
+ " <td>2024-04-29</td>\n",
251
+ " <td>188.42</td>\n",
252
+ " <td>198.8700</td>\n",
253
+ " <td>184.54</td>\n",
254
+ " <td>194.05</td>\n",
255
+ " <td>243869678.0</td>\n",
256
  " </tr>\n",
257
  " <tr>\n",
258
  " <th>1</th>\n",
259
+ " <td>2024-04-26</td>\n",
260
+ " <td>168.85</td>\n",
261
+ " <td>172.1200</td>\n",
262
+ " <td>166.37</td>\n",
263
+ " <td>168.29</td>\n",
264
+ " <td>109815725.0</td>\n",
265
  " </tr>\n",
266
  " <tr>\n",
267
  " <th>2</th>\n",
268
+ " <td>2024-04-25</td>\n",
269
+ " <td>158.96</td>\n",
270
+ " <td>170.8800</td>\n",
271
+ " <td>158.36</td>\n",
272
+ " <td>170.18</td>\n",
273
+ " <td>126427521.0</td>\n",
274
  " </tr>\n",
275
  " <tr>\n",
276
  " <th>3</th>\n",
277
+ " <td>2024-04-24</td>\n",
278
+ " <td>162.84</td>\n",
279
+ " <td>167.9700</td>\n",
280
+ " <td>157.51</td>\n",
281
+ " <td>162.13</td>\n",
282
+ " <td>181178020.0</td>\n",
283
  " </tr>\n",
284
  " <tr>\n",
285
  " <th>4</th>\n",
286
+ " <td>2024-04-23</td>\n",
287
+ " <td>143.33</td>\n",
288
+ " <td>147.2600</td>\n",
289
+ " <td>141.11</td>\n",
290
+ " <td>144.68</td>\n",
291
+ " <td>124545104.0</td>\n",
292
  " </tr>\n",
293
  " <tr>\n",
294
  " <th>...</th>\n",
 
300
  " <td>...</td>\n",
301
  " </tr>\n",
302
  " <tr>\n",
303
+ " <th>3477</th>\n",
304
  " <td>2010-07-06</td>\n",
305
  " <td>20.00</td>\n",
306
  " <td>20.0000</td>\n",
307
+ " <td>15.83</td>\n",
308
  " <td>16.11</td>\n",
309
  " <td>6866900.0</td>\n",
310
  " </tr>\n",
311
  " <tr>\n",
312
+ " <th>3478</th>\n",
313
  " <td>2010-07-02</td>\n",
314
  " <td>23.00</td>\n",
315
  " <td>23.1000</td>\n",
316
+ " <td>18.71</td>\n",
317
  " <td>19.20</td>\n",
318
  " <td>5139800.0</td>\n",
319
  " </tr>\n",
320
  " <tr>\n",
321
+ " <th>3479</th>\n",
322
  " <td>2010-07-01</td>\n",
323
  " <td>25.00</td>\n",
324
  " <td>25.9200</td>\n",
325
+ " <td>20.27</td>\n",
326
  " <td>21.96</td>\n",
327
  " <td>8218800.0</td>\n",
328
  " </tr>\n",
329
  " <tr>\n",
330
+ " <th>3480</th>\n",
331
  " <td>2010-06-30</td>\n",
332
  " <td>25.79</td>\n",
333
  " <td>30.4192</td>\n",
334
+ " <td>23.30</td>\n",
335
  " <td>23.83</td>\n",
336
  " <td>17187100.0</td>\n",
337
  " </tr>\n",
338
  " <tr>\n",
339
+ " <th>3481</th>\n",
340
  " <td>2010-06-29</td>\n",
341
  " <td>19.00</td>\n",
342
  " <td>25.0000</td>\n",
343
+ " <td>17.54</td>\n",
344
  " <td>23.89</td>\n",
345
  " <td>18766300.0</td>\n",
346
  " </tr>\n",
347
  " </tbody>\n",
348
  "</table>\n",
349
+ "<p>3482 rows × 6 columns</p>\n",
350
  "</div>"
351
  ],
352
  "text/plain": [
353
+ " date 1. open 2. high 3. low 4. close 5. volume\n",
354
+ "0 2024-04-29 188.42 198.8700 184.54 194.05 243869678.0\n",
355
+ "1 2024-04-26 168.85 172.1200 166.37 168.29 109815725.0\n",
356
+ "2 2024-04-25 158.96 170.8800 158.36 170.18 126427521.0\n",
357
+ "3 2024-04-24 162.84 167.9700 157.51 162.13 181178020.0\n",
358
+ "4 2024-04-23 143.33 147.2600 141.11 144.68 124545104.0\n",
359
+ "... ... ... ... ... ... ...\n",
360
+ "3477 2010-07-06 20.00 20.0000 15.83 16.11 6866900.0\n",
361
+ "3478 2010-07-02 23.00 23.1000 18.71 19.20 5139800.0\n",
362
+ "3479 2010-07-01 25.00 25.9200 20.27 21.96 8218800.0\n",
363
+ "3480 2010-06-30 25.79 30.4192 23.30 23.83 17187100.0\n",
364
+ "3481 2010-06-29 19.00 25.0000 17.54 23.89 18766300.0\n",
365
  "\n",
366
+ "[3482 rows x 6 columns]"
367
  ]
368
  },
369
+ "execution_count": 11,
370
  "metadata": {},
371
  "output_type": "execute_result"
372
  }
 
377
  },
378
  {
379
  "cell_type": "code",
380
+ "execution_count": 12,
381
  "metadata": {},
382
  "outputs": [],
383
  "source": [
 
387
  },
388
  {
389
  "cell_type": "code",
390
+ "execution_count": 13,
391
  "metadata": {},
392
  "outputs": [
393
  {
 
404
  },
405
  {
406
  "cell_type": "code",
407
+ "execution_count": 14,
408
  "metadata": {},
409
  "outputs": [],
410
  "source": [
 
420
  },
421
  {
422
  "cell_type": "code",
423
+ "execution_count": 15,
424
  "metadata": {},
425
  "outputs": [
426
  {
427
  "data": {
428
  "application/vnd.jupyter.widget-view+json": {
429
+ "model_id": "ae6a0214d34943cabcdd66d70198ae3a",
430
  "version_major": 2,
431
  "version_minor": 0
432
  },
433
  "text/plain": [
434
+ "Uploading Dataframe: 0.00% | | Rows 0/3482 | Elapsed Time: 00:00 | Remaining Time: ?"
435
  ]
436
  },
437
  "metadata": {},
 
449
  {
450
  "data": {
451
  "text/plain": [
452
+ "(<hsfs.core.job.Job at 0x162ac3e50>, None)"
453
  ]
454
  },
455
+ "execution_count": 15,
456
  "metadata": {},
457
  "output_type": "execute_result"
458
  }
 
463
  },
464
  {
465
  "cell_type": "code",
466
+ "execution_count": 18,
467
  "metadata": {},
468
+ "outputs": [
469
+ {
470
+ "name": "stdout",
471
+ "output_type": "stream",
472
+ "text": [
473
+ "Feature Group created successfully, explore it at \n",
474
+ "https://c.app.hopsworks.ai:443/p/549016/fs/544838/fg/766341\n"
475
+ ]
476
+ },
477
+ {
478
+ "data": {
479
+ "application/vnd.jupyter.widget-view+json": {
480
+ "model_id": "74f0d70aeb3942c093321c530120434e",
481
+ "version_major": 2,
482
+ "version_minor": 0
483
+ },
484
+ "text/plain": [
485
+ "Uploading Dataframe: 0.00% | | Rows 0/712 | Elapsed Time: 00:00 | Remaining Time: ?"
486
+ ]
487
+ },
488
+ "metadata": {},
489
+ "output_type": "display_data"
490
+ },
491
+ {
492
+ "name": "stdout",
493
+ "output_type": "stream",
494
+ "text": [
495
+ "Launching job: news_sentiment_1_offline_fg_materialization\n",
496
+ "Job started successfully, you can follow the progress at \n",
497
+ "https://c.app.hopsworks.ai/p/549016/jobs/named/news_sentiment_1_offline_fg_materialization/executions\n"
498
+ ]
499
+ },
500
+ {
501
+ "data": {
502
+ "text/plain": [
503
+ "(<hsfs.core.job.Job at 0x164180710>, None)"
504
+ ]
505
+ },
506
+ "execution_count": 18,
507
+ "metadata": {},
508
+ "output_type": "execute_result"
509
+ }
510
+ ],
511
+ "source": [
512
+ "# Create feature group for historical news data\n",
513
+ "news_df = pd.read_csv('/Users/manos/Documents/BDS/MLops_mod/news_articles.csv')\n",
514
+ "\n",
515
+ "news_sentiment_fg = fs.get_or_create_feature_group(\n",
516
+ " name='news_sentiment',\n",
517
+ " description='News sentiment from Polygon',\n",
518
+ " version=1,\n",
519
+ " primary_key=['date'],\n",
520
+ " online_enabled=True,\n",
521
+ ")\n",
522
+ "\n",
523
+ "news_sentiment_fg.insert(news_df)"
524
+ ]
525
  }
526
  ],
527
  "metadata": {
feature_pipeline.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # %%
2
+ from dotenv import load_dotenv
3
+ import os
4
+
5
+ # %%
6
+ from alpha_vantage.timeseries import TimeSeries
7
+ import pandas as pd
8
+
9
+ load_dotenv()
10
+
11
+ api_key = os.environ.get('stocks_api') # Replace this with your actual API key
12
+ ts = TimeSeries(key=api_key, output_format='pandas')
13
+
14
+ # Fetch daily adjusted stock prices; adjust the symbol as needed
15
+ data, meta_data = ts.get_daily(symbol='TSLA', outputsize='full')
16
+
17
+ print(data.head())
18
+
19
+ # %%
20
+
21
+
22
+ # %%
23
+ data.info()
24
+
25
+ # %%
26
+ meta_data
27
+
28
+ # %%
29
+ # Define your file path and name
30
+ file_path = '/Users/manos/Documents/BDS/MLops_mod/TSLA_stock_price.csv' # Customize the path and filename
31
+
32
+ # Save the DataFrame to CSV
33
+ data.to_csv(file_path)
34
+
35
+ print(f"Data saved to {file_path}")
36
+
37
+
38
+ # %%
39
+ # Load and display the data from CSV to confirm
40
+ tsla_df = pd.read_csv(file_path)
41
+ print(tsla_df.head())
42
+
43
+
44
+ # %%
45
+ import hopsworks
46
+
47
+ project = hopsworks.login()
48
+ fs = project.get_feature_store()
49
+
50
+
51
+ # %%
52
+ import re
53
+
54
+ # %%
55
+ def clean_column_name(name):
56
+ # Remove all non-letter characters
57
+ cleaned_name = re.sub(r'[^a-zA-Z]', '', name)
58
+ return cleaned_name
59
+
60
+
61
+ # %%
62
+ tsla_df
63
+
64
+ # %%
65
+ # Assuming 'tsla_df' is your DataFrame
66
+ tsla_df.columns = [clean_column_name(col) for col in tsla_df.columns]
67
+
68
+
69
+ # %%
70
+ print(tsla_df.columns)
71
+
72
+
73
+ # %%
74
+ # Define a feature group
75
+ tesla_fg = fs.get_or_create_feature_group(
76
+ name="tsla_stock",
77
+ description="Tesla stock dataset from alpha vantage",
78
+ version=1,
79
+ primary_key=["date"],
80
+ online_enabled=True,
81
+ )
82
+
83
+ # %%
84
+ tesla_fg.insert(tsla_df, write_options={"wait_for_job" : False})
85
+
86
+ # %%
87
+ # Create feature group for historical news data
88
+ news_df = pd.read_csv('/Users/manos/Documents/BDS/MLops_mod/news_articles.csv')
89
+
90
+ news_sentiment_fg = fs.get_or_create_feature_group(
91
+ name='news_sentiment',
92
+ description='News sentiment from Polygon',
93
+ version=1,
94
+ primary_key=['date'],
95
+ online_enabled=True,
96
+ )
97
+
98
+ news_sentiment_fg.insert(news_df)
99
+
100
+
feature_preprocessing.ipynb DELETED
@@ -1,116 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 44,
6
- "metadata": {},
7
- "outputs": [],
8
- "source": [
9
- "from calendar import monthrange\n",
10
- "from feature_engineering import *\n",
11
- "import glob\n",
12
- "import pandas as pd\n",
13
- "from dotenv import load_dotenv\n",
14
- "import os\n",
15
- "\n",
16
- "load_dotenv()\n",
17
- "\n",
18
- "# Set the API endpoint and your API key\n",
19
- "endpoint = \"https://api.marketaux.com/v1/news/all?symbols=TSLA&filter_entities=true&published_after=2021&language=en&api_token=iy6rRX4oxFrouZocXr8JNpOzaxZLk3UvMfoMGxYs\"\n",
20
- "api_key = os.environ.get('news_api')\n",
21
- "\n",
22
- "# Set the ticker symbol\n",
23
- "ticker = \"TSLA\" #TSLA"
24
- ]
25
- },
26
- {
27
- "cell_type": "code",
28
- "execution_count": 34,
29
- "metadata": {},
30
- "outputs": [],
31
- "source": [
32
- "def getNews_historical(api_key,endpoint,ticker,year,month,num=1000):\n",
33
- " \n",
34
- " for start,end in zip([1,15],[16,monthrange(year, month)[1]]):\n",
35
- " \n",
36
- " from_date = '{}-{:02d}-{:02d}'.format(year,month,start)\n",
37
- " to_date = '{}-{:02d}-{:02d}'.format(year,month,end)\n",
38
- " \n",
39
- " print('Grabbing News data between {}-{}'.format(from_date,to_date)) \n",
40
- " news = getNews(api_key,endpoint,ticker,from_date,to_date)\n",
41
- " \n",
42
- " print('Number of articles: ',len(news.index))\n",
43
- " news.head(n=num)\n",
44
- "\n",
45
- " # Store the dataframe as a CSV file\n",
46
- " news.to_csv(\"/Users/manos/Documents/BDS/MLops_mod/TSLA_news_{}_to_{}.csv\".format(from_date,to_date))"
47
- ]
48
- },
49
- {
50
- "cell_type": "code",
51
- "execution_count": 36,
52
- "metadata": {},
53
- "outputs": [
54
- {
55
- "name": "stdout",
56
- "output_type": "stream",
57
- "text": [
58
- "{'meta': {'found': 58203, 'returned': 3, 'limit': 3, 'page': 1}, 'data': [{'uuid': 'a2f5f0e0-937a-4333-9aa7-da32fb0ede1f', 'title': \"What's next for Big Tech? See what SA analysts have to say\", 'description': 'Technology stocks have dropped over the past couple of weeks. See what SA analysts have to say about the overall state of tech and the economy.', 'keywords': '', 'snippet': 'Technology stocks have dropped over the past couple of weeks, and it was further seen with the selloff in Meta Platforms (META) and weak GDP data, as the two ac...', 'url': 'https://seekingalpha.com/news/4094186-tech-stocks-dive-see-what-sa-analysts-have-to-say', 'image_url': 'https://static.seekingalpha.com/cdn/s3/uploads/getty_images/184997191/image_184997191.jpg?io=getty-c-w750', 'language': 'en', 'published_at': '2024-04-26T12:20:54.000000Z', 'source': 'seekingalpha.com', 'relevance_score': None, 'entities': [{'symbol': 'TSLA', 'name': 'Tesla, Inc.', 'exchange': None, 'exchange_long': None, 'country': 'us', 'type': 'equity', 'industry': 'Consumer Cyclical', 'match_score': 11.309888, 'sentiment_score': 0, 'highlights': [{'highlight': '<em>Tesla</em> (TSLA) -3.5% .\\n\\nTech ETFs', 'sentiment': 0, 'highlighted_in': 'main_text'}]}], 'similar': []}, {'uuid': '650adf2f-d62f-478d-9322-05d3e7d7532d', 'title': 'Stellantis And Tesla: Combine These Stocks For The Ultimate Automotive Portfolio (STLA)', 'description': 'Tesla and Stellantis are two automakers that complement each other. Find out why I see both STLA and TSLA stocks as currently undervalued.', 'keywords': '', 'snippet': 'Tramino/iStock Unreleased via Getty Images\\n\\nStellantis N.V. (NYSE:STLA) and Tesla, Inc. 
(TSLA) are two very distinct automakers that, in my view, perfectly comp...', 'url': 'https://seekingalpha.com/article/4686610-stellantis-tesla-combine-these-stocks-for-ultimate-automotive-portfolio', 'image_url': 'https://static.seekingalpha.com/cdn/s3/uploads/getty_images/1305717707/image_1305717707.jpg?io=getty-c-w1536', 'language': 'en', 'published_at': '2024-04-26T10:58:06.000000Z', 'source': 'seekingalpha.com', 'relevance_score': None, 'entities': [{'symbol': 'TSLA', 'name': 'Tesla, Inc.', 'exchange': None, 'exchange_long': None, 'country': 'us', 'type': 'equity', 'industry': 'Consumer Cyclical', 'match_score': 22.866589, 'sentiment_score': 0.173982, 'highlights': [{'highlight': '(NYSE:STLA) and <em>Tesla</em>, <em>Inc</em>. (TSLA) are two very distinct automakers that, in my view, perfectly complement each other. By entering a 50/50 balanced position in the two companies, you can create an “artificial” automaker in your portfolio that is bound to dominate the industry and provide superior returns for shareholders.', 'sentiment': 0.8519, 'highlighted_in': 'main_text'}, {'highlight': 'The brands of “TESSA” include:\\n\\n<em>Tesla</em>, the leading global EV brand and #1 most valuable car brand in the world. Because of Tesla’s aggressive price policy lately, I believe it makes almost no economic sense to buy an EV that is not a <em>Tesla</em>, for the majority of consumers. More on this shortly.', 'sentiment': 0.2089, 'highlighted_in': 'main_text'}, {'highlight': 'The two overall car brand portfolios encompass all market segments\\n\\nGoing beyond EVs, I see “TESSA’s” car portfolio to cover all segments, again because of the complementarity of <em>Tesla</em> and Stellantis. 
The below chart outlines how all car segments are covered by either <em>Tesla</em> or Stellantis.', 'sentiment': 0, 'highlighted_in': 'main_text'}, {'highlight': 'To be fair, both Stellantis and <em>Tesla</em> margins declined in 2023, and in the case of <em>Tesla</em>, the company just reported that margins are now down to 5.5% after Q1 price cuts.\\n\\nHowever, I believe that Tesla’s margins at the moment do not tell the full story.', 'sentiment': 0.0258, 'highlighted_in': 'main_text'}, {'highlight': 'It is precisely because it enjoyed a 25%+ operating margin back in 2021 that <em>Tesla</em> was able to grow its company with aggressive pricing in the past 2 years.\\n\\nToday, for the majority of use cases, I believe buying an EV that is not a <em>Tesla</em> does not make rational sense.', 'sentiment': 0.4019, 'highlighted_in': 'main_text'}, {'highlight': 'These are cars that have starting prices that are significantly higher than <em>Tesla</em>, but with worse reviews, worse technology and limited access to Tesla’s SuperCharger system. 
Even EV-native car brands, such as Rivian and Polestar, have difficulty in competing with <em>Tesla</em>.', 'sentiment': -0.9294, 'highlighted_in': 'main_text'}, {'highlight': 'A Rivian R2 starts at $45,000, which is almost $7,000 more than the base <em>Tesla</em> Model 3.\\n\\nI believe that <em>Tesla</em> is using its margins to grow the EV category, converting ICE consumers, and simultaneously gain the monster share of that growing market.', 'sentiment': 0.743, 'highlighted_in': 'main_text'}, {'highlight': \"Key Financial Metrics for <em>Tesla</em>, Q1 24 (Tesla's Q1 Shareholders Presentation)\\n\\nKey Financial Metrics for Stellantis, Q1 24 (Stellantis' Q1 Shareholder Presentation)\\n\\nThis financial data tells the same story: <em>Tesla</em> and Stellantis complement each other.\", 'sentiment': 0, 'highlighted_in': 'main_text'}, {'highlight': 'VOO since 2021 (Seeking Alpha)\\n\\nWhat matters for my thesis is that <em>Tesla</em> and Stellantis are complementary in how they reward shareholders and how the market prices their stocks. 
<em>Tesla</em> is a tech company, looking at the long term, and shareholders need to be patient to see returns.', 'sentiment': 0.5859, 'highlighted_in': 'main_text'}, {'highlight': 'In that case, <em>Tesla</em> might generate returns significantly higher than Stellantis, to the point that it would have seemed silly to “dilute” a <em>Tesla</em> investment with another stock.', 'sentiment': 0.0258, 'highlighted_in': 'main_text'}, {'highlight': 'Stellantis And <em>Tesla</em>: Combine These Stocks For The Ultimate Automotive Portfolio (STLA)', 'sentiment': 0, 'highlighted_in': 'title'}]}], 'similar': []}, {'uuid': '47a58bd4-3a8d-40fe-8a89-934d0d695ea4', 'title': 'Tesla is being investigated by the NHTSA for Autopilot software fix (NASDAQ:TSLA)', 'description': \"The National Highway Traffic Safety Administration is investigating whether Tesla's recall of 2 million vehicles for Autopilot safeguards is sufficient.\", 'keywords': '', 'snippet': \"The National Highway Traffic Safety Administration confirmed on Friday that the safety regulator has opened an investigation into whether Tesla's (NASDAQ:TSLA) ...\", 'url': 'https://seekingalpha.com/news/4094754-tesla-is-being-investigated-by-the-nhtsa-for-autopilot-software-fix', 'image_url': 'https://static.seekingalpha.com/cdn/s3/uploads/getty_images/1415090444/image_1415090444.jpg?io=getty-c-w750', 'language': 'en', 'published_at': '2024-04-26T10:50:20.000000Z', 'source': 'seekingalpha.com', 'relevance_score': None, 'entities': [{'symbol': 'TSLA', 'name': 'Tesla, Inc.', 'exchange': None, 'exchange_long': None, 'country': 'us', 'type': 'equity', 'industry': 'Consumer Cyclical', 'match_score': 51.845444, 'sentiment_score': 0.42985, 'highlights': [{'highlight': \"The National Highway Traffic Safety Administration confirmed on Friday that the safety regulator has opened an investigation into whether Tesla's (<em>NASDAQ:TSLA</em>) recall of more than 2 million vehicles announced in December to install new Autopilot safeguards is 
adequate.\", 'sentiment': 0.836, 'highlighted_in': 'main_text'}, {'highlight': \"While <em>Tesla</em> has released software updates to address potential issues, NHTSA cited Tesla's statement that a portion of the remedy both requires the owner to opt in and allows a driver to readily reverse it.\", 'sentiment': 0, 'highlighted_in': 'main_text'}, {'highlight': 'In December, <em>Tesla</em> (TSLA) said its largest-ever recall was to better ensure drivers pay attention when using its advanced driver assistance system.\\n\\nShares of <em>Tesla</em> (TSLA) rose 1.17% in premarket trading on Friday to $172.17. The EV stock is down 31.51% on a year-to-date basis. Short interest stands at 3.84% of the total float.', 'sentiment': 0.8834, 'highlighted_in': 'main_text'}, {'highlight': '<em>Tesla</em> is being investigated by the NHTSA for Autopilot software fix (<em>NASDAQ:TSLA</em>)', 'sentiment': 0, 'highlighted_in': 'title'}]}], 'similar': [{'uuid': 'b269d18a-6ea0-4554-a20e-047c623513f9', 'title': 'US probes Tesla recall of 2 million vehicles over Autopilot, citing concerns By Reuters', 'description': 'US probes Tesla recall of 2 million vehicles over Autopilot, citing concerns', 'keywords': '', 'snippet': \"WASHINGTON (Reuters) - U.S. 
auto safety regulators said Friday they have opened an investigation into whether Tesla (NASDAQ: )'s recall of more than 2 million v...\", 'url': 'https://www.investing.com/news/stock-market-news/us-probes-tesla-recall-of-2-million-vehicles-over-autopilot-citing-concerns-3400236', 'image_url': 'https://i-invdn-com.investing.com/news/moved_LYNXMPEJ580NE_L.jpg', 'language': 'en', 'published_at': '2024-04-26T09:51:10.000000Z', 'source': 'investing.com', 'relevance_score': None, 'entities': [{'symbol': 'TSLA', 'name': 'Tesla, Inc.', 'exchange': None, 'exchange_long': None, 'country': 'us', 'type': 'equity', 'industry': 'Consumer Cyclical', 'match_score': 25.2132, 'sentiment_score': 0.432933, 'highlights': [{'highlight': \"WASHINGTON (Reuters) - U.S. auto safety regulators said Friday they have opened an investigation into whether <em>Tesla</em> (NASDAQ: )'s recall of more than 2 million vehicles announced in December to install new Autopilot safeguards is adequate.\", 'sentiment': 0.7269, 'highlighted_in': 'main_text'}, {'highlight': 'The agency said <em>Tesla</em> has issued software updates to address issues that appear related to its concerns but has not made them \"a part of the recall or otherwise determined to remedy a defect that poses an unreasonable safety risk.\"', 'sentiment': 0.5719, 'highlighted_in': 'main_text'}, {'highlight': 'US probes <em>Tesla</em> recall of 2 million vehicles over Autopilot, citing concerns By Reuters', 'sentiment': 0, 'highlighted_in': 'title'}]}]}]}]}\n"
59
- ]
60
- }
61
- ],
62
- "source": [
63
- "response = requests.get(endpoint)\n",
64
- "data = response.json()\n",
65
- "print(data) # See what the data looks like\n"
66
- ]
67
- },
68
- {
69
- "cell_type": "code",
70
- "execution_count": 39,
71
- "metadata": {},
72
- "outputs": [
73
- {
74
- "name": "stdout",
75
- "output_type": "stream",
76
- "text": [
77
- "Grabbing News data between 2022-01-01-2022-01-16\n"
78
- ]
79
- },
80
- {
81
- "ename": "ValueError",
82
- "evalue": "All arrays must be of the same length",
83
- "output_type": "error",
84
- "traceback": [
85
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
86
- "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
87
- "\u001b[1;32m/Users/manos/Documents/BDS/MLops_mod/feature_preprocessing.ipynb Cell 4\u001b[0m line \u001b[0;36m4\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/manos/Documents/BDS/MLops_mod/feature_preprocessing.ipynb#X14sZmlsZQ%3D%3D?line=1'>2</a>\u001b[0m \u001b[39mfor\u001b[39;00m year \u001b[39min\u001b[39;00m \u001b[39mrange\u001b[39m(\u001b[39m2022\u001b[39m,\u001b[39m2023\u001b[39m):\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/manos/Documents/BDS/MLops_mod/feature_preprocessing.ipynb#X14sZmlsZQ%3D%3D?line=2'>3</a>\u001b[0m \u001b[39mfor\u001b[39;00m month \u001b[39min\u001b[39;00m \u001b[39mrange\u001b[39m(\u001b[39m1\u001b[39m,\u001b[39m13\u001b[39m):\n\u001b[0;32m----> <a href='vscode-notebook-cell:/Users/manos/Documents/BDS/MLops_mod/feature_preprocessing.ipynb#X14sZmlsZQ%3D%3D?line=3'>4</a>\u001b[0m getNews_historical(api_key,endpoint,ticker,year,month)\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/manos/Documents/BDS/MLops_mod/feature_preprocessing.ipynb#X14sZmlsZQ%3D%3D?line=4'>5</a>\u001b[0m \u001b[39mif\u001b[39;00m year \u001b[39m==\u001b[39m \u001b[39m2023\u001b[39m \u001b[39mand\u001b[39;00m month \u001b[39m==\u001b[39m \u001b[39m1\u001b[39m:\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/manos/Documents/BDS/MLops_mod/feature_preprocessing.ipynb#X14sZmlsZQ%3D%3D?line=5'>6</a>\u001b[0m \u001b[39mbreak\u001b[39;00m\n",
88
- "\u001b[1;32m/Users/manos/Documents/BDS/MLops_mod/feature_preprocessing.ipynb Cell 4\u001b[0m line \u001b[0;36m9\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/manos/Documents/BDS/MLops_mod/feature_preprocessing.ipynb#X14sZmlsZQ%3D%3D?line=5'>6</a>\u001b[0m to_date \u001b[39m=\u001b[39m \u001b[39m'\u001b[39m\u001b[39m{}\u001b[39;00m\u001b[39m-\u001b[39m\u001b[39m{:02d}\u001b[39;00m\u001b[39m-\u001b[39m\u001b[39m{:02d}\u001b[39;00m\u001b[39m'\u001b[39m\u001b[39m.\u001b[39mformat(year,month,end)\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/manos/Documents/BDS/MLops_mod/feature_preprocessing.ipynb#X14sZmlsZQ%3D%3D?line=7'>8</a>\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m'\u001b[39m\u001b[39mGrabbing News data between \u001b[39m\u001b[39m{}\u001b[39;00m\u001b[39m-\u001b[39m\u001b[39m{}\u001b[39;00m\u001b[39m'\u001b[39m\u001b[39m.\u001b[39mformat(from_date,to_date)) \n\u001b[0;32m----> <a href='vscode-notebook-cell:/Users/manos/Documents/BDS/MLops_mod/feature_preprocessing.ipynb#X14sZmlsZQ%3D%3D?line=8'>9</a>\u001b[0m news \u001b[39m=\u001b[39m getNews(api_key,endpoint,ticker,from_date,to_date)\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/manos/Documents/BDS/MLops_mod/feature_preprocessing.ipynb#X14sZmlsZQ%3D%3D?line=10'>11</a>\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m'\u001b[39m\u001b[39mNumber of articles: \u001b[39m\u001b[39m'\u001b[39m,\u001b[39mlen\u001b[39m(news\u001b[39m.\u001b[39mindex))\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/manos/Documents/BDS/MLops_mod/feature_preprocessing.ipynb#X14sZmlsZQ%3D%3D?line=11'>12</a>\u001b[0m news\u001b[39m.\u001b[39mhead(n\u001b[39m=\u001b[39mnum)\n",
89
- "File \u001b[0;32m~/Documents/BDS/MLops_mod/feature_engineering.py:27\u001b[0m, in \u001b[0;36mgetNews\u001b[0;34m(api_key, endpoint, ticker, from_date, to_date, num)\u001b[0m\n\u001b[1;32m 21\u001b[0m response \u001b[39m=\u001b[39m requests\u001b[39m.\u001b[39mget(endpoint, params\u001b[39m=\u001b[39mparams)\n\u001b[1;32m 23\u001b[0m \u001b[39m# Print the response from the API\u001b[39;00m\n\u001b[1;32m 24\u001b[0m \u001b[39m#print(response.json())\u001b[39;00m\n\u001b[1;32m 25\u001b[0m \n\u001b[1;32m 26\u001b[0m \u001b[39m#Return a Pandas dataframe from the response\u001b[39;00m\n\u001b[0;32m---> 27\u001b[0m \u001b[39mreturn\u001b[39;00m pd\u001b[39m.\u001b[39mDataFrame(response\u001b[39m.\u001b[39mjson())\n",
90
- "File \u001b[0;32m/Applications/anaconda3/lib/python3.11/site-packages/pandas/core/frame.py:662\u001b[0m, in \u001b[0;36mDataFrame.__init__\u001b[0;34m(self, data, index, columns, dtype, copy)\u001b[0m\n\u001b[1;32m 656\u001b[0m mgr \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_init_mgr(\n\u001b[1;32m 657\u001b[0m data, axes\u001b[39m=\u001b[39m{\u001b[39m\"\u001b[39m\u001b[39mindex\u001b[39m\u001b[39m\"\u001b[39m: index, \u001b[39m\"\u001b[39m\u001b[39mcolumns\u001b[39m\u001b[39m\"\u001b[39m: columns}, dtype\u001b[39m=\u001b[39mdtype, copy\u001b[39m=\u001b[39mcopy\n\u001b[1;32m 658\u001b[0m )\n\u001b[1;32m 660\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39misinstance\u001b[39m(data, \u001b[39mdict\u001b[39m):\n\u001b[1;32m 661\u001b[0m \u001b[39m# GH#38939 de facto copy defaults to False only in non-dict cases\u001b[39;00m\n\u001b[0;32m--> 662\u001b[0m mgr \u001b[39m=\u001b[39m dict_to_mgr(data, index, columns, dtype\u001b[39m=\u001b[39mdtype, copy\u001b[39m=\u001b[39mcopy, typ\u001b[39m=\u001b[39mmanager)\n\u001b[1;32m 663\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39misinstance\u001b[39m(data, ma\u001b[39m.\u001b[39mMaskedArray):\n\u001b[1;32m 664\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mnumpy\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mma\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mmrecords\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39mmrecords\u001b[39;00m\n",
91
- "File \u001b[0;32m/Applications/anaconda3/lib/python3.11/site-packages/pandas/core/internals/construction.py:493\u001b[0m, in \u001b[0;36mdict_to_mgr\u001b[0;34m(data, index, columns, dtype, typ, copy)\u001b[0m\n\u001b[1;32m 489\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 490\u001b[0m \u001b[39m# dtype check to exclude e.g. range objects, scalars\u001b[39;00m\n\u001b[1;32m 491\u001b[0m arrays \u001b[39m=\u001b[39m [x\u001b[39m.\u001b[39mcopy() \u001b[39mif\u001b[39;00m \u001b[39mhasattr\u001b[39m(x, \u001b[39m\"\u001b[39m\u001b[39mdtype\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39melse\u001b[39;00m x \u001b[39mfor\u001b[39;00m x \u001b[39min\u001b[39;00m arrays]\n\u001b[0;32m--> 493\u001b[0m \u001b[39mreturn\u001b[39;00m arrays_to_mgr(arrays, columns, index, dtype\u001b[39m=\u001b[39mdtype, typ\u001b[39m=\u001b[39mtyp, consolidate\u001b[39m=\u001b[39mcopy)\n",
92
- "File \u001b[0;32m/Applications/anaconda3/lib/python3.11/site-packages/pandas/core/internals/construction.py:118\u001b[0m, in \u001b[0;36marrays_to_mgr\u001b[0;34m(arrays, columns, index, dtype, verify_integrity, typ, consolidate)\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[39mif\u001b[39;00m verify_integrity:\n\u001b[1;32m 116\u001b[0m \u001b[39m# figure out the index, if necessary\u001b[39;00m\n\u001b[1;32m 117\u001b[0m \u001b[39mif\u001b[39;00m index \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m--> 118\u001b[0m index \u001b[39m=\u001b[39m _extract_index(arrays)\n\u001b[1;32m 119\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 120\u001b[0m index \u001b[39m=\u001b[39m ensure_index(index)\n",
93
- "File \u001b[0;32m/Applications/anaconda3/lib/python3.11/site-packages/pandas/core/internals/construction.py:666\u001b[0m, in \u001b[0;36m_extract_index\u001b[0;34m(data)\u001b[0m\n\u001b[1;32m 664\u001b[0m lengths \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(\u001b[39mset\u001b[39m(raw_lengths))\n\u001b[1;32m 665\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(lengths) \u001b[39m>\u001b[39m \u001b[39m1\u001b[39m:\n\u001b[0;32m--> 666\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\u001b[39m\"\u001b[39m\u001b[39mAll arrays must be of the same length\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 668\u001b[0m \u001b[39mif\u001b[39;00m have_dicts:\n\u001b[1;32m 669\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m 670\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mMixing dicts with non-Series may lead to ambiguous ordering.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 671\u001b[0m )\n",
94
- "\u001b[0;31mValueError\u001b[0m: All arrays must be of the same length"
95
- ]
96
- }
97
- ],
98
- "source": [
99
- "# Grab old data\n",
100
- "for year in range(2022,2023):\n",
101
- " for month in range(1,13):\n",
102
- " getNews_historical(api_key,endpoint,ticker,year,month)\n",
103
- " if year == 2023 and month == 1:\n",
104
- " break"
105
- ]
106
- }
107
- ],
108
- "metadata": {
109
- "language_info": {
110
- "name": "python"
111
- },
112
- "orig_nbformat": 4
113
- },
114
- "nbformat": 4,
115
- "nbformat_minor": 2
116
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
feature_view.ipynb CHANGED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 10,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "import modal\n",
11
+ "import hopsworks\n",
12
+ "import pandas as pd\n",
13
+ "import numpy as np\n",
14
+ "from calendar import monthrange\n",
15
+ "from feature_engineering import *\n",
16
+ "import glob\n",
17
+ "import pandas as pd\n",
18
+ "from dotenv import load_dotenv\n",
19
+ "import os"
20
+ ]
21
+ }
22
+ ],
23
+ "metadata": {
24
+ "kernelspec": {
25
+ "display_name": "base",
26
+ "language": "python",
27
+ "name": "python3"
28
+ },
29
+ "language_info": {
30
+ "codemirror_mode": {
31
+ "name": "ipython",
32
+ "version": 3
33
+ },
34
+ "file_extension": ".py",
35
+ "mimetype": "text/x-python",
36
+ "name": "python",
37
+ "nbconvert_exporter": "python",
38
+ "pygments_lexer": "ipython3",
39
+ "version": "3.11.4"
40
+ },
41
+ "orig_nbformat": 4
42
+ },
43
+ "nbformat": 4,
44
+ "nbformat_minor": 2
45
+ }
historical_news.ipynb ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": []
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 2,
13
+ "metadata": {},
14
+ "outputs": [],
15
+ "source": [
16
+ "from dotenv import load_dotenv\n",
17
+ "from datetime import datetime, timedelta\n",
18
+ "import requests\n",
19
+ "import os\n",
20
+ "import time\n",
21
+ "import pandas as pd \n",
22
+ "from news_preprocessing import *"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": 5,
28
+ "metadata": {},
29
+ "outputs": [
30
+ {
31
+ "name": "stdout",
32
+ "output_type": "stream",
33
+ "text": [
34
+ "Fetched 50 articles from 2022-04-01 to 2022-05-21\n",
35
+ "Fetched 50 articles from 2022-05-22 to 2022-07-11\n",
36
+ "Fetched 50 articles from 2022-07-12 to 2022-08-31\n",
37
+ "Fetched 50 articles from 2022-09-01 to 2022-10-21\n",
38
+ "Fetched 50 articles from 2022-10-22 to 2022-12-11\n",
39
+ "Rate limit reached. Waiting to retry...\n",
40
+ "Fetched 50 articles from 2022-12-12 to 2023-01-31\n",
41
+ "Fetched 50 articles from 2023-02-01 to 2023-03-23\n",
42
+ "Fetched 50 articles from 2023-03-24 to 2023-05-13\n",
43
+ "Fetched 50 articles from 2023-05-14 to 2023-07-03\n",
44
+ "Fetched 50 articles from 2023-07-04 to 2023-08-23\n",
45
+ "Rate limit reached. Waiting to retry...\n",
46
+ "Fetched 50 articles from 2023-08-24 to 2023-10-13\n",
47
+ "Fetched 50 articles from 2023-10-14 to 2023-12-03\n",
48
+ "Fetched 50 articles from 2023-12-04 to 2024-01-23\n",
49
+ "Fetched 50 articles from 2024-01-24 to 2024-03-14\n",
50
+ "Fetched 50 articles from 2024-03-15 to 2024-04-01\n",
51
+ "Total articles fetched: 750\n"
52
+ ]
53
+ }
54
+ ],
55
+ "source": [
56
+ "def fetch_news(api_key, ticker, start_date, end_date):\n",
57
+ " base_url = os.environ.get(\"endpointnewsp\")\n",
58
+ " headers = {\"Authorization\": f\"Bearer {api_key}\"}\n",
59
+ " all_news = []\n",
60
+ " \n",
61
+ " current_date = start_date\n",
62
+ "\n",
63
+ " while current_date <= end_date:\n",
64
+ " batch_end_date = current_date + timedelta(days=50)\n",
65
+ " if batch_end_date > end_date:\n",
66
+ " batch_end_date = end_date\n",
67
+ "\n",
68
+ " params = {\n",
69
+ " \"ticker\": ticker,\n",
70
+ " \"published_utc.gte\": current_date.strftime('%Y-%m-%d'),\n",
71
+ " \"published_utc.lte\": batch_end_date.strftime('%Y-%m-%d'),\n",
72
+ " \"limit\": 50,\n",
73
+ " \"sort\": \"published_utc\"\n",
74
+ " }\n",
75
+ "\n",
76
+ " try:\n",
77
+ " response = requests.get(base_url, headers=headers, params=params)\n",
78
+ " if response.status_code == 200:\n",
79
+ " data = response.json()\n",
80
+ " articles = data.get('results', [])\n",
81
+ " all_news.extend(articles)\n",
82
+ " print(f\"Fetched {len(articles)} articles from {current_date.strftime('%Y-%m-%d')} to {batch_end_date.strftime('%Y-%m-%d')}\")\n",
83
+ " current_date = batch_end_date + timedelta(days=1)\n",
84
+ " elif response.status_code == 429:\n",
85
+ " print(\"Rate limit reached. Waiting to retry...\")\n",
86
+ " time.sleep(60) # Wait for 60 seconds or as recommended by the API\n",
87
+ " continue # Retry the current request\n",
88
+ " else:\n",
89
+ " print(f\"Failed to fetch data: {response.status_code}, {response.text}\")\n",
90
+ " break\n",
91
+ " except Exception as e:\n",
92
+ " print(f\"An error occurred: {e}\")\n",
93
+ " break\n",
94
+ "\n",
95
+ " return all_news\n",
96
+ "\n",
97
+ "# Example usage\n",
98
+ "api_key = os.environ.get('newsp_api')\n",
99
+ "ticker = 'TSLA'\n",
100
+ "start_date = datetime(2022, 4, 1) # start date\n",
101
+ "end_date = datetime(2024, 4, 1)\n",
102
+ "news_articles = fetch_news(api_key, ticker, start_date, end_date)\n",
103
+ "print(f\"Total articles fetched: {len(news_articles)}\")\n"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": 6,
109
+ "metadata": {},
110
+ "outputs": [],
111
+ "source": [
112
+ "# Process the news articles\n",
113
+ "df = process_news_articles(news_articles)"
114
+ ]
115
+ },
116
+ {
117
+ "cell_type": "code",
118
+ "execution_count": 8,
119
+ "metadata": {},
120
+ "outputs": [],
121
+ "source": [
122
+ "df.to_csv('news_articles.csv', index=False)\n"
123
+ ]
124
+ }
125
+ ],
126
+ "metadata": {
127
+ "kernelspec": {
128
+ "display_name": "base",
129
+ "language": "python",
130
+ "name": "python3"
131
+ },
132
+ "language_info": {
133
+ "codemirror_mode": {
134
+ "name": "ipython",
135
+ "version": 3
136
+ },
137
+ "file_extension": ".py",
138
+ "mimetype": "text/x-python",
139
+ "name": "python",
140
+ "nbconvert_exporter": "python",
141
+ "pygments_lexer": "ipython3",
142
+ "version": "3.11.4"
143
+ },
144
+ "orig_nbformat": 4
145
+ },
146
+ "nbformat": 4,
147
+ "nbformat_minor": 2
148
+ }
historical_stock.ipynb ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from dotenv import load_dotenv\n",
10
+ "import os \n",
11
+ "from alpha_vantage.timeseries import TimeSeries\n",
12
+ "import pandas as pd\n",
13
+ "import hopsworks\n",
14
+ "import re \n",
15
+ "import modal \n"
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "code",
20
+ "execution_count": 2,
21
+ "metadata": {},
22
+ "outputs": [
23
+ {
24
+ "name": "stdout",
25
+ "output_type": "stream",
26
+ "text": [
27
+ " 1. open 2. high 3. low 4. close 5. volume\n",
28
+ "date \n",
29
+ "2024-04-30 186.98 190.95 182.8401 183.28 127031787.0\n",
30
+ "2024-04-29 188.42 198.87 184.5400 194.05 243869678.0\n",
31
+ "2024-04-26 168.85 172.12 166.3700 168.29 109815725.0\n",
32
+ "2024-04-25 158.96 170.88 158.3600 170.18 126427521.0\n",
33
+ "2024-04-24 162.84 167.97 157.5100 162.13 181178020.0\n"
34
+ ]
35
+ }
36
+ ],
37
+ "source": [
38
+ "load_dotenv()\n",
39
+ "\n",
40
+ "api_key = os.environ.get('stocks_api') # Replace this with your actual API key\n",
41
+ "ts = TimeSeries(key=api_key, output_format='pandas')\n",
42
+ "\n",
43
+ "# Fetch daily adjusted stock prices; adjust the symbol as needed\n",
44
+ "data, meta_data = ts.get_daily(symbol='TSLA', outputsize='full')\n",
45
+ "\n",
46
+ "print(data.head())"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "execution_count": null,
52
+ "metadata": {},
53
+ "outputs": [],
54
+ "source": [
55
+ "# Define your file path and name\n",
56
+ "file_path = '/Users/manos/Documents/BDS/MLops_mod/TSLA_stock_price.csv' # Customize the path and filename\n",
57
+ "\n",
58
+ "# Save the DataFrame to CSV\n",
59
+ "data.to_csv(file_path)\n",
60
+ "\n",
61
+ "print(f\"Data saved to {file_path}\")"
62
+ ]
63
+ }
64
+ ],
65
+ "metadata": {
66
+ "kernelspec": {
67
+ "display_name": "base",
68
+ "language": "python",
69
+ "name": "python3"
70
+ },
71
+ "language_info": {
72
+ "codemirror_mode": {
73
+ "name": "ipython",
74
+ "version": 3
75
+ },
76
+ "file_extension": ".py",
77
+ "mimetype": "text/x-python",
78
+ "name": "python",
79
+ "nbconvert_exporter": "python",
80
+ "pygments_lexer": "ipython3",
81
+ "version": "3.11.4"
82
+ },
83
+ "orig_nbformat": 4
84
+ },
85
+ "nbformat": 4,
86
+ "nbformat_minor": 2
87
+ }
news_articles.csv CHANGED
The diff for this file is too large to render. See raw diff
 
news_exp.ipynb DELETED
The diff for this file is too large to render. See raw diff
 
news_preprocessing.ipynb ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 4,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from dotenv import load_dotenv\n",
10
+ "from datetime import datetime, timedelta\n",
11
+ "import requests\n",
12
+ "import os\n",
13
+ "import time\n",
14
+ "import pandas as pd \n",
15
+ "from textblob import TextBlob"
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "code",
20
+ "execution_count": 5,
21
+ "metadata": {},
22
+ "outputs": [],
23
+ "source": [
24
+ "def process_news_articles(news_articles):\n",
25
+ " # Convert list of dictionaries to DataFrame\n",
26
+ " df = pd.DataFrame(news_articles)\n",
27
+ "\n",
28
+ " # Drop rows where the description is NaN\n",
29
+ " df = df.dropna(subset=['description'])\n",
30
+ "\n",
31
+ " # Fill missing 'amp_url' and 'keywords' with specific placeholders\n",
32
+ " df['amp_url'] = df['amp_url'].fillna('No URL provided')\n",
33
+ " df['keywords'] = df['keywords'].fillna('No keywords')\n",
34
+ "\n",
35
+ " # Sentiment analysis on descriptions\n",
36
+ " df['sentiment'] = df['description'].apply(lambda text: TextBlob(text).sentiment.polarity)\n",
37
+ "\n",
38
+ " # Convert 'published_utc' to datetime and extract date and time\n",
39
+ " df['published_utc'] = pd.to_datetime(df['published_utc'])\n",
40
+ " df['date'] = df['published_utc'].dt.date\n",
41
+ " df['time'] = df['published_utc'].dt.time\n",
42
+ "\n",
43
+ " # Drop unnecessary columns\n",
44
+ " df.drop(['published_utc'], axis=1, inplace=True)\n",
45
+ " # set date to index\n",
46
+ " df = df.set_index(\"date\")\n",
47
+ " df.index = pd.to_datetime(df.index)\n",
48
+ "\n",
49
+ " return df\n",
50
+ "\n"
51
+ ]
52
+ }
53
+ ],
54
+ "metadata": {
55
+ "kernelspec": {
56
+ "display_name": "base",
57
+ "language": "python",
58
+ "name": "python3"
59
+ },
60
+ "language_info": {
61
+ "codemirror_mode": {
62
+ "name": "ipython",
63
+ "version": 3
64
+ },
65
+ "file_extension": ".py",
66
+ "mimetype": "text/x-python",
67
+ "name": "python",
68
+ "nbconvert_exporter": "python",
69
+ "pygments_lexer": "ipython3",
70
+ "version": "3.11.4"
71
+ },
72
+ "orig_nbformat": 4
73
+ },
74
+ "nbformat": 4,
75
+ "nbformat_minor": 2
76
+ }
news_preprocessing.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # %%
2
+ from dotenv import load_dotenv
3
+ from datetime import datetime, timedelta
4
+ import requests
5
+ import os
6
+ import time
7
+ import pandas as pd
8
+ from textblob import TextBlob
9
+
10
+ # %%
11
def process_news_articles(news_articles):
    """Normalize raw news-API article dicts into a date-indexed DataFrame.

    Parameters
    ----------
    news_articles : list[dict]
        Article records as returned by the news API. Each record may
        contain 'description', 'amp_url', 'keywords' and 'published_utc'
        keys; records are not guaranteed to share the same keys.

    Returns
    -------
    pandas.DataFrame
        One row per article that has a non-null description, with a
        'sentiment' column (TextBlob polarity in [-1, 1]), a 'time'
        column, and a DatetimeIndex built from the publication date.
        An empty input (or input with no descriptions at all) yields
        an empty DataFrame instead of raising KeyError.
    """
    df = pd.DataFrame(news_articles)

    # Guard: an empty list (or a batch where no article carried a
    # 'description' key) produces a frame without that column, and the
    # original dropna(subset=...) would raise KeyError.
    if df.empty or 'description' not in df.columns:
        return pd.DataFrame()

    # Drop rows where the description is NaN — sentiment needs text.
    df = df.dropna(subset=['description'])

    # Fill missing optional fields with placeholders. The API omits
    # keys no article carried, so the column itself may be absent.
    for col, placeholder in (('amp_url', 'No URL provided'),
                             ('keywords', 'No keywords')):
        if col in df.columns:
            df[col] = df[col].fillna(placeholder)
        else:
            df[col] = placeholder

    # Sentiment analysis on descriptions (polarity in [-1, 1]).
    df['sentiment'] = df['description'].apply(
        lambda text: TextBlob(text).sentiment.polarity)

    # Convert 'published_utc' to datetime and split into date and time.
    df['published_utc'] = pd.to_datetime(df['published_utc'])
    df['date'] = df['published_utc'].dt.date
    df['time'] = df['published_utc'].dt.time

    # The raw timestamp is redundant once date/time are extracted.
    df.drop(['published_utc'], axis=1, inplace=True)

    # Use the article date as a DatetimeIndex for time-based joins
    # against the stock-price feature group.
    df = df.set_index("date")
    df.index = pd.to_datetime(df.index)

    return df
37
+
38
+
39
+
40
+
requirements.txt CHANGED
@@ -12,3 +12,7 @@ pandas==1.5.1
12
  #Pillow==10.2.0
13
  scikit-learn==1.4.0
14
  seaborn==0.13.2
 
 
 
 
 
12
  #Pillow==10.2.0
13
  scikit-learn==1.4.0
14
  seaborn==0.13.2
15
+ python-dotenv
16
+ requests
17
+ alpha_vantage
18
+ textblob