fklitt committed on
Commit
efa7e3e
•
1 Parent(s): 67d3433

Updated_11

Files changed (27)
  1. LSTM_model.keras +0 -0
  2. historical_news.ipynb → Stocks news prediction/Notebooks/1_historical_news.ipynb +25 -47
  3. Stocks news prediction/Notebooks/2_historical_stock.ipynb +127 -0
  4. news_preprocessing.ipynb → Stocks news prediction/Notebooks/3_news_preprocessing.ipynb +5 -1
  5. stock_preprocessing.ipynb → Stocks news prediction/Notebooks/4_stock_preprocessing.ipynb +31 -348
  6. Stocks news prediction/Notebooks/5_feature_pipeline.ipynb +493 -0
  7. feature_view.ipynb → Stocks news prediction/Notebooks/6_feature_view.ipynb +38 -126
  8. Stocks news prediction/Notebooks/7_training_pipeline.ipynb +839 -0
  9. Stocks news prediction/Notebooks/8_inference_pipeline.ipynb +315 -0
  10. Stocks news prediction/SML/__pycache__/feature_pipeline.cpython-311.pyc +0 -0
  11. Stocks news prediction/SML/__pycache__/news_preprocessing.cpython-311.pyc +0 -0
  12. feature_pipeline.py → Stocks news prediction/SML/feature_pipeline.py +20 -61
  13. feature_view.py → Stocks news prediction/SML/feature_view.py +11 -37
  14. Stocks news prediction/SML/historical_news.py +120 -0
  15. Stocks news prediction/SML/historical_stock.py +51 -0
  16. news_preprocessing.py → Stocks news prediction/SML/news_preprocessing.py +7 -3
  17. stock_preprocessing.py → Stocks news prediction/SML/stock_preprocessing.py +17 -21
  18. Stocks news prediction/SML/training_pipeline.py +256 -0
  19. TSLA_stock_price.csv → Stocks news prediction/TSLA_stock_price.csv +0 -0
  20. news_articles.csv → Stocks news prediction/news_articles.csv +0 -0
  21. news_articles_ema.csv → Stocks news prediction/news_articles_ema.csv +0 -0
  22. feature_engineering.ipynb +0 -73
  23. feature_pipeline.ipynb +0 -775
  24. feature_view_freddie.py +0 -95
  25. historical_stock.ipynb +0 -257
  26. requirements.txt +1 -0
  27. training_pipeline.ipynb +0 -167
LSTM_model.keras DELETED
Binary file (291 kB)
 
historical_news.ipynb → Stocks news prediction/Notebooks/1_historical_news.ipynb RENAMED
@@ -1,20 +1,5 @@
1
  {
2
  "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 1,
6
- "metadata": {},
7
- "outputs": [],
8
- "source": [
9
- "from dotenv import load_dotenv\n",
10
- "from datetime import datetime, timedelta\n",
11
- "import requests\n",
12
- "import os\n",
13
- "import time\n",
14
- "import pandas as pd \n",
15
- "from news_preprocessing import *"
16
- ]
17
- },
18
  {
19
  "cell_type": "code",
20
  "execution_count": 2,
@@ -32,6 +17,14 @@
32
  }
33
  ],
34
  "source": [
35
  "load_dotenv()"
36
  ]
37
  },
@@ -44,32 +37,23 @@
44
  "name": "stdout",
45
  "output_type": "stream",
46
  "text": [
47
- "Fetched 50 articles from 2022-05-06 to 2022-06-25\n",
48
- "Fetched 50 articles from 2022-06-26 to 2022-08-15\n",
49
- "Fetched 50 articles from 2022-08-16 to 2022-10-05\n",
50
- "Fetched 50 articles from 2022-10-06 to 2022-11-25\n",
51
- "Fetched 50 articles from 2022-11-26 to 2023-01-15\n",
52
- "Rate limit reached. Waiting to retry...\n",
53
- "Fetched 50 articles from 2023-01-16 to 2023-03-07\n",
54
- "Fetched 50 articles from 2023-03-08 to 2023-04-27\n",
55
- "Fetched 50 articles from 2023-04-28 to 2023-06-17\n",
56
- "Fetched 50 articles from 2023-06-18 to 2023-08-07\n",
57
- "Fetched 50 articles from 2023-08-08 to 2023-09-27\n",
58
  "Rate limit reached. Waiting to retry...\n",
59
- "Fetched 50 articles from 2023-09-28 to 2023-11-17\n",
60
- "Fetched 50 articles from 2023-11-18 to 2024-01-07\n",
61
- "Fetched 50 articles from 2024-01-08 to 2024-02-27\n",
62
- "Fetched 50 articles from 2024-02-28 to 2024-04-18\n",
63
- "Fetched 50 articles from 2024-04-19 to 2024-05-05\n",
64
- "Total articles fetched: 750\n"
65
  ]
66
  }
67
  ],
68
  "source": [
69
- "import os\n",
70
- "import requests\n",
71
- "from datetime import datetime, timedelta\n",
72
- "import pandas as pd\n",
73
  "\n",
74
  "def fetch_news(api_key, ticker, start_date, end_date):\n",
75
  " base_url = os.environ.get(\"endpointnewsp\")\n",
@@ -97,10 +81,10 @@
97
  " data = response.json()\n",
98
  " articles = data.get('results', [])\n",
99
  " \n",
100
- " # Create DataFrame from articles\n",
101
  " df = pd.DataFrame(articles)\n",
102
  " \n",
103
- " # Add primary_key column if ticker is found\n",
104
  " df['ticker'] = df['tickers'].apply(lambda x: ticker if ticker in x else None)\n",
105
  " \n",
106
  " all_news.append(df) # Append DataFrame to the list\n",
@@ -119,7 +103,7 @@
119
  "\n",
120
  " return pd.concat(all_news, ignore_index=True)\n",
121
  "\n",
122
- "# Example usage\n",
123
  "api_key = os.environ.get('newsp_api')\n",
124
  "ticker = 'TSLA'\n",
125
  "end_date = datetime.now() - timedelta(days=1) # Yesterday's date\n",
@@ -263,7 +247,8 @@
263
  "metadata": {},
264
  "outputs": [],
265
  "source": [
266
- "df.to_csv('news_articles.csv', index=False)\n"
 
267
  ]
268
  },
269
  {
@@ -638,13 +623,6 @@
638
  "source": [
639
  "df_processed.head()"
640
  ]
641
- },
642
- {
643
- "cell_type": "code",
644
- "execution_count": null,
645
- "metadata": {},
646
- "outputs": [],
647
- "source": []
648
  }
649
  ],
650
  "metadata": {
 
1
  {
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
  "execution_count": 2,
 
17
  }
18
  ],
19
  "source": [
20
+ "#Importing necessary libraries\n",
21
+ "from dotenv import load_dotenv\n",
22
+ "from datetime import datetime, timedelta\n",
23
+ "import requests\n",
24
+ "import os\n",
25
+ "import time\n",
26
+ "import pandas as pd \n",
27
+ "from SML import news_preprocessing #Importing everything from 'news_preprocessing'\n",
28
  "load_dotenv()"
29
  ]
30
  },
 
37
  "name": "stdout",
38
  "output_type": "stream",
39
  "text": [
40
+ "Fetched 50 articles from 2022-05-07 to 2022-06-26\n",
41
+ "Fetched 50 articles from 2022-06-27 to 2022-08-16\n",
42
+ "Fetched 50 articles from 2022-08-17 to 2022-10-06\n",
43
+ "Fetched 50 articles from 2022-10-07 to 2022-11-26\n",
44
+ "Fetched 50 articles from 2022-11-27 to 2023-01-16\n",
45
  "Rate limit reached. Waiting to retry...\n",
46
+ "Fetched 50 articles from 2023-01-17 to 2023-03-08\n",
47
+ "Fetched 50 articles from 2023-03-09 to 2023-04-28\n",
48
+ "Fetched 50 articles from 2023-04-29 to 2023-06-18\n",
49
+ "Fetched 50 articles from 2023-06-19 to 2023-08-08\n",
50
+ "Fetched 50 articles from 2023-08-09 to 2023-09-28\n",
51
+ "Rate limit reached. Waiting to retry...\n"
52
  ]
53
  }
54
  ],
55
  "source": [
56
+ "#Defining a function for fetching news\n",
57
  "\n",
58
  "def fetch_news(api_key, ticker, start_date, end_date):\n",
59
  " base_url = os.environ.get(\"endpointnewsp\")\n",
 
81
  " data = response.json()\n",
82
  " articles = data.get('results', [])\n",
83
  " \n",
84
+ " # Creating a DataFrame from articles\n",
85
  " df = pd.DataFrame(articles)\n",
86
  " \n",
87
+ " # Adding primary_key column if ticker is found\n",
88
  " df['ticker'] = df['tickers'].apply(lambda x: ticker if ticker in x else None)\n",
89
  " \n",
90
  " all_news.append(df) # Append DataFrame to the list\n",
 
103
  "\n",
104
  " return pd.concat(all_news, ignore_index=True)\n",
105
  "\n",
106
+ "#Usage\n",
107
  "api_key = os.environ.get('newsp_api')\n",
108
  "ticker = 'TSLA'\n",
109
  "end_date = datetime.now() - timedelta(days=1) # Yesterday's date\n",
 
247
  "metadata": {},
248
  "outputs": [],
249
  "source": [
250
+ "#Putting the news articles into a csv\n",
251
+ "df.to_csv('news_articles.csv', index=False)"
252
  ]
253
  },
254
  {
 
623
  "source": [
624
  "df_processed.head()"
625
  ]
626
  }
627
  ],
628
  "metadata": {
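
The hunks above only show fragments of `fetch_news`; the cell output ("Fetched 50 articles from … to …", "Rate limit reached. Waiting to retry...") implies a windowed fetch loop that retries on rate limiting. A minimal sketch of such a loop, assuming a Polygon-style news endpoint behind the `endpointnewsp` and `newsp_api` environment variables; the 50-day window and 60-second back-off are illustrative guesses, not the commit's exact values:

```python
import os
import time
from datetime import datetime, timedelta

import pandas as pd
import requests

def fetch_news(api_key, ticker, start_date, end_date):
    # Assumption: endpointnewsp holds a Polygon-style news URL,
    # e.g. https://api.polygon.io/v2/reference/news
    base_url = os.environ.get("endpointnewsp")
    all_news = []
    window = timedelta(days=50)  # illustrative chunk size
    current = start_date

    while current <= end_date:
        chunk_end = min(current + window, end_date)
        params = {
            "ticker": ticker,
            "published_utc.gte": current.strftime("%Y-%m-%d"),
            "published_utc.lte": chunk_end.strftime("%Y-%m-%d"),
            "limit": 50,
            "apiKey": api_key,
        }
        response = requests.get(base_url, params=params)
        if response.status_code == 429:  # free-tier rate limit
            print("Rate limit reached. Waiting to retry...")
            time.sleep(60)  # illustrative back-off
            continue  # retry the same window
        response.raise_for_status()

        articles = response.json().get("results", [])
        df = pd.DataFrame(articles)
        # Keep the ticker only when the API confirms it for the article
        df["ticker"] = df["tickers"].apply(lambda x: ticker if ticker in x else None)
        all_news.append(df)
        print(f"Fetched {len(articles)} articles from "
              f"{params['published_utc.gte']} to {params['published_utc.lte']}")
        current = chunk_end + timedelta(days=1)

    return pd.concat(all_news, ignore_index=True)

# Usage mirroring the notebook: roughly the last two years up to yesterday
api_key = os.environ.get("newsp_api")
end_date = datetime.now() - timedelta(days=1)
start_date = end_date - timedelta(days=730)
df = fetch_news(api_key, "TSLA", start_date, end_date)
```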
Stocks news prediction/Notebooks/2_historical_stock.ipynb ADDED
@@ -0,0 +1,127 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 3,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "data": {
10
+ "text/plain": [
11
+ "True"
12
+ ]
13
+ },
14
+ "execution_count": 3,
15
+ "metadata": {},
16
+ "output_type": "execute_result"
17
+ }
18
+ ],
19
+ "source": [
20
+ "#Importing necessary libraries\n",
21
+ "from dotenv import load_dotenv\n",
22
+ "import os \n",
23
+ "from alpha_vantage.timeseries import TimeSeries\n",
24
+ "import pandas as pd\n",
25
+ "import hopsworks\n",
26
+ "import re \n",
27
+ "import modal \n",
28
+ "#preprocessing\n",
29
+ "import requests\n",
30
+ "import pandas as pd\n",
31
+ "import json\n",
32
+ "#import pandas_market_calendars as mcal\n",
33
+ "import datetime\n",
34
+ "import numpy as np\n",
35
+ "from datetime import timedelta\n",
36
+ "load_dotenv() #Making the .env file work"
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "execution_count": null,
42
+ "metadata": {},
43
+ "outputs": [
44
+ {
45
+ "name": "stdout",
46
+ "output_type": "stream",
47
+ "text": [
48
+ " 1. open 2. high 3. low 4. close 5. volume ticker\n",
49
+ "date \n",
50
+ "2024-05-03 182.10 184.78 178.4200 181.19 75491539.0 TSLA\n",
51
+ "2024-05-02 182.86 184.60 176.0200 180.01 89148041.0 TSLA\n",
52
+ "2024-05-01 182.00 185.86 179.0100 179.99 92829719.0 TSLA\n",
53
+ "2024-04-30 186.98 190.95 182.8401 183.28 127031787.0 TSLA\n",
54
+ "2024-04-29 188.42 198.87 184.5400 194.05 243869678.0 TSLA\n"
55
+ ]
56
+ }
57
+ ],
58
+ "source": [
59
+ "#Setting up API key to being able to fetch stocks from Alpha Vantage\n",
60
+ "\n",
61
+ "api_key = os.environ.get('stocks_api') \n",
62
+ "ts = TimeSeries(key=api_key, output_format='pandas')\n",
63
+ "\n",
64
+ "#Defining a function to fetch stocks\n",
65
+ "\n",
66
+ "def fetch_stock_prices(symbol):\n",
67
+ " # Fetch daily (unadjusted) stock prices; adjust the symbol as needed\n",
68
+ " data, meta_data = ts.get_daily(symbol=symbol, outputsize='full')\n",
69
+ " \n",
70
+ " # Add a new column named 'ticker' and fill it with the ticker name\n",
71
+ " data['ticker'] = symbol\n",
72
+ " \n",
73
+ " return data\n",
74
+ "\n",
75
+ "#Usage\n",
76
+ "symbol = 'TSLA'\n",
77
+ "stock_data = fetch_stock_prices(symbol)\n",
78
+ "print(stock_data.head())"
79
+ ]
80
+ },
81
+ {
82
+ "cell_type": "code",
83
+ "execution_count": null,
84
+ "metadata": {},
85
+ "outputs": [
86
+ {
87
+ "name": "stdout",
88
+ "output_type": "stream",
89
+ "text": [
90
+ "Data saved to TSLA_stock_price.csv\n"
91
+ ]
92
+ }
93
+ ],
94
+ "source": [
95
+ "# Defining the file path and name\n",
96
+ "file_path = 'TSLA_stock_price.csv' \n",
97
+ "\n",
98
+ "# Saving the DataFrame to CSV\n",
99
+ "stock_data.to_csv(file_path)\n",
100
+ "\n",
101
+ "print(f\"Data saved to {file_path}\")"
102
+ ]
103
+ }
104
+ ],
105
+ "metadata": {
106
+ "kernelspec": {
107
+ "display_name": "base",
108
+ "language": "python",
109
+ "name": "python3"
110
+ },
111
+ "language_info": {
112
+ "codemirror_mode": {
113
+ "name": "ipython",
114
+ "version": 3
115
+ },
116
+ "file_extension": ".py",
117
+ "mimetype": "text/x-python",
118
+ "name": "python",
119
+ "nbconvert_exporter": "python",
120
+ "pygments_lexer": "ipython3",
121
+ "version": "3.11.9"
122
+ },
123
+ "orig_nbformat": 4
124
+ },
125
+ "nbformat": 4,
126
+ "nbformat_minor": 2
127
+ }
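
Alpha Vantage labels its columns "1. open", "2. high", and so on, which is why the later notebooks run a `clean_column_name` regex over them. A small self-contained sketch of that normalization, mirroring the `re.sub(r'[^a-zA-Z]', '', name)` helper that appears in 4_stock_preprocessing and 5_feature_pipeline (the row values are synthetic):

```python
import re

import pandas as pd

def clean_column_name(name: str) -> str:
    # Strip everything that is not a letter: "1. open" -> "open"
    return re.sub(r"[^a-zA-Z]", "", name)

# Toy frame with Alpha Vantage's raw column labels
raw = pd.DataFrame(
    {"1. open": [182.10], "2. high": [184.78], "3. low": [178.42],
     "4. close": [181.19], "5. volume": [75491539.0], "ticker": ["TSLA"]}
)
raw.columns = [clean_column_name(col) for col in raw.columns]
print(raw.columns.tolist())  # ['open', 'high', 'low', 'close', 'volume', 'ticker']
```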
news_preprocessing.ipynb → Stocks news prediction/Notebooks/3_news_preprocessing.ipynb RENAMED
@@ -6,6 +6,7 @@
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
 
9
  "from dotenv import load_dotenv\n",
10
  "from datetime import datetime, timedelta\n",
11
  "import requests\n",
@@ -21,6 +22,7 @@
21
  "metadata": {},
22
  "outputs": [],
23
  "source": [
 
24
  "def process_news_articles(news_articles):\n",
25
  " # Convert list of dictionaries to DataFrame\n",
26
  " df = pd.DataFrame(news_articles)\n",
@@ -40,7 +42,7 @@
40
  " df['date'] = df['published_utc'].dt.date\n",
41
  " df['time'] = df['published_utc'].dt.time\n",
42
  "\n",
43
- " # Drop unnecessary columns\n",
44
  " df.drop(['published_utc'], axis=1, inplace=True)\n",
45
  " # set date to index\n",
46
  " df = df.set_index(\"date\")\n",
@@ -57,6 +59,8 @@
57
  "metadata": {},
58
  "outputs": [],
59
  "source": [
 
 
60
  "def exponential_moving_average(df, window):\n",
61
  " # Calculate EMA on the 'sentiment' column\n",
62
  " df[f'exp_mean_{window}_days'] = df['sentiment'].ewm(span=window, adjust=False).mean()\n",
 
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
9
+ "#Importing necessary libraries\n",
10
  "from dotenv import load_dotenv\n",
11
  "from datetime import datetime, timedelta\n",
12
  "import requests\n",
 
22
  "metadata": {},
23
  "outputs": [],
24
  "source": [
25
+ "#Defining a function to process news articles\n",
26
  "def process_news_articles(news_articles):\n",
27
  " # Convert list of dictionaries to DataFrame\n",
28
  " df = pd.DataFrame(news_articles)\n",
 
42
  " df['date'] = df['published_utc'].dt.date\n",
43
  " df['time'] = df['published_utc'].dt.time\n",
44
  "\n",
45
+ " # Dropping unnecessary columns\n",
46
  " df.drop(['published_utc'], axis=1, inplace=True)\n",
47
  " # set date to index\n",
48
  " df = df.set_index(\"date\")\n",
 
59
  "metadata": {},
60
  "outputs": [],
61
  "source": [
62
+ "#Defining a function for the exponential moving average\n",
63
+ "\n",
64
  "def exponential_moving_average(df, window):\n",
65
  " # Calculate EMA on the 'sentiment' column\n",
66
  " df[f'exp_mean_{window}_days'] = df['sentiment'].ewm(span=window, adjust=False).mean()\n",
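
The `exponential_moving_average` helper is a thin wrapper over pandas' `ewm`. A self-contained sketch on made-up sentiment scores, showing the column it adds:

```python
import pandas as pd

def exponential_moving_average(df: pd.DataFrame, window: int) -> pd.DataFrame:
    # EMA over the per-day sentiment score, matching the notebook's call
    df[f"exp_mean_{window}_days"] = (
        df["sentiment"].ewm(span=window, adjust=False).mean()
    )
    return df

# Synthetic daily sentiment purely for illustration
df = pd.DataFrame(
    {"sentiment": [0.2, -0.1, 0.4, 0.0, 0.3]},
    index=pd.date_range("2024-04-29", periods=5, name="date"),
)
print(exponential_moving_average(df, window=7))
```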
stock_preprocessing.ipynb → Stocks news prediction/Notebooks/4_stock_preprocessing.ipynb RENAMED
@@ -2,10 +2,22 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 17,
6
  "metadata": {},
7
- "outputs": [],
8
  "source": [
 
9
  "from dotenv import load_dotenv\n",
10
  "import os \n",
11
  "from alpha_vantage.timeseries import TimeSeries\n",
@@ -20,7 +32,8 @@
20
  "import pandas_market_calendars as mcal\n",
21
  "import datetime\n",
22
  "import numpy as np\n",
23
- "from datetime import datetime, timedelta\n"
 
24
  ]
25
  },
26
  {
@@ -43,8 +56,7 @@
43
  }
44
  ],
45
  "source": [
46
- "load_dotenv()\n",
47
- "\n",
48
  "api_key = os.environ.get('stocks_api') # Replace this with your actual API key\n",
49
  "ts = TimeSeries(key=api_key, output_format='pandas')\n",
50
  "\n",
@@ -54,168 +66,6 @@
54
  "print(data.head())"
55
  ]
56
  },
57
- {
58
- "cell_type": "code",
59
- "execution_count": 3,
60
- "metadata": {},
61
- "outputs": [
62
- {
63
- "data": {
64
- "text/html": [
65
- "<div>\n",
66
- "<style scoped>\n",
67
- " .dataframe tbody tr th:only-of-type {\n",
68
- " vertical-align: middle;\n",
69
- " }\n",
70
- "\n",
71
- " .dataframe tbody tr th {\n",
72
- " vertical-align: top;\n",
73
- " }\n",
74
- "\n",
75
- " .dataframe thead th {\n",
76
- " text-align: right;\n",
77
- " }\n",
78
- "</style>\n",
79
- "<table border=\"1\" class=\"dataframe\">\n",
80
- " <thead>\n",
81
- " <tr style=\"text-align: right;\">\n",
82
- " <th></th>\n",
83
- " <th>1. open</th>\n",
84
- " <th>2. high</th>\n",
85
- " <th>3. low</th>\n",
86
- " <th>4. close</th>\n",
87
- " <th>5. volume</th>\n",
88
- " </tr>\n",
89
- " <tr>\n",
90
- " <th>date</th>\n",
91
- " <th></th>\n",
92
- " <th></th>\n",
93
- " <th></th>\n",
94
- " <th></th>\n",
95
- " <th></th>\n",
96
- " </tr>\n",
97
- " </thead>\n",
98
- " <tbody>\n",
99
- " <tr>\n",
100
- " <th>2024-05-02</th>\n",
101
- " <td>182.86</td>\n",
102
- " <td>184.6000</td>\n",
103
- " <td>176.0200</td>\n",
104
- " <td>180.01</td>\n",
105
- " <td>89148041.0</td>\n",
106
- " </tr>\n",
107
- " <tr>\n",
108
- " <th>2024-05-01</th>\n",
109
- " <td>182.00</td>\n",
110
- " <td>185.8600</td>\n",
111
- " <td>179.0100</td>\n",
112
- " <td>179.99</td>\n",
113
- " <td>92829719.0</td>\n",
114
- " </tr>\n",
115
- " <tr>\n",
116
- " <th>2024-04-30</th>\n",
117
- " <td>186.98</td>\n",
118
- " <td>190.9500</td>\n",
119
- " <td>182.8401</td>\n",
120
- " <td>183.28</td>\n",
121
- " <td>127031787.0</td>\n",
122
- " </tr>\n",
123
- " <tr>\n",
124
- " <th>2024-04-29</th>\n",
125
- " <td>188.42</td>\n",
126
- " <td>198.8700</td>\n",
127
- " <td>184.5400</td>\n",
128
- " <td>194.05</td>\n",
129
- " <td>243869678.0</td>\n",
130
- " </tr>\n",
131
- " <tr>\n",
132
- " <th>2024-04-26</th>\n",
133
- " <td>168.85</td>\n",
134
- " <td>172.1200</td>\n",
135
- " <td>166.3700</td>\n",
136
- " <td>168.29</td>\n",
137
- " <td>109815725.0</td>\n",
138
- " </tr>\n",
139
- " <tr>\n",
140
- " <th>...</th>\n",
141
- " <td>...</td>\n",
142
- " <td>...</td>\n",
143
- " <td>...</td>\n",
144
- " <td>...</td>\n",
145
- " <td>...</td>\n",
146
- " </tr>\n",
147
- " <tr>\n",
148
- " <th>2010-07-06</th>\n",
149
- " <td>20.00</td>\n",
150
- " <td>20.0000</td>\n",
151
- " <td>15.8300</td>\n",
152
- " <td>16.11</td>\n",
153
- " <td>6866900.0</td>\n",
154
- " </tr>\n",
155
- " <tr>\n",
156
- " <th>2010-07-02</th>\n",
157
- " <td>23.00</td>\n",
158
- " <td>23.1000</td>\n",
159
- " <td>18.7100</td>\n",
160
- " <td>19.20</td>\n",
161
- " <td>5139800.0</td>\n",
162
- " </tr>\n",
163
- " <tr>\n",
164
- " <th>2010-07-01</th>\n",
165
- " <td>25.00</td>\n",
166
- " <td>25.9200</td>\n",
167
- " <td>20.2700</td>\n",
168
- " <td>21.96</td>\n",
169
- " <td>8218800.0</td>\n",
170
- " </tr>\n",
171
- " <tr>\n",
172
- " <th>2010-06-30</th>\n",
173
- " <td>25.79</td>\n",
174
- " <td>30.4192</td>\n",
175
- " <td>23.3000</td>\n",
176
- " <td>23.83</td>\n",
177
- " <td>17187100.0</td>\n",
178
- " </tr>\n",
179
- " <tr>\n",
180
- " <th>2010-06-29</th>\n",
181
- " <td>19.00</td>\n",
182
- " <td>25.0000</td>\n",
183
- " <td>17.5400</td>\n",
184
- " <td>23.89</td>\n",
185
- " <td>18766300.0</td>\n",
186
- " </tr>\n",
187
- " </tbody>\n",
188
- "</table>\n",
189
- "<p>3485 rows × 5 columns</p>\n",
190
- "</div>"
191
- ],
192
- "text/plain": [
193
- " 1. open 2. high 3. low 4. close 5. volume\n",
194
- "date \n",
195
- "2024-05-02 182.86 184.6000 176.0200 180.01 89148041.0\n",
196
- "2024-05-01 182.00 185.8600 179.0100 179.99 92829719.0\n",
197
- "2024-04-30 186.98 190.9500 182.8401 183.28 127031787.0\n",
198
- "2024-04-29 188.42 198.8700 184.5400 194.05 243869678.0\n",
199
- "2024-04-26 168.85 172.1200 166.3700 168.29 109815725.0\n",
200
- "... ... ... ... ... ...\n",
201
- "2010-07-06 20.00 20.0000 15.8300 16.11 6866900.0\n",
202
- "2010-07-02 23.00 23.1000 18.7100 19.20 5139800.0\n",
203
- "2010-07-01 25.00 25.9200 20.2700 21.96 8218800.0\n",
204
- "2010-06-30 25.79 30.4192 23.3000 23.83 17187100.0\n",
205
- "2010-06-29 19.00 25.0000 17.5400 23.89 18766300.0\n",
206
- "\n",
207
- "[3485 rows x 5 columns]"
208
- ]
209
- },
210
- "execution_count": 3,
211
- "metadata": {},
212
- "output_type": "execute_result"
213
- }
214
- ],
215
- "source": [
216
- "data"
217
- ]
218
- },
219
  {
220
  "cell_type": "code",
221
  "execution_count": 4,
@@ -241,6 +91,7 @@
241
  }
242
  ],
243
  "source": [
 
244
  "data.info()"
245
  ]
246
  },
@@ -265,6 +116,7 @@
265
  }
266
  ],
267
  "source": [
 
268
  "meta_data"
269
  ]
270
  },
@@ -293,6 +145,7 @@
293
  "metadata": {},
294
  "outputs": [],
295
  "source": [
 
296
  "def next_business_day(today):\n",
297
  " \n",
298
  " # Real tomorrow\n",
@@ -320,6 +173,7 @@
320
  "metadata": {},
321
  "outputs": [],
322
  "source": [
 
323
  "def extract_business_day(start_date,end_date):\n",
324
  " \"\"\"\n",
325
  " Given a start_date and end_date.\n",
@@ -331,27 +185,27 @@
331
  " e.g is_open = [1,0,...,1] means that start_date = open, day after start_date = closed, and end_date = open\n",
332
  " \"\"\"\n",
333
  " \n",
334
- " # Save for later\n",
335
  " end_date_save = end_date\n",
336
  " \n",
337
- " # Get the NYSE calendar\n",
338
  " cal = mcal.get_calendar('NYSE')\n",
339
  "\n",
340
- " # Get the NYSE calendar's open and close times for the specified period\n",
341
  " schedule = cal.schedule(start_date=start_date, end_date=end_date)\n",
342
  " \n",
343
  " # Only need a list of dates when it's open (not open and close times)\n",
344
  " isBusinessDay = np.array(schedule.market_open.dt.strftime('%Y-%m-%d')) \n",
345
  " \n",
346
- " # Go over all days: \n",
347
  " delta = datetime.timedelta(days=1)\n",
348
  " start_date = datetime.datetime.strptime(start_date,\"%Y-%m-%d\") #datetime.date(2015, 7, 16)\n",
349
  " end_date = datetime.datetime.strptime(end_date,\"%Y-%m-%d\") #datetime.date(2023, 1, 4)\n",
350
  " \n",
351
- " # Extract days from the timedelta object\n",
352
  " num_days = (end_date - start_date).days + 1\n",
353
  " \n",
354
- " # Create boolean array for days being open (1) and closed (0) \n",
355
  " is_open = np.zeros(num_days)\n",
356
  " \n",
357
  " # iterate over range of dates\n",
@@ -386,6 +240,7 @@
386
  "metadata": {},
387
  "outputs": [],
388
  "source": [
 
389
  "def clean_column_name(name):\n",
390
  " # Remove all non-letter characters\n",
391
  " cleaned_name = re.sub(r'[^a-zA-Z]', '', name)\n",
@@ -617,178 +472,13 @@
617
  "data.head()"
618
  ]
619
  },
620
- {
621
- "cell_type": "code",
622
- "execution_count": 13,
623
- "metadata": {},
624
- "outputs": [
625
- {
626
- "data": {
627
- "text/html": [
628
- "<div>\n",
629
- "<style scoped>\n",
630
- " .dataframe tbody tr th:only-of-type {\n",
631
- " vertical-align: middle;\n",
632
- " }\n",
633
- "\n",
634
- " .dataframe tbody tr th {\n",
635
- " vertical-align: top;\n",
636
- " }\n",
637
- "\n",
638
- " .dataframe thead th {\n",
639
- " text-align: right;\n",
640
- " }\n",
641
- "</style>\n",
642
- "<table border=\"1\" class=\"dataframe\">\n",
643
- " <thead>\n",
644
- " <tr style=\"text-align: right;\">\n",
645
- " <th></th>\n",
646
- " <th>date</th>\n",
647
- " <th>open</th>\n",
648
- " <th>high</th>\n",
649
- " <th>low</th>\n",
650
- " <th>close</th>\n",
651
- " <th>volume</th>\n",
652
- " </tr>\n",
653
- " </thead>\n",
654
- " <tbody>\n",
655
- " <tr>\n",
656
- " <th>0</th>\n",
657
- " <td>2024-05-02</td>\n",
658
- " <td>182.86</td>\n",
659
- " <td>184.6000</td>\n",
660
- " <td>176.0200</td>\n",
661
- " <td>180.01</td>\n",
662
- " <td>89148041.0</td>\n",
663
- " </tr>\n",
664
- " <tr>\n",
665
- " <th>1</th>\n",
666
- " <td>2024-05-01</td>\n",
667
- " <td>182.00</td>\n",
668
- " <td>185.8600</td>\n",
669
- " <td>179.0100</td>\n",
670
- " <td>179.99</td>\n",
671
- " <td>92829719.0</td>\n",
672
- " </tr>\n",
673
- " <tr>\n",
674
- " <th>2</th>\n",
675
- " <td>2024-04-30</td>\n",
676
- " <td>186.98</td>\n",
677
- " <td>190.9500</td>\n",
678
- " <td>182.8401</td>\n",
679
- " <td>183.28</td>\n",
680
- " <td>127031787.0</td>\n",
681
- " </tr>\n",
682
- " <tr>\n",
683
- " <th>3</th>\n",
684
- " <td>2024-04-29</td>\n",
685
- " <td>188.42</td>\n",
686
- " <td>198.8700</td>\n",
687
- " <td>184.5400</td>\n",
688
- " <td>194.05</td>\n",
689
- " <td>243869678.0</td>\n",
690
- " </tr>\n",
691
- " <tr>\n",
692
- " <th>4</th>\n",
693
- " <td>2024-04-26</td>\n",
694
- " <td>168.85</td>\n",
695
- " <td>172.1200</td>\n",
696
- " <td>166.3700</td>\n",
697
- " <td>168.29</td>\n",
698
- " <td>109815725.0</td>\n",
699
- " </tr>\n",
700
- " <tr>\n",
701
- " <th>...</th>\n",
702
- " <td>...</td>\n",
703
- " <td>...</td>\n",
704
- " <td>...</td>\n",
705
- " <td>...</td>\n",
706
- " <td>...</td>\n",
707
- " <td>...</td>\n",
708
- " </tr>\n",
709
- " <tr>\n",
710
- " <th>3480</th>\n",
711
- " <td>2010-07-06</td>\n",
712
- " <td>20.00</td>\n",
713
- " <td>20.0000</td>\n",
714
- " <td>15.8300</td>\n",
715
- " <td>16.11</td>\n",
716
- " <td>6866900.0</td>\n",
717
- " </tr>\n",
718
- " <tr>\n",
719
- " <th>3481</th>\n",
720
- " <td>2010-07-02</td>\n",
721
- " <td>23.00</td>\n",
722
- " <td>23.1000</td>\n",
723
- " <td>18.7100</td>\n",
724
- " <td>19.20</td>\n",
725
- " <td>5139800.0</td>\n",
726
- " </tr>\n",
727
- " <tr>\n",
728
- " <th>3482</th>\n",
729
- " <td>2010-07-01</td>\n",
730
- " <td>25.00</td>\n",
731
- " <td>25.9200</td>\n",
732
- " <td>20.2700</td>\n",
733
- " <td>21.96</td>\n",
734
- " <td>8218800.0</td>\n",
735
- " </tr>\n",
736
- " <tr>\n",
737
- " <th>3483</th>\n",
738
- " <td>2010-06-30</td>\n",
739
- " <td>25.79</td>\n",
740
- " <td>30.4192</td>\n",
741
- " <td>23.3000</td>\n",
742
- " <td>23.83</td>\n",
743
- " <td>17187100.0</td>\n",
744
- " </tr>\n",
745
- " <tr>\n",
746
- " <th>3484</th>\n",
747
- " <td>2010-06-29</td>\n",
748
- " <td>19.00</td>\n",
749
- " <td>25.0000</td>\n",
750
- " <td>17.5400</td>\n",
751
- " <td>23.89</td>\n",
752
- " <td>18766300.0</td>\n",
753
- " </tr>\n",
754
- " </tbody>\n",
755
- "</table>\n",
756
- "<p>3485 rows × 6 columns</p>\n",
757
- "</div>"
758
- ],
759
- "text/plain": [
760
- " date open high low close volume\n",
761
- "0 2024-05-02 182.86 184.6000 176.0200 180.01 89148041.0\n",
762
- "1 2024-05-01 182.00 185.8600 179.0100 179.99 92829719.0\n",
763
- "2 2024-04-30 186.98 190.9500 182.8401 183.28 127031787.0\n",
764
- "3 2024-04-29 188.42 198.8700 184.5400 194.05 243869678.0\n",
765
- "4 2024-04-26 168.85 172.1200 166.3700 168.29 109815725.0\n",
766
- "... ... ... ... ... ... ...\n",
767
- "3480 2010-07-06 20.00 20.0000 15.8300 16.11 6866900.0\n",
768
- "3481 2010-07-02 23.00 23.1000 18.7100 19.20 5139800.0\n",
769
- "3482 2010-07-01 25.00 25.9200 20.2700 21.96 8218800.0\n",
770
- "3483 2010-06-30 25.79 30.4192 23.3000 23.83 17187100.0\n",
771
- "3484 2010-06-29 19.00 25.0000 17.5400 23.89 18766300.0\n",
772
- "\n",
773
- "[3485 rows x 6 columns]"
774
- ]
775
- },
776
- "execution_count": 13,
777
- "metadata": {},
778
- "output_type": "execute_result"
779
- }
780
- ],
781
- "source": [
782
- "data"
783
- ]
784
- },
785
  {
786
  "cell_type": "code",
787
  "execution_count": 42,
788
  "metadata": {},
789
  "outputs": [],
790
  "source": [
791
- "# Define the date range you're interested in\n",
792
  "yesterday =datetime.now()-timedelta(days=1)\n",
793
  "two_years_back = yesterday - timedelta(days=684)"
794
  ]
@@ -799,7 +489,7 @@
799
  "metadata": {},
800
  "outputs": [],
801
  "source": [
802
- "# Filter the DataFrame to this range\n",
803
  "filtered_df = data[(data['date'] >= two_years_back) & (data['date'] <= yesterday)]"
804
  ]
805
  },
@@ -943,13 +633,6 @@
943
  "source": [
944
  "filtered_df.shape"
945
  ]
946
- },
947
- {
948
- "cell_type": "code",
949
- "execution_count": null,
950
- "metadata": {},
951
- "outputs": [],
952
- "source": []
953
  }
954
  ],
955
  "metadata": {
@@ -968,7 +651,7 @@
968
  "name": "python",
969
  "nbconvert_exporter": "python",
970
  "pygments_lexer": "ipython3",
971
- "version": "3.11.4"
972
  },
973
  "orig_nbformat": 4
974
  },
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 2,
6
  "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "data": {
10
+ "text/plain": [
11
+ "True"
12
+ ]
13
+ },
14
+ "execution_count": 2,
15
+ "metadata": {},
16
+ "output_type": "execute_result"
17
+ }
18
+ ],
19
  "source": [
20
+ "#Importing necessary libraries\n",
21
  "from dotenv import load_dotenv\n",
22
  "import os \n",
23
  "from alpha_vantage.timeseries import TimeSeries\n",
 
32
  "import pandas_market_calendars as mcal\n",
33
  "import datetime\n",
34
  "import numpy as np\n",
35
+ "from datetime import datetime, timedelta\n",
36
+ "load_dotenv()"
37
  ]
38
  },
39
  {
 
56
  }
57
  ],
58
  "source": [
59
+ "#Connecting to Alpha Vantage using API key\n",
 
60
  "api_key = os.environ.get('stocks_api') # Replace this with your actual API key\n",
61
  "ts = TimeSeries(key=api_key, output_format='pandas')\n",
62
  "\n",
 
66
  "print(data.head())"
67
  ]
68
  },
69
  {
70
  "cell_type": "code",
71
  "execution_count": 4,
 
91
  }
92
  ],
93
  "source": [
94
+ "#Looking at data info\n",
95
  "data.info()"
96
  ]
97
  },
 
116
  }
117
  ],
118
  "source": [
119
+ "#Looking at the meta data\n",
120
  "meta_data"
121
  ]
122
  },
 
145
  "metadata": {},
146
  "outputs": [],
147
  "source": [
148
+ "#Defining a function to find the next business day\n",
149
  "def next_business_day(today):\n",
150
  " \n",
151
  " # Real tomorrow\n",
 
173
  "metadata": {},
174
  "outputs": [],
175
  "source": [
176
+ "#Defining a function to extract business day\n",
177
  "def extract_business_day(start_date,end_date):\n",
178
  " \"\"\"\n",
179
  " Given a start_date and end_date.\n",
 
185
  " e.g is_open = [1,0,...,1] means that start_date = open, day after start_date = closed, and end_date = open\n",
186
  " \"\"\"\n",
187
  " \n",
188
+ " # Saving for later\n",
189
  " end_date_save = end_date\n",
190
  " \n",
191
+ " # Getting the NYSE calendar\n",
192
  " cal = mcal.get_calendar('NYSE')\n",
193
  "\n",
194
+ " # Getting the NYSE calendar's open and close times for the specified period\n",
195
  " schedule = cal.schedule(start_date=start_date, end_date=end_date)\n",
196
  " \n",
197
  " # Only need a list of dates when it's open (not open and close times)\n",
198
  " isBusinessDay = np.array(schedule.market_open.dt.strftime('%Y-%m-%d')) \n",
199
  " \n",
200
+ " # Going over all days: \n",
201
  " delta = datetime.timedelta(days=1)\n",
202
  " start_date = datetime.datetime.strptime(start_date,\"%Y-%m-%d\") #datetime.date(2015, 7, 16)\n",
203
  " end_date = datetime.datetime.strptime(end_date,\"%Y-%m-%d\") #datetime.date(2023, 1, 4)\n",
204
  " \n",
205
+ " # Extracting days from the timedelta object\n",
206
  " num_days = (end_date - start_date).days + 1\n",
207
  " \n",
208
+ " # Creating a boolean array for days being open (1) and closed (0) \n",
209
  " is_open = np.zeros(num_days)\n",
210
  " \n",
211
  " # iterate over range of dates\n",
 
240
  "metadata": {},
241
  "outputs": [],
242
  "source": [
243
+ "#Defining a function to clean the column names\n",
244
  "def clean_column_name(name):\n",
245
  " # Remove all non-letter characters\n",
246
  " cleaned_name = re.sub(r'[^a-zA-Z]', '', name)\n",
 
472
  "data.head()"
473
  ]
474
  },
475
  {
476
  "cell_type": "code",
477
  "execution_count": 42,
478
  "metadata": {},
479
  "outputs": [],
480
  "source": [
481
+ "# Define the date range we're interested in\n",
482
  "yesterday =datetime.now()-timedelta(days=1)\n",
483
  "two_years_back = yesterday - timedelta(days=684)"
484
  ]
 
489
  "metadata": {},
490
  "outputs": [],
491
  "source": [
492
+ "# Filtering the DataFrame to this range\n",
493
  "filtered_df = data[(data['date'] >= two_years_back) & (data['date'] <= yesterday)]"
494
  ]
495
  },
 
633
  "source": [
634
  "filtered_df.shape"
635
  ]
636
  }
637
  ],
638
  "metadata": {
 
651
  "name": "python",
652
  "nbconvert_exporter": "python",
653
  "pygments_lexer": "ipython3",
654
+ "version": "3.11.9"
655
  },
656
  "orig_nbformat": 4
657
  },
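
`extract_business_day` reduces to building a 0/1 market-open mask from the NYSE calendar. A condensed sketch of the same idea with `pandas_market_calendars` (imported as `mcal` in the notebook); the exact return shape is an assumption based on the docstring above:

```python
import datetime

import numpy as np
import pandas_market_calendars as mcal

def extract_business_day(start_date: str, end_date: str):
    # Trading days according to the NYSE calendar
    cal = mcal.get_calendar("NYSE")
    schedule = cal.schedule(start_date=start_date, end_date=end_date)
    open_days = set(schedule.market_open.dt.strftime("%Y-%m-%d"))

    start = datetime.datetime.strptime(start_date, "%Y-%m-%d")
    end = datetime.datetime.strptime(end_date, "%Y-%m-%d")
    num_days = (end - start).days + 1

    # 1 where the market was open, 0 where it was closed
    is_open = np.zeros(num_days)
    for i in range(num_days):
        day = (start + datetime.timedelta(days=i)).strftime("%Y-%m-%d")
        if day in open_days:
            is_open[i] = 1
    return sorted(open_days), is_open

days, mask = extract_business_day("2024-04-29", "2024-05-05")
print(mask)  # [1. 1. 1. 1. 1. 0. 0.] — Mon–Fri open, weekend closed
```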
Stocks news prediction/Notebooks/5_feature_pipeline.ipynb ADDED
@@ -0,0 +1,493 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 25,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "Connection closed.\n",
13
+ "Connected. Call `.close()` to terminate connection gracefully.\n"
14
+ ]
15
+ },
16
+ {
17
+ "name": "stdout",
18
+ "output_type": "stream",
19
+ "text": [
20
+ "\n",
21
+ "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/693399\n",
22
+ "Connected. Call `.close()` to terminate connection gracefully.\n"
23
+ ]
24
+ }
25
+ ],
26
+ "source": [
27
+ "# Import necessary libraries\n",
28
+ "import pandas as pd # For data manipulation using DataFrames\n",
29
+ "import numpy as np # For numerical operations\n",
30
+ "import matplotlib.pyplot as plt # For data visualization\n",
31
+ "import os # For operating system-related tasks\n",
32
+ "import joblib # For saving and loading models\n",
33
+ "import hopsworks # For getting access to hopsworks\n",
34
+ "import re\n",
35
+ "\n",
36
+ "# Import specific modules from scikit-learn\n",
37
+ "from sklearn.preprocessing import StandardScaler, OneHotEncoder # For data preprocessing\n",
38
+ "from sklearn.metrics import accuracy_score # For evaluating model accuracy\n",
39
+ "\n",
40
+ "from dotenv import load_dotenv\n",
41
+ "import os\n",
42
+ "load_dotenv()\n",
43
+ "\n",
44
+ "#Connecting to hopsworks\n",
45
+ "api_key = os.environ.get('hopsworks_api')\n",
46
+ "project = hopsworks.login(api_key_value=api_key)\n",
47
+ "fs = project.get_feature_store()"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": 26,
53
+ "metadata": {},
54
+ "outputs": [
55
+ {
56
+ "name": "stdout",
57
+ "output_type": "stream",
58
+ "text": [
59
+ " date 1. open 2. high 3. low 4. close 5. volume ticker\n",
60
+ "0 2024-05-03 182.10 184.78 178.4200 181.19 75491539.0 TSLA\n",
61
+ "1 2024-05-02 182.86 184.60 176.0200 180.01 89148041.0 TSLA\n",
62
+ "2 2024-05-01 182.00 185.86 179.0100 179.99 92829719.0 TSLA\n",
63
+ "3 2024-04-30 186.98 190.95 182.8401 183.28 127031787.0 TSLA\n",
64
+ "4 2024-04-29 188.42 198.87 184.5400 194.05 243869678.0 TSLA\n"
65
+ ]
66
+ }
67
+ ],
68
+ "source": [
69
+ "# Load and display the data from CSV to confirm\n",
70
+ "tsla_df = pd.read_csv('TSLA_stock_price.csv')\n",
71
+ "print(tsla_df.head()) "
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": 27,
77
+ "metadata": {},
78
+ "outputs": [],
79
+ "source": [
80
+ "#Defining a function to clean the column names\n",
81
+ "def clean_column_name(name):\n",
82
+ " # Remove all non-letter characters\n",
83
+ " cleaned_name = re.sub(r'[^a-zA-Z]', '', name)\n",
84
+ " return cleaned_name"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": 28,
90
+ "metadata": {},
91
+ "outputs": [
92
+ {
93
+ "data": {
94
+ "text/html": [
95
+ "<div>\n",
96
+ "<style scoped>\n",
97
+ " .dataframe tbody tr th:only-of-type {\n",
98
+ " vertical-align: middle;\n",
99
+ " }\n",
100
+ "\n",
101
+ " .dataframe tbody tr th {\n",
102
+ " vertical-align: top;\n",
103
+ " }\n",
104
+ "\n",
105
+ " .dataframe thead th {\n",
106
+ " text-align: right;\n",
107
+ " }\n",
108
+ "</style>\n",
109
+ "<table border=\"1\" class=\"dataframe\">\n",
110
+ " <thead>\n",
111
+ " <tr style=\"text-align: right;\">\n",
112
+ " <th></th>\n",
113
+ " <th>date</th>\n",
114
+ " <th>1. open</th>\n",
115
+ " <th>2. high</th>\n",
116
+ " <th>3. low</th>\n",
117
+ " <th>4. close</th>\n",
118
+ " <th>5. volume</th>\n",
119
+ " <th>ticker</th>\n",
120
+ " </tr>\n",
121
+ " </thead>\n",
122
+ " <tbody>\n",
123
+ " <tr>\n",
124
+ " <th>0</th>\n",
125
+ " <td>2024-05-03</td>\n",
126
+ " <td>182.10</td>\n",
127
+ " <td>184.7800</td>\n",
128
+ " <td>178.4200</td>\n",
129
+ " <td>181.19</td>\n",
130
+ " <td>75491539.0</td>\n",
131
+ " <td>TSLA</td>\n",
132
+ " </tr>\n",
133
+ " <tr>\n",
134
+ " <th>1</th>\n",
135
+ " <td>2024-05-02</td>\n",
136
+ " <td>182.86</td>\n",
137
+ " <td>184.6000</td>\n",
138
+ " <td>176.0200</td>\n",
139
+ " <td>180.01</td>\n",
140
+ " <td>89148041.0</td>\n",
141
+ " <td>TSLA</td>\n",
142
+ " </tr>\n",
143
+ " <tr>\n",
144
+ " <th>2</th>\n",
145
+ " <td>2024-05-01</td>\n",
146
+ " <td>182.00</td>\n",
147
+ " <td>185.8600</td>\n",
148
+ " <td>179.0100</td>\n",
149
+ " <td>179.99</td>\n",
150
+ " <td>92829719.0</td>\n",
151
+ " <td>TSLA</td>\n",
152
+ " </tr>\n",
153
+ " <tr>\n",
154
+ " <th>3</th>\n",
155
+ " <td>2024-04-30</td>\n",
156
+ " <td>186.98</td>\n",
157
+ " <td>190.9500</td>\n",
158
+ " <td>182.8401</td>\n",
159
+ " <td>183.28</td>\n",
160
+ " <td>127031787.0</td>\n",
161
+ " <td>TSLA</td>\n",
162
+ " </tr>\n",
163
+ " <tr>\n",
164
+ " <th>4</th>\n",
165
+ " <td>2024-04-29</td>\n",
166
+ " <td>188.42</td>\n",
167
+ " <td>198.8700</td>\n",
168
+ " <td>184.5400</td>\n",
169
+ " <td>194.05</td>\n",
170
+ " <td>243869678.0</td>\n",
171
+ " <td>TSLA</td>\n",
172
+ " </tr>\n",
173
+ " <tr>\n",
174
+ " <th>...</th>\n",
175
+ " <td>...</td>\n",
176
+ " <td>...</td>\n",
177
+ " <td>...</td>\n",
178
+ " <td>...</td>\n",
179
+ " <td>...</td>\n",
180
+ " <td>...</td>\n",
181
+ " <td>...</td>\n",
182
+ " </tr>\n",
183
+ " <tr>\n",
184
+ " <th>3481</th>\n",
185
+ " <td>2010-07-06</td>\n",
186
+ " <td>20.00</td>\n",
187
+ " <td>20.0000</td>\n",
188
+ " <td>15.8300</td>\n",
189
+ " <td>16.11</td>\n",
190
+ " <td>6866900.0</td>\n",
191
+ " <td>TSLA</td>\n",
192
+ " </tr>\n",
193
+ " <tr>\n",
194
+ " <th>3482</th>\n",
195
+ " <td>2010-07-02</td>\n",
196
+ " <td>23.00</td>\n",
197
+ " <td>23.1000</td>\n",
198
+ " <td>18.7100</td>\n",
199
+ " <td>19.20</td>\n",
200
+ " <td>5139800.0</td>\n",
201
+ " <td>TSLA</td>\n",
202
+ " </tr>\n",
203
+ " <tr>\n",
204
+ " <th>3483</th>\n",
205
+ " <td>2010-07-01</td>\n",
206
+ " <td>25.00</td>\n",
207
+ " <td>25.9200</td>\n",
208
+ " <td>20.2700</td>\n",
209
+ " <td>21.96</td>\n",
210
+ " <td>8218800.0</td>\n",
211
+ " <td>TSLA</td>\n",
212
+ " </tr>\n",
213
+ " <tr>\n",
214
+ " <th>3484</th>\n",
215
+ " <td>2010-06-30</td>\n",
216
+ " <td>25.79</td>\n",
217
+ " <td>30.4192</td>\n",
218
+ " <td>23.3000</td>\n",
219
+ " <td>23.83</td>\n",
220
+ " <td>17187100.0</td>\n",
221
+ " <td>TSLA</td>\n",
222
+ " </tr>\n",
223
+ " <tr>\n",
224
+ " <th>3485</th>\n",
225
+ " <td>2010-06-29</td>\n",
226
+ " <td>19.00</td>\n",
227
+ " <td>25.0000</td>\n",
228
+ " <td>17.5400</td>\n",
229
+ " <td>23.89</td>\n",
230
+ " <td>18766300.0</td>\n",
231
+ " <td>TSLA</td>\n",
232
+ " </tr>\n",
233
+ " </tbody>\n",
234
+ "</table>\n",
235
+ "<p>3486 rows × 7 columns</p>\n",
236
+ "</div>"
237
+ ],
238
+ "text/plain": [
239
+ " date 1. open 2. high 3. low 4. close 5. volume ticker\n",
240
+ "0 2024-05-03 182.10 184.7800 178.4200 181.19 75491539.0 TSLA\n",
241
+ "1 2024-05-02 182.86 184.6000 176.0200 180.01 89148041.0 TSLA\n",
242
+ "2 2024-05-01 182.00 185.8600 179.0100 179.99 92829719.0 TSLA\n",
243
+ "3 2024-04-30 186.98 190.9500 182.8401 183.28 127031787.0 TSLA\n",
244
+ "4 2024-04-29 188.42 198.8700 184.5400 194.05 243869678.0 TSLA\n",
245
+ "... ... ... ... ... ... ... ...\n",
246
+ "3481 2010-07-06 20.00 20.0000 15.8300 16.11 6866900.0 TSLA\n",
247
+ "3482 2010-07-02 23.00 23.1000 18.7100 19.20 5139800.0 TSLA\n",
248
+ "3483 2010-07-01 25.00 25.9200 20.2700 21.96 8218800.0 TSLA\n",
249
+ "3484 2010-06-30 25.79 30.4192 23.3000 23.83 17187100.0 TSLA\n",
250
+ "3485 2010-06-29 19.00 25.0000 17.5400 23.89 18766300.0 TSLA\n",
251
+ "\n",
252
+ "[3486 rows x 7 columns]"
253
+ ]
254
+ },
255
+ "execution_count": 28,
256
+ "metadata": {},
257
+ "output_type": "execute_result"
258
+ }
259
+ ],
260
+ "source": [
261
+ "tsla_df"
262
+ ]
263
+ },
264
+ {
265
+ "cell_type": "code",
266
+ "execution_count": 30,
267
+ "metadata": {},
268
+ "outputs": [
269
+ {
270
+ "name": "stdout",
271
+ "output_type": "stream",
272
+ "text": [
273
+ "Index(['date', 'open', 'high', 'low', 'close', 'volume', 'ticker'], dtype='object')\n"
274
+ ]
275
+ }
276
+ ],
277
+ "source": [
278
+ "# Cleaning up column names for 'tsla_df'\n",
279
+ "tsla_df.columns = [clean_column_name(col) for col in tsla_df.columns]\n",
280
+ "print(tsla_df.columns)"
281
+ ]
282
+ },
283
+ {
284
+ "cell_type": "code",
285
+ "execution_count": 31,
286
+ "metadata": {},
287
+ "outputs": [],
288
+ "source": [
289
+ "# Converting the \"date\" column to timestamp\n",
290
+ "tsla_df['date'] = pd.to_datetime(tsla_df['date'])"
291
+ ]
292
+ },
293
+ {
294
+ "cell_type": "code",
295
+ "execution_count": 32,
296
+ "metadata": {},
297
+ "outputs": [],
298
+ "source": [
299
+ "# Defining the stocks feature group\n",
300
+ "tesla_fg = fs.get_or_create_feature_group(\n",
301
+ " name=\"tesla_stock\",\n",
302
+ " description=\"Tesla stock dataset from alpha vantage\",\n",
303
+ " version=1,\n",
304
+ " primary_key=[\"ticker\"],\n",
305
+ " event_time=['date'],\n",
306
+ " online_enabled=False,\n",
307
+ ")"
308
+ ]
309
+ },
310
+ {
311
+ "cell_type": "code",
312
+ "execution_count": 18,
313
+ "metadata": {},
314
+ "outputs": [
315
+ {
316
+ "name": "stdout",
317
+ "output_type": "stream",
318
+ "text": [
319
+ "Feature Group created successfully, explore it at \n",
320
+ "https://c.app.hopsworks.ai:443/p/693399/fs/689222/fg/786781\n"
321
+ ]
322
+ },
323
+ {
324
+ "data": {
325
+ "application/vnd.jupyter.widget-view+json": {
326
+ "model_id": "b3248b9d522a467db9ce202ef5815fe9",
327
+ "version_major": 2,
328
+ "version_minor": 0
329
+ },
330
+ "text/plain": [
331
+ "Uploading Dataframe: 0.00% | | Rows 0/3486 | Elapsed Time: 00:00 | Remaining Time: ?"
332
+ ]
333
+ },
334
+ "metadata": {},
335
+ "output_type": "display_data"
336
+ },
337
+ {
338
+ "name": "stdout",
339
+ "output_type": "stream",
340
+ "text": [
341
+ "Launching job: tesla_stock_1_offline_fg_materialization\n",
342
+ "Job started successfully, you can follow the progress at \n",
343
+ "https://c.app.hopsworks.ai/p/693399/jobs/named/tesla_stock_1_offline_fg_materialization/executions\n"
344
+ ]
345
+ },
346
+ {
347
+ "data": {
348
+ "text/plain": [
349
+ "(<hsfs.core.job.Job at 0x19cffe27490>, None)"
350
+ ]
351
+ },
352
+ "execution_count": 18,
353
+ "metadata": {},
354
+ "output_type": "execute_result"
355
+ }
356
+ ],
357
+ "source": [
358
+ "#Inserting the stock data into the stocks feature group\n",
359
+ "tesla_fg.insert(tsla_df, write_options={\"wait_for_job\" : False})"
360
+ ]
361
+ },
362
+ {
363
+ "cell_type": "code",
364
+ "execution_count": 19,
365
+ "metadata": {},
366
+ "outputs": [],
367
+ "source": [
368
+ "#Collecting news df\n",
369
+ "news_df = pd.read_csv('news_articles_ema.csv')"
370
+ ]
371
+ },
372
+ {
373
+ "cell_type": "code",
374
+ "execution_count": 20,
375
+ "metadata": {},
376
+ "outputs": [],
377
+ "source": [
378
+ "#Dropping exp mean 7 days\n",
379
+ "news_df_updated = news_df.drop(columns=['exp_mean_7_days'])"
380
+ ]
381
+ },
382
+ {
383
+ "cell_type": "code",
384
+ "execution_count": 21,
385
+ "metadata": {},
386
+ "outputs": [],
387
+ "source": [
388
+ "#Updating date to datetime\n",
389
+ "news_df_updated['date'] = pd.to_datetime(news_df_updated['date'])"
390
+ ]
391
+ },
392
+ {
393
+ "cell_type": "code",
394
+ "execution_count": 22,
395
+ "metadata": {},
396
+ "outputs": [
397
+ {
398
+ "name": "stdout",
399
+ "output_type": "stream",
400
+ "text": [
401
+ "2024-05-06 13:43:12,343 WARNING: DeprecationWarning: Providing event_time as a single-element list is deprecated and will be dropped in future versions. Provide the feature_name string instead.\n",
402
+ "\n"
403
+ ]
404
+ }
405
+ ],
406
+ "source": [
407
+ "#Defining the news feature group\n",
408
+ "news_sentiment_fg = fs.get_or_create_feature_group(\n",
409
+ " name='news_sentiment_updated',\n",
410
+ " description='News sentiment from Polygon',\n",
411
+ " version=1,\n",
412
+ " primary_key=['ticker'],\n",
413
+ " event_time=['date'],\n",
414
+ " online_enabled=False,\n",
415
+ ")"
416
+ ]
417
+ },
418
+ {
419
+ "cell_type": "code",
420
+ "execution_count": 23,
421
+ "metadata": {},
422
+ "outputs": [
423
+ {
424
+ "name": "stdout",
425
+ "output_type": "stream",
426
+ "text": [
427
+ "Feature Group created successfully, explore it at \n",
428
+ "https://c.app.hopsworks.ai:443/p/693399/fs/689222/fg/787796\n"
429
+ ]
430
+ },
431
+ {
432
+ "data": {
433
+ "application/vnd.jupyter.widget-view+json": {
434
+ "model_id": "524bb5481c34441ba708a4c14edac44b",
435
+ "version_major": 2,
436
+ "version_minor": 0
437
+ },
438
+ "text/plain": [
439
+ "Uploading Dataframe: 0.00% | | Rows 0/66 | Elapsed Time: 00:00 | Remaining Time: ?"
440
+ ]
441
+ },
442
+ "metadata": {},
443
+ "output_type": "display_data"
444
+ },
445
+ {
446
+ "name": "stdout",
447
+ "output_type": "stream",
448
+ "text": [
449
+ "Launching job: news_sentiment_updated_1_offline_fg_materialization\n",
450
+ "Job started successfully, you can follow the progress at \n",
451
+ "https://c.app.hopsworks.ai/p/693399/jobs/named/news_sentiment_updated_1_offline_fg_materialization/executions\n"
452
+ ]
453
+ },
454
+ {
455
+ "data": {
456
+ "text/plain": [
457
+ "(<hsfs.core.job.Job at 0x19c811c2e90>, None)"
458
+ ]
459
+ },
460
+ "execution_count": 23,
461
+ "metadata": {},
462
+ "output_type": "execute_result"
463
+ }
464
+ ],
465
+ "source": [
466
+ "#Inserting the news data into the news feature group\n",
467
+ "news_sentiment_fg.insert(news_df_updated)"
468
+ ]
469
+ }
470
+ ],
471
+ "metadata": {
472
+ "kernelspec": {
473
+ "display_name": "base",
474
+ "language": "python",
475
+ "name": "python3"
476
+ },
477
+ "language_info": {
478
+ "codemirror_mode": {
479
+ "name": "ipython",
480
+ "version": 3
481
+ },
482
+ "file_extension": ".py",
483
+ "mimetype": "text/x-python",
484
+ "name": "python",
485
+ "nbconvert_exporter": "python",
486
+ "pygments_lexer": "ipython3",
487
+ "version": "3.11.9"
488
+ },
489
+ "orig_nbformat": 4
490
+ },
491
+ "nbformat": 4,
492
+ "nbformat_minor": 2
493
+ }
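
The Hopsworks calls in this notebook follow one pattern: log in with an API key, get the feature store, get-or-create a feature group keyed on `ticker` with `date` as event time, then insert the DataFrame. A condensed sketch of that pattern as used above (same project, names, and versions); note that passing `event_time` as a plain string avoids the deprecation warning logged in the cell output:

```python
import os

import hopsworks
import pandas as pd
from dotenv import load_dotenv

load_dotenv()

# Log in and grab the project's feature store
project = hopsworks.login(api_key_value=os.environ.get("hopsworks_api"))
fs = project.get_feature_store()

# The notebook cleans the "1. open"-style column names before this point
tsla_df = pd.read_csv("TSLA_stock_price.csv", parse_dates=["date"])

# get_or_create is idempotent: re-running reuses the existing group
tesla_fg = fs.get_or_create_feature_group(
    name="tesla_stock",
    version=1,
    description="Tesla stock dataset from alpha vantage",
    primary_key=["ticker"],
    event_time="date",  # plain string avoids the deprecation warning
    online_enabled=False,
)
tesla_fg.insert(tsla_df, write_options={"wait_for_job": False})
```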
feature_view.ipynb → Stocks news prediction/Notebooks/6_feature_view.ipynb RENAMED
@@ -2,55 +2,31 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 1,
6
- "metadata": {},
7
- "outputs": [],
8
- "source": [
9
- "# Import necessary libraries\n",
10
- "import pandas as pd # For data manipulation using DataFrames\n",
11
- "import numpy as np # For numerical operations\n",
12
- "import matplotlib.pyplot as plt # For data visualization\n",
13
- "import os # For operating system-related tasks\n",
14
- "import joblib # For saving and loading models\n",
15
- "import hopsworks # For getting access to hopsworks\n",
16
- "\n",
17
- "\n",
18
- "\n",
19
- "# Import specific modules from scikit-learn\n",
20
- "from sklearn.preprocessing import StandardScaler, OneHotEncoder # For data preprocessing\n",
21
- "from sklearn.metrics import accuracy_score # For evaluating model accuracy"
22
- ]
23
- },
24
- {
25
- "cell_type": "code",
26
- "execution_count": 2,
27
  "metadata": {},
28
  "outputs": [
29
  {
30
  "name": "stdout",
31
  "output_type": "stream",
32
  "text": [
33
  " date 1. open 2. high 3. low 4. close 5. volume ticker\n",
34
  "0 2024-05-03 182.10 184.78 178.4200 181.19 75491539.0 TSLA\n",
35
  "1 2024-05-02 182.86 184.60 176.0200 180.01 89148041.0 TSLA\n",
36
  "2 2024-05-01 182.00 185.86 179.0100 179.99 92829719.0 TSLA\n",
37
  "3 2024-04-30 186.98 190.95 182.8401 183.28 127031787.0 TSLA\n",
38
  "4 2024-04-29 188.42 198.87 184.5400 194.05 243869678.0 TSLA\n",
39
- "Connected. Call `.close()` to terminate connection gracefully.\n",
40
- "\n",
41
- "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/693399\n",
42
- "Connected. Call `.close()` to terminate connection gracefully.\n",
43
- "Index(['date', 'open', 'high', 'low', 'close', 'volume', 'ticker'], dtype='object')\n",
44
- "2024-05-06 13:44:59,122 WARNING: DeprecationWarning: Providing event_time as a single-element list is deprecated and will be dropped in future versions. Provide the feature_name string instead.\n",
45
- "\n",
46
- "Feature Group created successfully, explore it at \n",
47
- "https://c.app.hopsworks.ai:443/p/693399/fs/689222/fg/787797\n"
48
  ]
49
  },
50
  {
51
  "data": {
52
  "application/vnd.jupyter.widget-view+json": {
53
- "model_id": "1b857e05ae714fc09a2a7fcd05f56a73",
54
  "version_major": 2,
55
  "version_minor": 0
56
  },
@@ -65,19 +41,15 @@
65
  "name": "stdout",
66
  "output_type": "stream",
67
  "text": [
68
- "Launching job: tesla_stock_2_offline_fg_materialization\n",
69
  "Job started successfully, you can follow the progress at \n",
70
- "https://c.app.hopsworks.ai/p/693399/jobs/named/tesla_stock_2_offline_fg_materialization/executions\n",
71
- "2024-05-06 13:45:08,516 WARNING: DeprecationWarning: Providing event_time as a single-element list is deprecated and will be dropped in future versions. Provide the feature_name string instead.\n",
72
- "\n",
73
- "Feature Group created successfully, explore it at \n",
74
- "https://c.app.hopsworks.ai:443/p/693399/fs/689222/fg/785786\n"
75
  ]
76
  },
77
  {
78
  "data": {
79
  "application/vnd.jupyter.widget-view+json": {
80
- "model_id": "98e7ee7cb2c943b8893d0ae2a7254104",
81
  "version_major": 2,
82
  "version_minor": 0
83
  },
@@ -92,57 +64,11 @@
92
  "name": "stdout",
93
  "output_type": "stream",
94
  "text": [
95
- "Launching job: news_sentiment_updated_2_offline_fg_materialization\n",
96
  "Job started successfully, you can follow the progress at \n",
97
- "https://c.app.hopsworks.ai/p/693399/jobs/named/news_sentiment_updated_2_offline_fg_materialization/executions\n"
98
- ]
99
- }
100
- ],
101
- "source": [
102
- "from feature_pipeline import tesla_fg\n",
103
- "from feature_pipeline import news_sentiment_fg"
104
- ]
105
- },
106
- {
107
- "cell_type": "code",
108
- "execution_count": 3,
109
- "metadata": {},
110
- "outputs": [
111
- {
112
- "data": {
113
- "text/plain": [
114
- "True"
115
- ]
116
- },
117
- "execution_count": 3,
118
- "metadata": {},
119
- "output_type": "execute_result"
120
- }
121
- ],
122
- "source": [
123
- "from dotenv import load_dotenv\n",
124
- "import os\n",
125
- "\n",
126
- "load_dotenv()"
127
- ]
128
- },
129
- {
130
- "cell_type": "code",
131
- "execution_count": 4,
132
- "metadata": {},
133
- "outputs": [
134
- {
135
- "name": "stdout",
136
- "output_type": "stream",
137
- "text": [
138
  "Connection closed.\n",
139
- "Connected. Call `.close()` to terminate connection gracefully.\n"
140
- ]
141
- },
142
- {
143
- "name": "stdout",
144
- "output_type": "stream",
145
- "text": [
146
  "\n",
147
  "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/693399\n",
148
  "Connected. Call `.close()` to terminate connection gracefully.\n"
@@ -150,6 +76,23 @@
150
  }
151
  ],
152
  "source": [
153
  "api_key = os.environ.get('hopsworks_api')\n",
154
  "project = hopsworks.login(api_key_value=api_key)\n",
155
  "fs = project.get_feature_store()"
@@ -161,17 +104,19 @@
161
  "metadata": {},
162
  "outputs": [],
163
  "source": [
 
 
164
  "def create_stocks_feature_view(fs, version):\n",
165
  "\n",
166
  " # Loading in the feature groups\n",
167
  " tesla_fg = fs.get_feature_group('tesla_stock', version=1)\n",
168
  " news_sentiment_fg = fs.get_feature_group('news_sentiment_updated', version=1)\n",
169
  "\n",
170
- " # Define the query\n",
171
  " ds_query = tesla_fg.select(['date', 'open', 'ticker'])\\\n",
172
  " .join(news_sentiment_fg.select(['sentiment']))\n",
173
  "\n",
174
- " # Create the feature view\n",
175
  " feature_view = fs.create_feature_view(\n",
176
  " name='tesla_stocks_fv',\n",
177
  " query=ds_query,\n",
@@ -196,6 +141,7 @@
196
  }
197
  ],
198
  "source": [
 
199
  "try:\n",
200
  " feature_view = fs.get_feature_view(\"tesla_stocks_fv\", version=1)\n",
201
  " tesla_fg = fs.get_feature_group('tesla_stock', version=1)\n",
@@ -209,6 +155,7 @@
209
  "metadata": {},
210
  "outputs": [],
211
  "source": [
 
212
  "def fix_data_from_feature_view(df,start_date,end_date):\n",
213
  " df = df.sort_values(\"date\")\n",
214
  " df = df.reset_index()\n",
@@ -230,41 +177,6 @@
230
  " \n",
231
  " return filtered_df"
232
  ]
233
- },
234
- {
235
- "cell_type": "code",
236
- "execution_count": 7,
237
- "metadata": {},
238
- "outputs": [],
239
- "source": [
240
- "#def create_stocks_feature_view(fs, version):\n",
241
- "\n",
242
- " #Loading in the feature groups\n",
243
- "# tesla_fg = fs.get_feature_group('tesla_stock', version = 3)\n",
244
- "# news_sentiment_fg = fs.get_feature_group('news_sentiment_updated', version = 2)\n",
245
- "\n",
246
- "# ds_query = tesla_fg.select(['date','open', 'ticker'])\\\n",
247
- "# .join(news_sentiment_fg.select_except(['ticker','time', 'amp_url', 'image_url']))\n",
248
- " \n",
249
- "# return (fs.create_tesla_feature_view(\n",
250
- "# name = 'tsla_stocks_fv',\n",
251
- "# query = ds_query,\n",
252
- "# labels=['ticker']\n",
253
- "# ), tesla_fg)"
254
- ]
255
- },
256
- {
257
- "cell_type": "code",
258
- "execution_count": 8,
259
- "metadata": {},
260
- "outputs": [],
261
- "source": [
262
- "#try:\n",
263
- "# feature_view = fs.get_feature_view(\"tsla_stocks_fv\", version=1)\n",
264
- "# tesla_fg = fs.get_feature_group('tesla_stock', version=3)\n",
265
- "#except:\n",
266
- "# feature_view, tesla_fg = create_stocks_feature_view(fs, 1)"
267
- ]
268
  }
269
  ],
270
  "metadata": {
@@ -283,7 +195,7 @@
283
  "name": "python",
284
  "nbconvert_exporter": "python",
285
  "pygments_lexer": "ipython3",
286
- "version": "3.11.4"
287
  }
288
  },
289
  "nbformat": 4,
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 3,
6
  "metadata": {},
7
  "outputs": [
8
  {
9
  "name": "stdout",
10
  "output_type": "stream",
11
  "text": [
12
+ "Connection closed.\n",
13
+ "Connected. Call `.close()` to terminate connection gracefully.\n",
14
+ "\n",
15
+ "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/693399\n",
16
+ "Connected. Call `.close()` to terminate connection gracefully.\n",
17
  " date 1. open 2. high 3. low 4. close 5. volume ticker\n",
18
  "0 2024-05-03 182.10 184.78 178.4200 181.19 75491539.0 TSLA\n",
19
  "1 2024-05-02 182.86 184.60 176.0200 180.01 89148041.0 TSLA\n",
20
  "2 2024-05-01 182.00 185.86 179.0100 179.99 92829719.0 TSLA\n",
21
  "3 2024-04-30 186.98 190.95 182.8401 183.28 127031787.0 TSLA\n",
22
  "4 2024-04-29 188.42 198.87 184.5400 194.05 243869678.0 TSLA\n",
23
+ "Index(['date', 'open', 'high', 'low', 'close', 'volume', 'ticker'], dtype='object')\n"
24
  ]
25
  },
26
  {
27
  "data": {
28
  "application/vnd.jupyter.widget-view+json": {
29
+ "model_id": "db4ef90d03b0464f957c18365d8d636f",
30
  "version_major": 2,
31
  "version_minor": 0
32
  },
 
41
  "name": "stdout",
42
  "output_type": "stream",
43
  "text": [
44
+ "Launching job: tesla_stock_1_offline_fg_materialization\n",
45
  "Job started successfully, you can follow the progress at \n",
46
+ "https://c.app.hopsworks.ai/p/693399/jobs/named/tesla_stock_1_offline_fg_materialization/executions\n"
47
  ]
48
  },
49
  {
50
  "data": {
51
  "application/vnd.jupyter.widget-view+json": {
52
+ "model_id": "9043e7043c1843288091f7c3a6bbd83e",
53
  "version_major": 2,
54
  "version_minor": 0
55
  },
 
64
  "name": "stdout",
65
  "output_type": "stream",
66
  "text": [
67
+ "Launching job: news_sentiment_updated_1_offline_fg_materialization\n",
68
  "Job started successfully, you can follow the progress at \n",
69
+ "https://c.app.hopsworks.ai/p/693399/jobs/named/news_sentiment_updated_1_offline_fg_materialization/executions\n",
70
  "Connection closed.\n",
71
+ "Connected. Call `.close()` to terminate connection gracefully.\n",
72
  "\n",
73
  "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/693399\n",
74
  "Connected. Call `.close()` to terminate connection gracefully.\n"
 
76
  }
77
  ],
78
  "source": [
79
+ "# Importing necessary libraries\n",
80
+ "import pandas as pd # For data manipulation using DataFrames\n",
81
+ "import numpy as np # For numerical operations\n",
82
+ "import matplotlib.pyplot as plt # For data visualization\n",
83
+ "import os # For operating system-related tasks\n",
84
+ "import joblib # For saving and loading models\n",
85
+ "import hopsworks # For getting access to hopsworks\n",
86
+ "\n",
87
+ "from SML import feature_pipeline #Loading in the tesla_fg\n",
88
+ "\n",
89
+ "#Making the notebook able to fetch from the .env file\n",
90
+ "from dotenv import load_dotenv\n",
91
+ "import os\n",
92
+ "\n",
93
+ "load_dotenv()\n",
94
+ "\n",
95
+ "#Getting connected to hopsworks\n",
96
  "api_key = os.environ.get('hopsworks_api')\n",
97
  "project = hopsworks.login(api_key_value=api_key)\n",
98
  "fs = project.get_feature_store()"
 
104
  "metadata": {},
105
  "outputs": [],
106
  "source": [
107
+ "#Defining the function to create feature view\n",
108
+ "\n",
109
  "def create_stocks_feature_view(fs, version):\n",
110
  "\n",
111
  " # Loading in the feature groups\n",
112
  " tesla_fg = fs.get_feature_group('tesla_stock', version=1)\n",
113
  " news_sentiment_fg = fs.get_feature_group('news_sentiment_updated', version=1)\n",
114
  "\n",
115
+ " # Defining the query\n",
116
  " ds_query = tesla_fg.select(['date', 'open', 'ticker'])\\\n",
117
  " .join(news_sentiment_fg.select(['sentiment']))\n",
118
  "\n",
119
+ " # Creating the feature view\n",
120
  " feature_view = fs.create_feature_view(\n",
121
  " name='tesla_stocks_fv',\n",
122
  " query=ds_query,\n",
 
141
  }
142
  ],
143
  "source": [
144
+ "#Creating the feature view\n",
145
  "try:\n",
146
  " feature_view = fs.get_feature_view(\"tesla_stocks_fv\", version=1)\n",
147
  " tesla_fg = fs.get_feature_group('tesla_stock', version=1)\n",
 
155
  "metadata": {},
156
  "outputs": [],
157
  "source": [
158
+ "#Defining a function to get fixed data from the feature view\n",
159
  "def fix_data_from_feature_view(df,start_date,end_date):\n",
160
  " df = df.sort_values(\"date\")\n",
161
  " df = df.reset_index()\n",
 
177
  " \n",
178
  " return filtered_df"
179
  ]
180
  }
181
  ],
182
  "metadata": {
 
195
  "name": "python",
196
  "nbconvert_exporter": "python",
197
  "pygments_lexer": "ipython3",
198
+ "version": "3.11.9"
199
  }
200
  },
201
  "nbformat": 4,
Stocks news prediction/Notebooks/7_training_pipeline.ipynb ADDED
@@ -0,0 +1,839 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "Connected. Call `.close()` to terminate connection gracefully.\n",
13
+ "\n",
14
+ "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/693399\n",
15
+ "Connected. Call `.close()` to terminate connection gracefully.\n",
16
+ "Connected. Call `.close()` to terminate connection gracefully.\n"
17
+ ]
18
+ }
19
+ ],
20
+ "source": [
21
+ "#Importing necessary libraries\n",
22
+ "import hopsworks\n",
23
+ "import hsfs\n",
24
+ "from dotenv import load_dotenv\n",
25
+ "import os\n",
26
+ "import pandas as pd\n",
27
+ "import numpy as np\n",
28
+ "from sklearn.preprocessing import OneHotEncoder\n",
29
+ "from sklearn.preprocessing import MinMaxScaler\n",
30
+ "from sklearn.metrics import mean_squared_error\n",
31
+ "from hsml.schema import Schema\n",
32
+ "from hsml.model_schema import ModelSchema\n",
33
+ "from tensorflow.keras.models import Sequential\n",
34
+ "from tensorflow.keras.layers import Input, LSTM, Dense, Dropout\n",
35
+ "from sklearn.preprocessing import StandardScaler # Import StandardScaler from scikit-learn\n",
36
+ "import joblib\n",
37
+ "\n",
38
+ "load_dotenv()\n",
39
+ "\n",
40
+ "#Connecting to hopsworks\n",
41
+ "api_key = os.environ.get('hopsworks_api')\n",
42
+ "project = hopsworks.login(api_key_value=api_key)\n",
43
+ "fs = project.get_feature_store()\n",
44
+ "\n",
45
+ "#Another connection to hopsworks\n",
46
+ "api_key = os.getenv('hopsworks_api')\n",
47
+ "connection = hsfs.connection()\n",
48
+ "fs = connection.get_feature_store()"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": 3,
54
+ "metadata": {},
55
+ "outputs": [],
56
+ "source": [
57
+ "#Getting the feature view\n",
58
+ "feature_view = fs.get_feature_view(\n",
59
+ " name='tesla_stocks_fv',\n",
60
+ " version=1\n",
61
+ ")"
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "code",
66
+ "execution_count": 4,
67
+ "metadata": {},
68
+ "outputs": [],
69
+ "source": [
70
+ "#Setting up train & test split dates\n",
71
+ "train_start = \"2022-06-22\"\n",
72
+ "train_end = \"2023-12-31\"\n",
73
+ "\n",
74
+ "test_start = '2024-01-01'\n",
75
+ "test_end = \"2024-05-03\""
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": 5,
81
+ "metadata": {},
82
+ "outputs": [
83
+ {
84
+ "name": "stdout",
85
+ "output_type": "stream",
86
+ "text": [
87
+ "Training dataset job started successfully, you can follow the progress at \n",
88
+ "https://c.app.hopsworks.ai/p/693399/jobs/named/tesla_stocks_fv_1_create_fv_td_07052024082715/executions\n",
89
+ "2024-05-07 10:28:31,852 WARNING: VersionWarning: Incremented version to `6`.\n",
90
+ "\n"
91
+ ]
92
+ },
93
+ {
94
+ "data": {
95
+ "text/plain": [
96
+ "(6, <hsfs.core.job.Job at 0x1c3ac2719d0>)"
97
+ ]
98
+ },
99
+ "execution_count": 5,
100
+ "metadata": {},
101
+ "output_type": "execute_result"
102
+ }
103
+ ],
104
+ "source": [
105
+ "#Creating the train/test split on the feature view with the split dates\n",
106
+ "feature_view.create_train_test_split(\n",
107
+ " train_start=train_start,\n",
108
+ " train_end=train_end,\n",
109
+ " test_start=test_start,\n",
110
+ " test_end=test_end,\n",
111
+ " data_format='csv',\n",
112
+ " coalesce= True,\n",
113
+ " statistics_config={'histogram':True,'correlations':True})"
114
+ ]
115
+ },
116
+ {
117
+ "cell_type": "code",
118
+ "execution_count": 6,
119
+ "metadata": {},
120
+ "outputs": [],
121
+ "source": [
122
+ "#Collecting the split from feature view\n",
123
+ "X_train, X_test, y_train, y_test = feature_view.get_train_test_split(6)"
124
+ ]
125
+ },
126
+ {
127
+ "cell_type": "code",
128
+ "execution_count": 7,
129
+ "metadata": {},
130
+ "outputs": [
131
+ {
132
+ "data": {
133
+ "text/html": [
134
+ "<div>\n",
135
+ "<style scoped>\n",
136
+ " .dataframe tbody tr th:only-of-type {\n",
137
+ " vertical-align: middle;\n",
138
+ " }\n",
139
+ "\n",
140
+ " .dataframe tbody tr th {\n",
141
+ " vertical-align: top;\n",
142
+ " }\n",
143
+ "\n",
144
+ " .dataframe thead th {\n",
145
+ " text-align: right;\n",
146
+ " }\n",
147
+ "</style>\n",
148
+ "<table border=\"1\" class=\"dataframe\">\n",
149
+ " <thead>\n",
150
+ " <tr style=\"text-align: right;\">\n",
151
+ " <th></th>\n",
152
+ " <th>date</th>\n",
153
+ " <th>ticker</th>\n",
154
+ " <th>sentiment</th>\n",
155
+ " </tr>\n",
156
+ " </thead>\n",
157
+ " <tbody>\n",
158
+ " <tr>\n",
159
+ " <th>0</th>\n",
160
+ " <td>2022-12-14T00:00:00.000Z</td>\n",
161
+ " <td>TSLA</td>\n",
162
+ " <td>0.102207</td>\n",
163
+ " </tr>\n",
164
+ " <tr>\n",
165
+ " <th>1</th>\n",
166
+ " <td>2023-02-21T00:00:00.000Z</td>\n",
167
+ " <td>TSLA</td>\n",
168
+ " <td>0.155833</td>\n",
169
+ " </tr>\n",
170
+ " <tr>\n",
171
+ " <th>2</th>\n",
172
+ " <td>2023-08-17T00:00:00.000Z</td>\n",
173
+ " <td>TSLA</td>\n",
174
+ " <td>0.024046</td>\n",
175
+ " </tr>\n",
176
+ " <tr>\n",
177
+ " <th>3</th>\n",
178
+ " <td>2022-09-16T00:00:00.000Z</td>\n",
179
+ " <td>TSLA</td>\n",
180
+ " <td>0.087306</td>\n",
181
+ " </tr>\n",
182
+ " <tr>\n",
183
+ " <th>4</th>\n",
184
+ " <td>2023-08-28T00:00:00.000Z</td>\n",
185
+ " <td>TSLA</td>\n",
186
+ " <td>0.024046</td>\n",
187
+ " </tr>\n",
188
+ " <tr>\n",
189
+ " <th>...</th>\n",
190
+ " <td>...</td>\n",
191
+ " <td>...</td>\n",
192
+ " <td>...</td>\n",
193
+ " </tr>\n",
194
+ " <tr>\n",
195
+ " <th>378</th>\n",
196
+ " <td>2023-02-10T00:00:00.000Z</td>\n",
197
+ " <td>TSLA</td>\n",
198
+ " <td>0.155833</td>\n",
199
+ " </tr>\n",
200
+ " <tr>\n",
201
+ " <th>379</th>\n",
202
+ " <td>2023-05-08T00:00:00.000Z</td>\n",
203
+ " <td>TSLA</td>\n",
204
+ " <td>0.141296</td>\n",
205
+ " </tr>\n",
206
+ " <tr>\n",
207
+ " <th>380</th>\n",
208
+ " <td>2022-09-08T00:00:00.000Z</td>\n",
209
+ " <td>TSLA</td>\n",
210
+ " <td>0.087306</td>\n",
211
+ " </tr>\n",
212
+ " <tr>\n",
213
+ " <th>381</th>\n",
214
+ " <td>2023-07-06T00:00:00.000Z</td>\n",
215
+ " <td>TSLA</td>\n",
216
+ " <td>0.119444</td>\n",
217
+ " </tr>\n",
218
+ " <tr>\n",
219
+ " <th>382</th>\n",
220
+ " <td>2023-10-27T00:00:00.000Z</td>\n",
221
+ " <td>TSLA</td>\n",
222
+ " <td>0.164868</td>\n",
223
+ " </tr>\n",
224
+ " </tbody>\n",
225
+ "</table>\n",
226
+ "<p>383 rows Γ— 3 columns</p>\n",
227
+ "</div>"
228
+ ],
229
+ "text/plain": [
230
+ " date ticker sentiment\n",
231
+ "0 2022-12-14T00:00:00.000Z TSLA 0.102207\n",
232
+ "1 2023-02-21T00:00:00.000Z TSLA 0.155833\n",
233
+ "2 2023-08-17T00:00:00.000Z TSLA 0.024046\n",
234
+ "3 2022-09-16T00:00:00.000Z TSLA 0.087306\n",
235
+ "4 2023-08-28T00:00:00.000Z TSLA 0.024046\n",
236
+ ".. ... ... ...\n",
237
+ "378 2023-02-10T00:00:00.000Z TSLA 0.155833\n",
238
+ "379 2023-05-08T00:00:00.000Z TSLA 0.141296\n",
239
+ "380 2022-09-08T00:00:00.000Z TSLA 0.087306\n",
240
+ "381 2023-07-06T00:00:00.000Z TSLA 0.119444\n",
241
+ "382 2023-10-27T00:00:00.000Z TSLA 0.164868\n",
242
+ "\n",
243
+ "[383 rows x 3 columns]"
244
+ ]
245
+ },
246
+ "execution_count": 7,
247
+ "metadata": {},
248
+ "output_type": "execute_result"
249
+ }
250
+ ],
251
+ "source": [
252
+ "#Inspecting X_train\n",
253
+ "X_train"
254
+ ]
255
+ },
256
+ {
257
+ "cell_type": "code",
258
+ "execution_count": 8,
259
+ "metadata": {},
260
+ "outputs": [],
261
+ "source": [
262
+ "#Converting date into datetime\n",
263
+ "X_train['date'] = pd.to_datetime(X_train['date']).dt.date\n",
264
+ "X_test['date'] = pd.to_datetime(X_test['date']).dt.date\n",
265
+ "X_train['date'] = pd.to_datetime(X_train['date'])\n",
266
+ "X_test['date'] = pd.to_datetime(X_test['date'])"
267
+ ]
268
+ },
269
+ {
270
+ "cell_type": "code",
271
+ "execution_count": 9,
272
+ "metadata": {},
273
+ "outputs": [
274
+ {
275
+ "data": {
276
+ "text/html": [
277
+ "<div>\n",
278
+ "<style scoped>\n",
279
+ " .dataframe tbody tr th:only-of-type {\n",
280
+ " vertical-align: middle;\n",
281
+ " }\n",
282
+ "\n",
283
+ " .dataframe tbody tr th {\n",
284
+ " vertical-align: top;\n",
285
+ " }\n",
286
+ "\n",
287
+ " .dataframe thead th {\n",
288
+ " text-align: right;\n",
289
+ " }\n",
290
+ "</style>\n",
291
+ "<table border=\"1\" class=\"dataframe\">\n",
292
+ " <thead>\n",
293
+ " <tr style=\"text-align: right;\">\n",
294
+ " <th></th>\n",
295
+ " <th>date</th>\n",
296
+ " <th>ticker</th>\n",
297
+ " <th>sentiment</th>\n",
298
+ " </tr>\n",
299
+ " </thead>\n",
300
+ " <tbody>\n",
301
+ " <tr>\n",
302
+ " <th>0</th>\n",
303
+ " <td>2022-12-14</td>\n",
304
+ " <td>TSLA</td>\n",
305
+ " <td>0.102207</td>\n",
306
+ " </tr>\n",
307
+ " <tr>\n",
308
+ " <th>1</th>\n",
309
+ " <td>2023-02-21</td>\n",
310
+ " <td>TSLA</td>\n",
311
+ " <td>0.155833</td>\n",
312
+ " </tr>\n",
313
+ " <tr>\n",
314
+ " <th>2</th>\n",
315
+ " <td>2023-08-17</td>\n",
316
+ " <td>TSLA</td>\n",
317
+ " <td>0.024046</td>\n",
318
+ " </tr>\n",
319
+ " <tr>\n",
320
+ " <th>3</th>\n",
321
+ " <td>2022-09-16</td>\n",
322
+ " <td>TSLA</td>\n",
323
+ " <td>0.087306</td>\n",
324
+ " </tr>\n",
325
+ " <tr>\n",
326
+ " <th>4</th>\n",
327
+ " <td>2023-08-28</td>\n",
328
+ " <td>TSLA</td>\n",
329
+ " <td>0.024046</td>\n",
330
+ " </tr>\n",
331
+ " </tbody>\n",
332
+ "</table>\n",
333
+ "</div>"
334
+ ],
335
+ "text/plain": [
336
+ " date ticker sentiment\n",
337
+ "0 2022-12-14 TSLA 0.102207\n",
338
+ "1 2023-02-21 TSLA 0.155833\n",
339
+ "2 2023-08-17 TSLA 0.024046\n",
340
+ "3 2022-09-16 TSLA 0.087306\n",
341
+ "4 2023-08-28 TSLA 0.024046"
342
+ ]
343
+ },
344
+ "execution_count": 9,
345
+ "metadata": {},
346
+ "output_type": "execute_result"
347
+ }
348
+ ],
349
+ "source": [
350
+ "X_train.head()"
351
+ ]
352
+ },
353
+ {
354
+ "cell_type": "code",
355
+ "execution_count": 10,
356
+ "metadata": {},
357
+ "outputs": [],
358
+ "source": [
359
+ "# Extracting the 'ticker' column\n",
360
+ "tickers = X_train[['ticker']]\n",
361
+ "\n",
362
+ "# Initializing OneHotEncoder\n",
363
+ "encoder = OneHotEncoder()\n",
364
+ "\n",
365
+ "# Fitting and transforming the 'ticker' column\n",
366
+ "ticker_encoded = encoder.fit_transform(tickers)\n",
367
+ "\n",
368
+ "# Converting the encoded column into a DataFrame\n",
369
+ "ticker_encoded_df = pd.DataFrame(ticker_encoded.toarray(), columns=encoder.get_feature_names_out(['ticker']))\n",
370
+ "\n",
371
+ "# Concatenating the encoded DataFrame with the original DataFrame\n",
372
+ "X_train = pd.concat([X_train, ticker_encoded_df], axis=1)\n",
373
+ "\n",
374
+ "# Dropping the original 'ticker' column\n",
375
+ "X_train.drop('ticker', axis=1, inplace=True)"
376
+ ]
377
+ },
378
+ {
379
+ "cell_type": "code",
380
+ "execution_count": 11,
381
+ "metadata": {},
382
+ "outputs": [
383
+ {
384
+ "data": {
385
+ "text/html": [
386
+ "<div>\n",
387
+ "<style scoped>\n",
388
+ " .dataframe tbody tr th:only-of-type {\n",
389
+ " vertical-align: middle;\n",
390
+ " }\n",
391
+ "\n",
392
+ " .dataframe tbody tr th {\n",
393
+ " vertical-align: top;\n",
394
+ " }\n",
395
+ "\n",
396
+ " .dataframe thead th {\n",
397
+ " text-align: right;\n",
398
+ " }\n",
399
+ "</style>\n",
400
+ "<table border=\"1\" class=\"dataframe\">\n",
401
+ " <thead>\n",
402
+ " <tr style=\"text-align: right;\">\n",
403
+ " <th></th>\n",
404
+ " <th>date</th>\n",
405
+ " <th>sentiment</th>\n",
406
+ " <th>ticker_TSLA</th>\n",
407
+ " </tr>\n",
408
+ " </thead>\n",
409
+ " <tbody>\n",
410
+ " <tr>\n",
411
+ " <th>0</th>\n",
412
+ " <td>2022-12-14</td>\n",
413
+ " <td>0.102207</td>\n",
414
+ " <td>1.0</td>\n",
415
+ " </tr>\n",
416
+ " <tr>\n",
417
+ " <th>1</th>\n",
418
+ " <td>2023-02-21</td>\n",
419
+ " <td>0.155833</td>\n",
420
+ " <td>1.0</td>\n",
421
+ " </tr>\n",
422
+ " <tr>\n",
423
+ " <th>2</th>\n",
424
+ " <td>2023-08-17</td>\n",
425
+ " <td>0.024046</td>\n",
426
+ " <td>1.0</td>\n",
427
+ " </tr>\n",
428
+ " <tr>\n",
429
+ " <th>3</th>\n",
430
+ " <td>2022-09-16</td>\n",
431
+ " <td>0.087306</td>\n",
432
+ " <td>1.0</td>\n",
433
+ " </tr>\n",
434
+ " <tr>\n",
435
+ " <th>4</th>\n",
436
+ " <td>2023-08-28</td>\n",
437
+ " <td>0.024046</td>\n",
438
+ " <td>1.0</td>\n",
439
+ " </tr>\n",
440
+ " </tbody>\n",
441
+ "</table>\n",
442
+ "</div>"
443
+ ],
444
+ "text/plain": [
445
+ " date sentiment ticker_TSLA\n",
446
+ "0 2022-12-14 0.102207 1.0\n",
447
+ "1 2023-02-21 0.155833 1.0\n",
448
+ "2 2023-08-17 0.024046 1.0\n",
449
+ "3 2022-09-16 0.087306 1.0\n",
450
+ "4 2023-08-28 0.024046 1.0"
451
+ ]
452
+ },
453
+ "execution_count": 11,
454
+ "metadata": {},
455
+ "output_type": "execute_result"
456
+ }
457
+ ],
458
+ "source": [
459
+ "#Inspecting X train after onehotencoding 'Ticker'\n",
460
+ "X_train.head()"
461
+ ]
462
+ },
463
+ {
464
+ "cell_type": "code",
465
+ "execution_count": 12,
466
+ "metadata": {},
467
+ "outputs": [],
468
+ "source": [
469
+ "#Doing the same for X test as done to X train\n",
470
+ "\n",
471
+ "tickers = X_test[['ticker']]\n",
472
+ "\n",
473
+ "# Initializing OneHotEncoder\n",
474
+ "encoder = OneHotEncoder()\n",
475
+ "\n",
476
+ "# Fitting and transforming the 'ticker' column\n",
477
+ "ticker_encoded_test = encoder.fit_transform(tickers)\n",
478
+ "\n",
479
+ "# Converting the encoded column into a DataFrame\n",
480
+ "ticker_encoded_df_test = pd.DataFrame(ticker_encoded_test.toarray(), columns=encoder.get_feature_names_out(['ticker']))\n",
481
+ "\n",
482
+ "# Concatenating the encoded DataFrame with the original DataFrame\n",
483
+ "X_test = pd.concat([X_test, ticker_encoded_df_test], axis=1)\n",
484
+ "\n",
485
+ "# Dropping the original 'ticker' column\n",
486
+ "X_test.drop('ticker', axis=1, inplace=True)"
487
+ ]
488
+ },
489
+ {
490
+ "cell_type": "code",
491
+ "execution_count": 13,
492
+ "metadata": {},
493
+ "outputs": [],
494
+ "source": [
495
+ "#Loading in MinMaxScaler to be used on the target variable 'open'\n",
496
+ "scaler = MinMaxScaler()\n",
497
+ "\n",
498
+ "# Fitting and transforming the 'open' column\n",
499
+ "y_train['open_scaled'] = scaler.fit_transform(y_train[['open']])\n",
500
+ "y_train.drop('open', axis=1, inplace=True)"
501
+ ]
502
+ },
503
+ {
504
+ "cell_type": "code",
505
+ "execution_count": 14,
506
+ "metadata": {},
507
+ "outputs": [],
508
+ "source": [
509
+ "#Doing the same to y_test as done to y_train \n",
510
+ "y_test['open_scaled'] = scaler.fit_transform(y_test[['open']])\n",
511
+ "y_test.drop('open', axis=1, inplace=True)"
512
+ ]
513
+ },
514
+ {
515
+ "cell_type": "code",
516
+ "execution_count": 15,
517
+ "metadata": {},
518
+ "outputs": [],
519
+ "source": [
520
+ "#Defining the function for the LSTM model\n",
521
+ "def create_model(input_shape,\n",
522
+ " LSTM_filters=64,\n",
523
+ " dropout=0.1,\n",
524
+ " recurrent_dropout=0.1,\n",
525
+ " dense_dropout=0.5,\n",
526
+ " activation='relu',\n",
527
+ " depth=1):\n",
528
+ "\n",
529
+ " model = Sequential()\n",
530
+ "\n",
531
+ " # Input layer\n",
532
+ " model.add(Input(shape=input_shape))\n",
533
+ "\n",
534
+ " if depth > 1:\n",
535
+ " for i in range(1, depth):\n",
536
+ " # Recurrent layer\n",
537
+ " model.add(LSTM(LSTM_filters, return_sequences=True, dropout=dropout, recurrent_dropout=recurrent_dropout))\n",
538
+ "\n",
539
+ " # Recurrent layer\n",
540
+ " model.add(LSTM(LSTM_filters, return_sequences=False, dropout=dropout, recurrent_dropout=recurrent_dropout))\n",
541
+ "\n",
542
+ " # Fully connected layer\n",
543
+ " if activation == 'relu':\n",
544
+ " model.add(Dense(LSTM_filters, activation='relu'))\n",
545
+ " elif activation == 'leaky_relu':\n",
546
+ " model.add(Dense(LSTM_filters))\n",
547
+ " model.add(tf.keras.layers.LeakyReLU(alpha=0.1))\n",
548
+ "\n",
549
+ " # Dropout for regularization\n",
550
+ " model.add(Dropout(dense_dropout))\n",
551
+ "\n",
552
+ " # Output layer for predicting one day forward\n",
553
+ " model.add(Dense(1, activation='linear'))\n",
554
+ "\n",
555
+ " # Compile the model\n",
556
+ " model.compile(optimizer='adam', loss='mse')\n",
557
+ "\n",
558
+ " return model"
559
+ ]
560
+ },
561
+ {
562
+ "cell_type": "code",
563
+ "execution_count": 16,
564
+ "metadata": {},
565
+ "outputs": [
566
+ {
567
+ "name": "stdout",
568
+ "output_type": "stream",
569
+ "text": [
570
+ "2024-05-07 10:28:33,332 WARNING: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n",
571
+ "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n",
572
+ "\n"
573
+ ]
574
+ }
575
+ ],
576
+ "source": [
577
+ "# As X_train['date'] column exists and is in datetime format, we're converting it\n",
578
+ "X_train['year'] = X_train['date'].dt.year\n",
579
+ "X_train['month'] = X_train['date'].dt.month\n",
580
+ "X_train['day'] = X_train['date'].dt.day\n",
581
+ "\n",
582
+ "# Dropping the original date column\n",
583
+ "X_train.drop(columns=['date'], inplace=True)\n",
584
+ "\n",
585
+ "# Converting dataframe to numpy array\n",
586
+ "X_train_array = X_train.to_numpy()\n",
587
+ "\n",
588
+ "# Reshaping the array to have a shape suitable for LSTM\n",
589
+ "X_train_array = np.expand_dims(X_train_array, axis=1)"
590
+ ]
591
+ },
592
+ {
593
+ "cell_type": "code",
594
+ "execution_count": 17,
595
+ "metadata": {},
596
+ "outputs": [],
597
+ "source": [
598
+ "# Convert DataFrame to numpy array\n",
599
+ "X_train_array = X_train.values\n",
600
+ "\n",
601
+ "# Reshaping X_train_array to add a time step dimension\n",
602
+ "X_train_reshaped = X_train_array.reshape(X_train_array.shape[0], 1, X_train_array.shape[1])\n",
603
+ "\n",
604
+ "# Assuming X_train_reshaped shape is now (374, 1, 5)\n",
605
+ "input_shape = X_train_reshaped.shape[1:]\n",
606
+ "\n",
607
+ "# Create the model\n",
608
+ "model = create_model(input_shape=input_shape)"
609
+ ]
610
+ },
611
+ {
612
+ "cell_type": "code",
613
+ "execution_count": 18,
614
+ "metadata": {},
615
+ "outputs": [
616
+ {
617
+ "name": "stdout",
618
+ "output_type": "stream",
619
+ "text": [
620
+ "\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m4s\u001b[0m 5ms/step - loss: 0.5131\n"
621
+ ]
622
+ },
623
+ {
624
+ "data": {
625
+ "text/plain": [
626
+ "<keras.src.callbacks.history.History at 0x1c3aa79ff50>"
627
+ ]
628
+ },
629
+ "execution_count": 18,
630
+ "metadata": {},
631
+ "output_type": "execute_result"
632
+ }
633
+ ],
634
+ "source": [
635
+ "#Fitting the model on the training dataset\n",
636
+ "model.fit(X_train_reshaped, y_train)"
637
+ ]
638
+ },
639
+ {
640
+ "cell_type": "code",
641
+ "execution_count": 19,
642
+ "metadata": {},
643
+ "outputs": [
644
+ {
645
+ "name": "stdout",
646
+ "output_type": "stream",
647
+ "text": [
648
+ "2024-05-07 10:28:39,020 WARNING: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n",
649
+ "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n",
650
+ "\n"
651
+ ]
652
+ }
653
+ ],
654
+ "source": [
655
+ "# As X_test['date'] column exists and is in datetime format, we're converting it\n",
656
+ "X_test['year'] = X_test['date'].dt.year\n",
657
+ "X_test['month'] = X_test['date'].dt.month\n",
658
+ "X_test['day'] = X_test['date'].dt.day\n",
659
+ "\n",
660
+ "# Dropping the original date column\n",
661
+ "X_test.drop(columns=['date'], inplace=True)\n",
662
+ "\n",
663
+ "# Converting dataframe to numpy array\n",
664
+ "X_test_array = X_test.to_numpy()\n",
665
+ "\n",
666
+ "# Reshape the array to have a shape suitable for LSTM\n",
667
+ "X_test_array = np.expand_dims(X_test_array, axis=1)"
668
+ ]
669
+ },
670
+ {
671
+ "cell_type": "code",
672
+ "execution_count": 20,
673
+ "metadata": {},
674
+ "outputs": [
675
+ {
676
+ "name": "stdout",
677
+ "output_type": "stream",
678
+ "text": [
679
+ "\u001b[1m3/3\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 307ms/step\n"
680
+ ]
681
+ }
682
+ ],
683
+ "source": [
684
+ "#Predicting y_pred with X_test\n",
685
+ "y_pred = model.predict(X_test_array)"
686
+ ]
687
+ },
688
+ {
689
+ "cell_type": "code",
690
+ "execution_count": 21,
691
+ "metadata": {},
692
+ "outputs": [
693
+ {
694
+ "name": "stdout",
695
+ "output_type": "stream",
696
+ "text": [
697
+ "Connected. Call `.close()` to terminate connection gracefully.\n"
698
+ ]
699
+ }
700
+ ],
701
+ "source": [
702
+ "#Conneting to hopsworks model registry\n",
703
+ "mr = project.get_model_registry()"
704
+ ]
705
+ },
706
+ {
707
+ "cell_type": "code",
708
+ "execution_count": 22,
709
+ "metadata": {},
710
+ "outputs": [
711
+ {
712
+ "data": {
713
+ "text/plain": [
714
+ "{'RMSE': 0.3981142064349763}"
715
+ ]
716
+ },
717
+ "execution_count": 22,
718
+ "metadata": {},
719
+ "output_type": "execute_result"
720
+ }
721
+ ],
722
+ "source": [
723
+ "# Compute RMSE metric for filling the model\n",
724
+ "rmse = np.sqrt(mean_squared_error(y_test, y_pred))\n",
725
+ "rmse_metrics = {\"RMSE\": rmse}\n",
726
+ "rmse_metrics"
727
+ ]
728
+ },
729
+ {
730
+ "cell_type": "code",
731
+ "execution_count": 23,
732
+ "metadata": {},
733
+ "outputs": [],
734
+ "source": [
735
+ "#Setting up the model schema\n",
736
+ "input_schema = Schema(X_train)\n",
737
+ "output_schema = Schema(y_train)\n",
738
+ "model_schema = ModelSchema(input_schema, output_schema)"
739
+ ]
740
+ },
741
+ {
742
+ "cell_type": "code",
743
+ "execution_count": 24,
744
+ "metadata": {},
745
+ "outputs": [],
746
+ "source": [
747
+ "#Creating a file colled 'stock_model'\n",
748
+ "model_dir=\"stock_model\"\n",
749
+ "if os.path.isdir(model_dir) == False:\n",
750
+ " os.mkdir(model_dir)"
751
+ ]
752
+ },
753
+ {
754
+ "cell_type": "code",
755
+ "execution_count": 25,
756
+ "metadata": {},
757
+ "outputs": [
758
+ {
759
+ "data": {
760
+ "application/vnd.jupyter.widget-view+json": {
761
+ "model_id": "a6169babeb154f54bdbb9b0b490333ab",
762
+ "version_major": 2,
763
+ "version_minor": 0
764
+ },
765
+ "text/plain": [
766
+ " 0%| | 0/6 [00:00<?, ?it/s]"
767
+ ]
768
+ },
769
+ "metadata": {},
770
+ "output_type": "display_data"
771
+ },
772
+ {
773
+ "data": {
774
+ "application/vnd.jupyter.widget-view+json": {
775
+ "model_id": "f5749cebd1fe422dbeaba0ec2718a3f9",
776
+ "version_major": 2,
777
+ "version_minor": 0
778
+ },
779
+ "text/plain": [
780
+ "Uploading: 0.000%| | 0/561 elapsed<00:00 remaining<?"
781
+ ]
782
+ },
783
+ "metadata": {},
784
+ "output_type": "display_data"
785
+ },
786
+ {
787
+ "name": "stdout",
788
+ "output_type": "stream",
789
+ "text": [
790
+ "Model created, explore it at https://c.app.hopsworks.ai:443/p/693399/models/stock_pred_model/6\n"
791
+ ]
792
+ },
793
+ {
794
+ "data": {
795
+ "text/plain": [
796
+ "Model(name: 'stock_pred_model', version: 6)"
797
+ ]
798
+ },
799
+ "execution_count": 25,
800
+ "metadata": {},
801
+ "output_type": "execute_result"
802
+ }
803
+ ],
804
+ "source": [
805
+ "#Saving the model to hopsworks model registry\n",
806
+ "stock_pred_model = mr.tensorflow.create_model(\n",
807
+ " name=\"stock_pred_model\",\n",
808
+ " metrics= rmse_metrics,\n",
809
+ " model_schema=model_schema,\n",
810
+ " description=\"Stock Market TSLA Predictor from News Sentiment\",\n",
811
+ " )\n",
812
+ "\n",
813
+ "stock_pred_model.save(model_dir)"
814
+ ]
815
+ }
816
+ ],
817
+ "metadata": {
818
+ "kernelspec": {
819
+ "display_name": "base",
820
+ "language": "python",
821
+ "name": "python3"
822
+ },
823
+ "language_info": {
824
+ "codemirror_mode": {
825
+ "name": "ipython",
826
+ "version": 3
827
+ },
828
+ "file_extension": ".py",
829
+ "mimetype": "text/x-python",
830
+ "name": "python",
831
+ "nbconvert_exporter": "python",
832
+ "pygments_lexer": "ipython3",
833
+ "version": "3.11.9"
834
+ },
835
+ "orig_nbformat": 4
836
+ },
837
+ "nbformat": 4,
838
+ "nbformat_minor": 2
839
+ }
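A caveat on the training notebook above: the MinMaxScaler is refit on y_test, which puts the two splits on different scales and makes the reported RMSE hard to interpret. A sketch of the conventional alternative, fitting on the training target only and reusing that scaler (the same pattern applies to the OneHotEncoder that is refit on X_test):

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
# Fit on the training target only...
y_train['open_scaled'] = scaler.fit_transform(y_train[['open']])
# ...and reuse the fitted scaler on the test target (transform, not fit_transform)
y_test['open_scaled'] = scaler.transform(y_test[['open']])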
Stocks news prediction/Notebooks/8_inference_pipeline.ipynb ADDED
@@ -0,0 +1,315 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "Connected. Call `.close()` to terminate connection gracefully.\n",
13
+ "\n",
14
+ "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/549016\n",
15
+ "Connected. Call `.close()` to terminate connection gracefully.\n",
16
+ "Connected. Call `.close()` to terminate connection gracefully.\n"
17
+ ]
18
+ },
19
+ {
20
+ "name": "stderr",
21
+ "output_type": "stream",
22
+ "text": [
23
+ "../src/arrow/status.cc:137: DoAction result was not fully consumed: Cancelled: Flight cancelled call, with message: CANCELLED. Detail: Cancelled\n"
24
+ ]
25
+ },
26
+ {
27
+ "name": "stdout",
28
+ "output_type": "stream",
29
+ "text": [
30
+ "Training dataset job started successfully, you can follow the progress at \n",
31
+ "https://c.app.hopsworks.ai/p/549016/jobs/named/tesla_stocks_fv_1_create_fv_td_07052024090051/executions\n",
32
+ "2024-05-07 11:02:21,906 WARNING: VersionWarning: Incremented version to `1`.\n",
33
+ "\n",
34
+ "\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 3ms/step - loss: 0.5555\n",
35
+ "\u001b[1m3/3\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 61ms/step\n",
36
+ "Connected. Call `.close()` to terminate connection gracefully.\n"
37
+ ]
38
+ },
39
+ {
40
+ "data": {
41
+ "application/vnd.jupyter.widget-view+json": {
42
+ "model_id": "1dd33e12e80548c99f5a605b28f82196",
43
+ "version_major": 2,
44
+ "version_minor": 0
45
+ },
46
+ "text/plain": [
47
+ " 0%| | 0/6 [00:00<?, ?it/s]"
48
+ ]
49
+ },
50
+ "metadata": {},
51
+ "output_type": "display_data"
52
+ },
53
+ {
54
+ "data": {
55
+ "application/vnd.jupyter.widget-view+json": {
56
+ "model_id": "b636479e09e94fb2a0c5736c2368aec4",
57
+ "version_major": 2,
58
+ "version_minor": 0
59
+ },
60
+ "text/plain": [
61
+ "Uploading: 0.000%| | 0/528 elapsed<00:00 remaining<?"
62
+ ]
63
+ },
64
+ "metadata": {},
65
+ "output_type": "display_data"
66
+ },
67
+ {
68
+ "name": "stdout",
69
+ "output_type": "stream",
70
+ "text": [
71
+ "Model created, explore it at https://c.app.hopsworks.ai:443/p/549016/models/stock_pred_model/7\n"
72
+ ]
73
+ }
74
+ ],
75
+ "source": [
76
+ "import pandas as pd \n",
77
+ "import hopsworks \n",
78
+ "from datetime import datetime, timedelta\n",
79
+ "from SML import training_pipeline"
80
+ ]
81
+ },
82
+ {
83
+ "cell_type": "code",
84
+ "execution_count": 2,
85
+ "metadata": {},
86
+ "outputs": [
87
+ {
88
+ "name": "stdout",
89
+ "output_type": "stream",
90
+ "text": [
91
+ "Connection closed.\n",
92
+ "Connected. Call `.close()` to terminate connection gracefully.\n",
93
+ "\n",
94
+ "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/549016\n",
95
+ "Connected. Call `.close()` to terminate connection gracefully.\n",
96
+ "Connected. Call `.close()` to terminate connection gracefully.\n"
97
+ ]
98
+ }
99
+ ],
100
+ "source": [
101
+ "project = hopsworks.login()\n",
102
+ "fs= project.get_feature_store()\n",
103
+ "mr = project.get_model_registry() "
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": 3,
109
+ "metadata": {},
110
+ "outputs": [
111
+ {
112
+ "name": "stdout",
113
+ "output_type": "stream",
114
+ "text": [
115
+ "2024-05-06\n"
116
+ ]
117
+ }
118
+ ],
119
+ "source": [
120
+ "start_date = datetime.now() - timedelta(hours=24)\n",
121
+ "print(start_date.strftime(\"%Y-%m-%d\"))"
122
+ ]
123
+ },
124
+ {
125
+ "cell_type": "code",
126
+ "execution_count": 12,
127
+ "metadata": {},
128
+ "outputs": [
129
+ {
130
+ "name": "stdout",
131
+ "output_type": "stream",
132
+ "text": [
133
+ "2024-05-07\n"
134
+ ]
135
+ }
136
+ ],
137
+ "source": [
138
+ "end_date = datetime.now().strftime(\"%Y-%m-%d\")\n",
139
+ "print(end_date)"
140
+ ]
141
+ },
142
+ {
143
+ "cell_type": "code",
144
+ "execution_count": 13,
145
+ "metadata": {},
146
+ "outputs": [],
147
+ "source": [
148
+ "feature_view = fs.get_feature_view('tesla_stocks_fv', 1)\n",
149
+ "feature_view.init_batch_scoring(training_dataset_version=1)"
150
+ ]
151
+ },
152
+ {
153
+ "cell_type": "code",
154
+ "execution_count": 14,
155
+ "metadata": {},
156
+ "outputs": [
157
+ {
158
+ "name": "stdout",
159
+ "output_type": "stream",
160
+ "text": [
161
+ "WITH right_fg0 AS (SELECT *\n",
162
+ "FROM (SELECT `fg1`.`date` `date`, `fg1`.`ticker` `ticker`, `fg1`.`ticker` `join_pk_ticker`, `fg1`.`date` `join_evt_date`, `fg0`.`sentiment` `sentiment`, RANK() OVER (PARTITION BY `fg1`.`ticker`, `fg1`.`date` ORDER BY `fg0`.`date` DESC) pit_rank_hopsworks\n",
163
+ "FROM `mtzeve_featurestore`.`tesla_stock_1` `fg1`\n",
164
+ "INNER JOIN `mtzeve_featurestore`.`news_sentiment_updated_1` `fg0` ON `fg1`.`ticker` = `fg0`.`ticker` AND `fg1`.`date` >= `fg0`.`date`) NA\n",
165
+ "WHERE `pit_rank_hopsworks` = 1) (SELECT `right_fg0`.`date` `date`, `right_fg0`.`ticker` `ticker`, `right_fg0`.`sentiment` `sentiment`\n",
166
+ "FROM right_fg0)\n"
167
+ ]
168
+ }
169
+ ],
170
+ "source": [
171
+ "print(feature_view.get_batch_query())"
172
+ ]
173
+ },
174
+ {
175
+ "cell_type": "code",
176
+ "execution_count": 16,
177
+ "metadata": {},
178
+ "outputs": [
179
+ {
180
+ "name": "stdout",
181
+ "output_type": "stream",
182
+ "text": [
183
+ "Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.11s) \n"
184
+ ]
185
+ },
186
+ {
187
+ "data": {
188
+ "text/html": [
189
+ "<div>\n",
190
+ "<style scoped>\n",
191
+ " .dataframe tbody tr th:only-of-type {\n",
192
+ " vertical-align: middle;\n",
193
+ " }\n",
194
+ "\n",
195
+ " .dataframe tbody tr th {\n",
196
+ " vertical-align: top;\n",
197
+ " }\n",
198
+ "\n",
199
+ " .dataframe thead th {\n",
200
+ " text-align: right;\n",
201
+ " }\n",
202
+ "</style>\n",
203
+ "<table border=\"1\" class=\"dataframe\">\n",
204
+ " <thead>\n",
205
+ " <tr style=\"text-align: right;\">\n",
206
+ " <th></th>\n",
207
+ " <th>date</th>\n",
208
+ " <th>ticker</th>\n",
209
+ " <th>sentiment</th>\n",
210
+ " </tr>\n",
211
+ " </thead>\n",
212
+ " <tbody>\n",
213
+ " </tbody>\n",
214
+ "</table>\n",
215
+ "</div>"
216
+ ],
217
+ "text/plain": [
218
+ "Empty DataFrame\n",
219
+ "Columns: [date, ticker, sentiment]\n",
220
+ "Index: []"
221
+ ]
222
+ },
223
+ "execution_count": 16,
224
+ "metadata": {},
225
+ "output_type": "execute_result"
226
+ }
227
+ ],
228
+ "source": [
229
+ "# we had problems fetching the data from fv with get_batch_data function, tried everything and it just doesnt work \n",
230
+ "tsla_df_b = feature_view.get_batch_data(start_time = start_date, end_time = end_date)\n",
231
+ "tsla_df_b"
232
+ ]
233
+ },
234
+ {
235
+ "cell_type": "code",
236
+ "execution_count": 11,
237
+ "metadata": {},
238
+ "outputs": [
239
+ {
240
+ "name": "stdout",
241
+ "output_type": "stream",
242
+ "text": [
243
+ "Downloading model artifact (0 dirs, 1 files)... DONE\r"
244
+ ]
245
+ },
246
+ {
247
+ "ename": "FileNotFoundError",
248
+ "evalue": "[Errno 2] No such file or directory: '/var/folders/ty/fy7wpfqs4c39hnsfl21_rzyc0000gn/T/d6edbe1d-de39-488f-b12c-c0cbfd5ded37/stock_pred_model/7stock_model'",
249
+ "output_type": "error",
250
+ "traceback": [
251
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
252
+ "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
253
+ "\u001b[1;32m/Users/manos/Documents/BDS/Mlops_mod_final/MLops_mod/inference_pipeline.ipynb Cell 8\u001b[0m line \u001b[0;36m5\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/manos/Documents/BDS/Mlops_mod_final/MLops_mod/inference_pipeline.ipynb#X11sZmlsZQ%3D%3D?line=1'>2</a>\u001b[0m the_model \u001b[39m=\u001b[39m mr\u001b[39m.\u001b[39mget_model(\u001b[39m\"\u001b[39m\u001b[39mstock_pred_model\u001b[39m\u001b[39m\"\u001b[39m, version\u001b[39m=\u001b[39m\u001b[39m7\u001b[39m)\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/manos/Documents/BDS/Mlops_mod_final/MLops_mod/inference_pipeline.ipynb#X11sZmlsZQ%3D%3D?line=2'>3</a>\u001b[0m model_dir \u001b[39m=\u001b[39m the_model\u001b[39m.\u001b[39mdownload()\n\u001b[0;32m----> <a href='vscode-notebook-cell:/Users/manos/Documents/BDS/Mlops_mod_final/MLops_mod/inference_pipeline.ipynb#X11sZmlsZQ%3D%3D?line=4'>5</a>\u001b[0m model \u001b[39m=\u001b[39m joblib\u001b[39m.\u001b[39mload(model_dir \u001b[39m+\u001b[39m \u001b[39m'\u001b[39m\u001b[39mstock_model\u001b[39m\u001b[39m'\u001b[39m)\n",
254
+ "File \u001b[0;32m/Applications/anaconda3/lib/python3.11/site-packages/joblib/numpy_pickle.py:650\u001b[0m, in \u001b[0;36mload\u001b[0;34m(filename, mmap_mode)\u001b[0m\n\u001b[1;32m 648\u001b[0m obj \u001b[39m=\u001b[39m _unpickle(fobj)\n\u001b[1;32m 649\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 650\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39m(filename, \u001b[39m'\u001b[39m\u001b[39mrb\u001b[39m\u001b[39m'\u001b[39m) \u001b[39mas\u001b[39;00m f:\n\u001b[1;32m 651\u001b[0m \u001b[39mwith\u001b[39;00m _read_fileobject(f, filename, mmap_mode) \u001b[39mas\u001b[39;00m fobj:\n\u001b[1;32m 652\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(fobj, \u001b[39mstr\u001b[39m):\n\u001b[1;32m 653\u001b[0m \u001b[39m# if the returned file object is a string, this means we\u001b[39;00m\n\u001b[1;32m 654\u001b[0m \u001b[39m# try to load a pickle file generated with an version of\u001b[39;00m\n\u001b[1;32m 655\u001b[0m \u001b[39m# Joblib so we load it with joblib compatibility function.\u001b[39;00m\n",
255
+ "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/var/folders/ty/fy7wpfqs4c39hnsfl21_rzyc0000gn/T/d6edbe1d-de39-488f-b12c-c0cbfd5ded37/stock_pred_model/7stock_model'"
256
+ ]
257
+ }
258
+ ],
259
+ "source": [
260
+ "import joblib\n",
261
+ "the_model = mr.get_model(\"stock_pred_model\", version=7)\n",
262
+ "model_dir = the_model.download()\n",
263
+ "\n",
264
+ "model = joblib.load(model_dir + 'stock_model')"
265
+ ]
266
+ },
267
+ {
268
+ "cell_type": "code",
269
+ "execution_count": null,
270
+ "metadata": {},
271
+ "outputs": [],
272
+ "source": [
273
+ "predictions = model.predict(tsla_df_b)"
274
+ ]
275
+ },
276
+ {
277
+ "cell_type": "code",
278
+ "execution_count": null,
279
+ "metadata": {},
280
+ "outputs": [],
281
+ "source": [
282
+ "predictions "
283
+ ]
284
+ },
285
+ {
286
+ "cell_type": "code",
287
+ "execution_count": null,
288
+ "metadata": {},
289
+ "outputs": [],
290
+ "source": []
291
+ }
292
+ ],
293
+ "metadata": {
294
+ "kernelspec": {
295
+ "display_name": "base",
296
+ "language": "python",
297
+ "name": "python3"
298
+ },
299
+ "language_info": {
300
+ "codemirror_mode": {
301
+ "name": "ipython",
302
+ "version": 3
303
+ },
304
+ "file_extension": ".py",
305
+ "mimetype": "text/x-python",
306
+ "name": "python",
307
+ "nbconvert_exporter": "python",
308
+ "pygments_lexer": "ipython3",
309
+ "version": "3.11.4"
310
+ },
311
+ "orig_nbformat": 4
312
+ },
313
+ "nbformat": 4,
314
+ "nbformat_minor": 2
315
+ }
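The FileNotFoundError at the end of the inference notebook comes from model_dir + 'stock_model' joining the download directory and artifact name without a path separator (note '7stock_model' in the traceback), and from loading a TensorFlow model with joblib. A sketch of a fix, reusing the mr handle from the cells above and assuming the registered artifact is in a format Keras can read:

import os
from tensorflow.keras.models import load_model

the_model = mr.get_model("stock_pred_model", version=7)
model_dir = the_model.download()
# os.path.join supplies the missing path separator; load_model replaces
# joblib.load, since the model was registered via mr.tensorflow
model = load_model(os.path.join(model_dir, "stock_model"))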
Stocks news prediction/SML/__pycache__/feature_pipeline.cpython-311.pyc ADDED
Binary file (2.74 kB).
 
Stocks news prediction/SML/__pycache__/news_preprocessing.cpython-311.pyc ADDED
Binary file (2.54 kB).
 
feature_pipeline.py → Stocks news prediction/SML/feature_pipeline.py RENAMED
@@ -1,10 +1,3 @@
1
- # %%
2
- from dotenv import load_dotenv
3
- import os
4
-
5
- # %%
6
- #!pip install great_expectations==0.18.12
7
-
8
  # %%
9
  # Import necessary libraries
10
  import pandas as pd # For data manipulation using DataFrames
@@ -13,119 +6,85 @@ import matplotlib.pyplot as plt # For data visualization
13
  import os # For operating system-related tasks
14
  import joblib # For saving and loading models
15
  import hopsworks # For getting access to hopsworks
16
-
17
-
18
 
19
  # Import specific modules from scikit-learn
20
  from sklearn.preprocessing import StandardScaler, OneHotEncoder # For data preprocessing
21
  from sklearn.metrics import accuracy_score # For evaluating model accuracy
22
 
23
- # %%
24
- #from alpha_vantage.timeseries import TimeSeries
25
- #import pandas as pd
26
-
27
- #load_dotenv()
28
-
29
- #api_key = os.environ.get('stocks_api') # Replace this with your actual API key
30
- #ts = TimeSeries(key=api_key, output_format='pandas')
31
-
32
- # Fetch daily adjusted stock prices; adjust the symbol as needed
33
- #data, meta_data = ts.get_daily(symbol='TSLA', outputsize='full')
34
-
35
- #print(data.head())
36
-
37
- # %%
38
- #data.info()
39
-
40
- # %%
41
- #meta_data
42
-
43
- # %%
44
- # Define your file path and name
45
- #file_path = 'TSLA_stock_price.csv' # Customize the path and filename
46
-
47
- # Save the DataFrame to CSV
48
- #stock_data.to_csv(file_path)
49
-
50
- #print(f"Data saved to {file_path}")
51
-
52
-
53
- # %%
54
- # Load and display the data from CSV to confirm
55
- tsla_df = pd.read_csv('TSLA_stock_price.csv')
56
- print(tsla_df.head())
57
-
58
 
59
- # %%
60
  api_key = os.environ.get('hopsworks_api')
61
  project = hopsworks.login(api_key_value=api_key)
62
  fs = project.get_feature_store()
63
 
64
  # %%
65
- import re
 
 
66
 
67
  # %%
 
68
  def clean_column_name(name):
69
  # Remove all non-letter characters
70
  cleaned_name = re.sub(r'[^a-zA-Z]', '', name)
71
  return cleaned_name
72
 
73
-
74
  # %%
75
  tsla_df
76
 
77
  # %%
78
- # Assuming 'tsla_df' is your DataFrame
79
  tsla_df.columns = [clean_column_name(col) for col in tsla_df.columns]
80
-
81
-
82
- # %%
83
  print(tsla_df.columns)
84
 
85
-
86
  # %%
87
- import pandas as pd
88
-
89
- # Assuming tsla_df is your pandas DataFrame
90
- # Convert the "date" column to timestamp
91
  tsla_df['date'] = pd.to_datetime(tsla_df['date'])
92
 
93
-
94
  # %%
95
- # Define a feature group
96
  tesla_fg = fs.get_or_create_feature_group(
97
  name="tesla_stock",
98
  description="Tesla stock dataset from alpha vantage",
99
- version=2,
100
  primary_key=["ticker"],
101
  event_time=['date'],
102
  online_enabled=False,
103
  )
104
 
105
  # %%
 
106
  tesla_fg.insert(tsla_df, write_options={"wait_for_job" : False})
107
 
108
  # %%
 
109
  news_df = pd.read_csv('news_articles_ema.csv')
110
 
111
-
112
  # %%
 
113
  news_df_updated = news_df.drop(columns=['exp_mean_7_days'])
114
 
115
  # %%
 
116
  news_df_updated['date'] = pd.to_datetime(news_df_updated['date'])
117
 
118
  # %%
 
119
  news_sentiment_fg = fs.get_or_create_feature_group(
120
  name='news_sentiment_updated',
121
  description='News sentiment from Polygon',
122
- version=2,
123
  primary_key=['ticker'],
124
  event_time=['date'],
125
  online_enabled=False,
126
  )
127
 
128
  # %%
 
129
  news_sentiment_fg.insert(news_df_updated)
130
 
131
 
1
  # %%
2
  # Import necessary libraries
3
  import pandas as pd # For data manipulation using DataFrames
 
6
  import os # For operating system-related tasks
7
  import joblib # For saving and loading models
8
  import hopsworks # For getting access to hopsworks
9
+ import re
 
10
 
11
  # Import specific modules from scikit-learn
12
  from sklearn.preprocessing import StandardScaler, OneHotEncoder # For data preprocessing
13
  from sklearn.metrics import accuracy_score # For evaluating model accuracy
14
 
15
+ from dotenv import load_dotenv
16
+ import os
17
+ load_dotenv()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
+ #Connecting to hopsworks
20
  api_key = os.environ.get('hopsworks_api')
21
  project = hopsworks.login(api_key_value=api_key)
22
  fs = project.get_feature_store()
23
 
24
  # %%
25
+ # Load and display the data from CSV to confirm
26
+ tsla_df = pd.read_csv('TSLA_stock_price.csv')
27
+ print(tsla_df.head())
28
 
29
  # %%
30
+ #Defining a function to clean the column names
31
  def clean_column_name(name):
32
  # Remove all non-letter characters
33
  cleaned_name = re.sub(r'[^a-zA-Z]', '', name)
34
  return cleaned_name
35
 
 
36
  # %%
37
  tsla_df
38
 
39
  # %%
40
+ # Cleaning up column names for 'tsla_df'
41
  tsla_df.columns = [clean_column_name(col) for col in tsla_df.columns]
 
 
 
42
  print(tsla_df.columns)
43
 
 
44
  # %%
45
+ # Converting the "date" column to timestamp
 
 
 
46
  tsla_df['date'] = pd.to_datetime(tsla_df['date'])
47
 
 
48
  # %%
49
+ # Defining the stocks feature group
50
  tesla_fg = fs.get_or_create_feature_group(
51
  name="tesla_stock",
52
  description="Tesla stock dataset from alpha vantage",
53
+ version=1,
54
  primary_key=["ticker"],
55
  event_time=['date'],
56
  online_enabled=False,
57
  )
58
 
59
  # %%
60
+ #Inserting the stock data into the stocks feature group
61
  tesla_fg.insert(tsla_df, write_options={"wait_for_job" : False})
62
 
63
  # %%
64
+ #Collecting news df
65
  news_df = pd.read_csv('news_articles_ema.csv')
66
 
 
67
  # %%
68
+ #Dropping the exp_mean_7_days column
69
  news_df_updated = news_df.drop(columns=['exp_mean_7_days'])
70
 
71
  # %%
72
+ #Converting the date column to datetime
73
  news_df_updated['date'] = pd.to_datetime(news_df_updated['date'])
74
 
75
  # %%
76
+ #Defining the news feature group
77
  news_sentiment_fg = fs.get_or_create_feature_group(
78
  name='news_sentiment_updated',
79
  description='News sentiment from Polygon',
80
+ version=1,
81
  primary_key=['ticker'],
82
  event_time=['date'],
83
  online_enabled=False,
84
  )
85
 
86
  # %%
87
+ #Inserting the news data into the news feature group
88
  news_sentiment_fg.insert(news_df_updated)
89
 
90
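For reference, clean_column_name strips everything but letters, which is how the Alpha Vantage headers shown earlier ('1. open', '2. high', ...) become the clean names in the tesla_stock feature group:

import re

def clean_column_name(name):
    # Remove all non-letter characters
    return re.sub(r'[^a-zA-Z]', '', name)

print(clean_column_name('1. open'))    # -> 'open'
print(clean_column_name('5. volume'))  # -> 'volume'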
 
feature_view.py → Stocks news prediction/SML/feature_view.py RENAMED
@@ -1,5 +1,5 @@
1
  # %%
2
- # Import necessary libraries
3
  import pandas as pd # For data manipulation using DataFrames
4
  import numpy as np # For numerical operations
5
  import matplotlib.pyplot as plt # For data visualization
@@ -7,39 +7,34 @@ import os # For operating system-related tasks
7
  import joblib # For saving and loading models
8
  import hopsworks # For getting access to hopsworks
9
 
 
 
10
 
11
-
12
- # Import specific modules from scikit-learn
13
- from sklearn.preprocessing import StandardScaler, OneHotEncoder # For data preprocessing
14
- from sklearn.metrics import accuracy_score # For evaluating model accuracy
15
-
16
- # %%
17
- from feature_pipeline import tesla_fg
18
- from feature_pipeline import news_sentiment_fg
19
-
20
- # %%
21
  from dotenv import load_dotenv
22
  import os
23
 
24
  load_dotenv()
25
 
26
- # %%
27
  api_key = os.environ.get('hopsworks_api')
28
  project = hopsworks.login(api_key_value=api_key)
29
  fs = project.get_feature_store()
30
 
31
  # %%
 
 
32
  def create_stocks_feature_view(fs, version):
33
 
34
  # Loading in the feature groups
35
  tesla_fg = fs.get_feature_group('tesla_stock', version=1)
36
  news_sentiment_fg = fs.get_feature_group('news_sentiment_updated', version=1)
37
 
38
- # Define the query
39
  ds_query = tesla_fg.select(['date', 'open', 'ticker'])\
40
  .join(news_sentiment_fg.select(['sentiment']))
41
 
42
- # Create the feature view
43
  feature_view = fs.create_feature_view(
44
  name='tesla_stocks_fv',
45
  query=ds_query,
@@ -49,6 +44,7 @@ def create_stocks_feature_view(fs, version):
49
  return feature_view, tesla_fg
50
 
51
  # %%
 
52
  try:
53
  feature_view = fs.get_feature_view("tesla_stocks_fv", version=1)
54
  tesla_fg = fs.get_feature_group('tesla_stock', version=1)
@@ -56,6 +52,7 @@ except:
56
  feature_view, tesla_fg = create_stocks_feature_view(fs, 1)
57
 
58
  # %%
 
59
  def fix_data_from_feature_view(df,start_date,end_date):
60
  df = df.sort_values("date")
61
  df = df.reset_index()
@@ -77,27 +74,4 @@ def fix_data_from_feature_view(df,start_date,end_date):
77
 
78
  return filtered_df
79
 
80
- # %%
81
- #def create_stocks_feature_view(fs, version):
82
-
83
- #Loading in the feature groups
84
- # tesla_fg = fs.get_feature_group('tesla_stock', version = 3)
85
- # news_sentiment_fg = fs.get_feature_group('news_sentiment_updated', version = 2)
86
-
87
- # ds_query = tesla_fg.select(['date','open', 'ticker'])\
88
- # .join(news_sentiment_fg.select_except(['ticker','time', 'amp_url', 'image_url']))
89
-
90
- # return (fs.create_tesla_feature_view(
91
- # name = 'tsla_stocks_fv',
92
- # query = ds_query,
93
- # labels=['ticker']
94
- # ), tesla_fg)
95
-
96
- # %%
97
- #try:
98
- # feature_view = fs.get_feature_view("tsla_stocks_fv", version=1)
99
- # tesla_fg = fs.get_feature_group('tesla_stock', version=3)
100
- #except:
101
- # feature_view, tesla_fg = create_stocks_feature_view(fs, 1)
102
-
103
 
 
1
  # %%
2
+ # Importing necessary libraries
3
  import pandas as pd # For data manipulation using DataFrames
4
  import numpy as np # For numerical operations
5
  import matplotlib.pyplot as plt # For data visualization
 
7
  import joblib # For saving and loading models
8
  import hopsworks # For getting access to hopsworks
9
 
10
+ from feature_pipeline import tesla_fg #Loading in the tesla_fg
11
+ from feature_pipeline import news_sentiment_fg #Loading in the news_fg
12
 
13
+ #Making the script able to read from the .env file
 
 
 
 
 
 
 
 
 
14
  from dotenv import load_dotenv
15
  import os
16
 
17
  load_dotenv()
18
 
19
+ #Getting connected to hopsworks
20
  api_key = os.environ.get('hopsworks_api')
21
  project = hopsworks.login(api_key_value=api_key)
22
  fs = project.get_feature_store()
23
 
24
  # %%
25
+ #Defining the function to create feature view
26
+
27
  def create_stocks_feature_view(fs, version):
28
 
29
  # Loading in the feature groups
30
  tesla_fg = fs.get_feature_group('tesla_stock', version=1)
31
  news_sentiment_fg = fs.get_feature_group('news_sentiment_updated', version=1)
32
 
33
+ # Defining the query
34
  ds_query = tesla_fg.select(['date', 'open', 'ticker'])\
35
  .join(news_sentiment_fg.select(['sentiment']))
36
 
37
+ # Creating the feature view
38
  feature_view = fs.create_feature_view(
39
  name='tesla_stocks_fv',
40
  query=ds_query,
 
44
  return feature_view, tesla_fg
45
 
46
  # %%
47
+ #Creating the feature view
48
  try:
49
  feature_view = fs.get_feature_view("tesla_stocks_fv", version=1)
50
  tesla_fg = fs.get_feature_group('tesla_stock', version=1)
 
52
  feature_view, tesla_fg = create_stocks_feature_view(fs, 1)
53
 
54
  # %%
55
+ #Defining a function to sort and date-filter data from the feature view
56
  def fix_data_from_feature_view(df,start_date,end_date):
57
  df = df.sort_values("date")
58
  df = df.reset_index()
 
74
 
75
  return filtered_df
76
 
77
 
Stocks news prediction/SML/historical_news.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # %%
2
+ #Importing necessary libraries
3
+ from dotenv import load_dotenv
4
+ from datetime import datetime, timedelta
5
+ import requests
6
+ import os
7
+ import time
8
+ import pandas as pd
9
+ from news_preprocessing import * #Importing everything from 'news_preprocessing'
10
+ load_dotenv()
11
+
12
+ # %%
13
+ #Defining a function for fetching news
14
+
15
+ def fetch_news(api_key, ticker, start_date, end_date):
16
+ base_url = os.environ.get("endpointnewsp")
17
+ headers = {"Authorization": f"Bearer {api_key}"}
18
+ all_news = []
19
+
20
+ current_date = start_date
21
+
22
+ while current_date <= end_date:
23
+ batch_end_date = current_date + timedelta(days=50)
24
+ if batch_end_date > end_date:
25
+ batch_end_date = end_date
26
+
27
+ params = {
28
+ "ticker": ticker,
29
+ "published_utc.gte": current_date.strftime('%Y-%m-%d'),
30
+ "published_utc.lte": batch_end_date.strftime('%Y-%m-%d'),
31
+ "limit": 50,
32
+ "sort": "published_utc"
33
+ }
34
+
35
+ try:
36
+ response = requests.get(base_url, headers=headers, params=params)
37
+ if response.status_code == 200:
38
+ data = response.json()
39
+ articles = data.get('results', [])
40
+
41
+ # Creating a DataFrame from articles
42
+ df = pd.DataFrame(articles)
43
+
44
+ # Adding primary_key column if ticker is found
45
+ df['ticker'] = df['tickers'].apply(lambda x: ticker if ticker in x else None)
46
+
47
+ all_news.append(df) # Append DataFrame to the list
48
+ print(f"Fetched {len(articles)} articles from {current_date.strftime('%Y-%m-%d')} to {batch_end_date.strftime('%Y-%m-%d')}")
49
+ current_date = batch_end_date + timedelta(days=1)
50
+ elif response.status_code == 429:
51
+ print("Rate limit reached. Waiting to retry...")
52
+ time.sleep(60) # Wait for 60 seconds or as recommended by the API
53
+ continue # Retry the current request
54
+ else:
55
+ print(f"Failed to fetch data: {response.status_code}, {response.text}")
56
+ break
57
+ except Exception as e:
58
+ print(f"An error occurred: {e}")
59
+ break
60
+
61
+ return pd.concat(all_news, ignore_index=True)
62
+
63
+ #Usage
64
+ api_key = os.environ.get('newsp_api')
65
+ ticker = 'TSLA'
66
+ end_date = datetime.now() - timedelta(days=1) # Yesterday's date
67
+ start_date = end_date - timedelta(days=365 * 2)
68
+ news_articles = fetch_news(api_key, ticker, start_date, end_date)
69
+ print(f"Total articles fetched: {len(news_articles)}")
70
+
71
+
72
+ # %%
73
+ # Process the news articles
74
+ df = process_news_articles(news_articles)
75
+
76
+ # %%
77
+ df.info()
78
+
79
+ # %%
80
+ df.head()
81
+
82
+ # %%
83
+ df = df.sort_index(ascending=False)
84
+
85
+ # %%
86
+ #Putting the news articles into a csv
87
+ df.to_csv('news_articles.csv', index=False)
88
+
89
+ # %%
90
+ df_processed = exponential_moving_average(df, window=7)
91
+
92
+ # %%
93
+ df_processed.to_csv('news_articles_ema.csv', index=False)
94
+
95
+ # %%
96
+ df_processed.head()
97
+
98
+ # %%
99
+ df_processed.tail()
100
+
101
+ # %%
102
+ print(df_processed['date'].min())
103
+ print(df_processed['date'].max())
104
+
105
+ # %%
106
+ print(df_processed['date'].max() - df_processed['date'].min())
107
+
108
+ # %%
109
+ df_processed.shape
110
+
111
+ # %%
112
+ duplicates = df_processed[df_processed.duplicated('date')]
113
+
114
+ # %%
115
+ duplicates.shape
116
+
117
+ # %%
118
+ df_processed.head()
119
+
120
+
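A standalone sketch of the 50-day windowing that fetch_news pages through above (no network calls; the generator name is illustrative):

from datetime import datetime, timedelta

def date_windows(start, end, days=50):
    # Yield consecutive windows of at most `days` days, stepping one day past each batch
    current = start
    while current <= end:
        batch_end = min(current + timedelta(days=days), end)
        yield current, batch_end
        current = batch_end + timedelta(days=1)

for lo, hi in date_windows(datetime(2022, 5, 6), datetime(2022, 9, 1)):
    print(lo.date(), '->', hi.date())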
Stocks news prediction/SML/historical_stock.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # %%
2
+ #Importing necessary libraries
3
+ from dotenv import load_dotenv
4
+ import os
5
+ from alpha_vantage.timeseries import TimeSeries
6
+ import pandas as pd
7
+ import hopsworks
8
+ import re
9
+ import modal
10
+ #preprocessing
11
+ import requests
12
+ import pandas as pd
13
+ import json
14
+ #import pandas_market_calendars as mcal
15
+ import datetime
16
+ import numpy as np
17
+ from datetime import timedelta
18
+ load_dotenv() #Loading environment variables from the .env file
19
+
20
+ # %%
21
+ #Setting up API key to being able to fetch stocks from Alpha Vantage
22
+
23
+ api_key = os.environ.get('stocks_api')
24
+ ts = TimeSeries(key=api_key, output_format='pandas')
25
+
26
+ #Defining a function to fetch stocks
27
+
28
+ def fetch_stock_prices(symbol):
29
+ # Fetch daily adjusted stock prices; adjust the symbol as needed
30
+ data, meta_data = ts.get_daily(symbol=symbol, outputsize='full')
31
+
32
+ # Add a new column named 'ticker' and fill it with the ticker name
33
+ data['ticker'] = symbol
34
+
35
+ return data
36
+
37
+ #Usage
38
+ symbol = 'TSLA'
39
+ stock_data = fetch_stock_prices(symbol)
40
+ print(stock_data.head())
41
+
42
+ # %%
43
+ # Defining the file path and name
44
+ file_path = 'TSLA_stock_price.csv'
45
+
46
+ # Saving the DataFrame to CSV
47
+ stock_data.to_csv(file_path)
48
+
49
+ print(f"Data saved to {file_path}")
50
+
51
+
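Since stock_data.to_csv(file_path) writes the date index out as a column, readers such as feature_pipeline.py get it back as a plain string column and must re-parse it; a quick round-trip sketch:

import pandas as pd

tsla_df = pd.read_csv('TSLA_stock_price.csv')
# The date index comes back as a string column after the round trip
tsla_df['date'] = pd.to_datetime(tsla_df['date'])
print(tsla_df.dtypes)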
news_preprocessing.py → Stocks news prediction/SML/news_preprocessing.py RENAMED
@@ -1,4 +1,5 @@
 # %%
+# Importing necessary libraries
 from dotenv import load_dotenv
 from datetime import datetime, timedelta
 import requests
@@ -8,6 +9,7 @@ import pandas as pd
 from textblob import TextBlob
 
 # %%
+# Defining a function to process news articles
 def process_news_articles(news_articles):
     # Convert list of dictionaries to DataFrame
     df = pd.DataFrame(news_articles)
@@ -27,19 +29,21 @@ def process_news_articles(news_articles):
     df['date'] = df['published_utc'].dt.date
     df['time'] = df['published_utc'].dt.time
 
-    # Drop unnecessary columns
+    # Dropping unnecessary columns
     df.drop(['published_utc'], axis=1, inplace=True)
     # set date to index
     df = df.set_index("date")
     df.reset_index(inplace=True)
     df.index = pd.to_datetime(df.index)
-    df = df.groupby(['date', 'ticker'])['sentiment'].mean().reset_index(name='sentiment')
+    df = df.groupby(['date', 'ticker'])['sentiment'].mean().reset_index()
 
     return df
 
 # %%
+# Defining a function for the exponential moving average
+
 def exponential_moving_average(df, window):
     # Calculate EMA on the 'sentiment' column
     df[f'exp_mean_{window}_days'] = df['sentiment'].ewm(span=window, adjust=False).mean()
     return df
stock_preprocessing.py β†’ Stocks news prediction/SML/stock_preprocessing.py RENAMED
@@ -1,4 +1,5 @@
 # %%
+# Importing necessary libraries
 from dotenv import load_dotenv
 import os
 from alpha_vantage.timeseries import TimeSeries
@@ -10,15 +11,14 @@ import modal
 import requests
 import pandas as pd
 import json
-#import pandas_market_calendars as mcal
+import pandas_market_calendars as mcal
 import datetime
 import numpy as np
 from datetime import datetime, timedelta
-
-
-# %%
 load_dotenv()
 
+# %%
+# Connecting to Alpha Vantage using the API key
 api_key = os.environ.get('stocks_api')  # Replace this with your actual API key
 ts = TimeSeries(key=api_key, output_format='pandas')
 
@@ -28,12 +28,11 @@ data, meta_data = ts.get_daily(symbol='TSLA', outputsize='full')
 print(data.head())
 
 # %%
-data
-
-# %%
+# Looking at the data info
 data.info()
 
 # %%
+# Looking at the metadata
 meta_data
 
 # %%
@@ -50,6 +49,7 @@ def today_is_a_business_day(today):
     return False
 
 # %%
+# Defining a function to find the next business day
 def next_business_day(today):
 
     # Real tomorrow
@@ -71,6 +71,7 @@ def next_business_day(today):
     return isBusinessDay.to_numpy()[0]
 
 # %%
+# Defining a function to extract business days
 def extract_business_day(start_date,end_date):
     """
     Given a start_date and end_date.
@@ -82,27 +83,27 @@ def extract_business_day(start_date,end_date):
     e.g is_open = [1,0,...,1] means that start_date = open, day after start_date = closed, and end_date = open
     """
 
-    # Save for later
+    # Saving for later
    end_date_save = end_date
 
-    # Get the NYSE calendar
+    # Getting the NYSE calendar
    cal = mcal.get_calendar('NYSE')
 
-    # Get the NYSE calendar's open and close times for the specified period
+    # Getting the NYSE calendar's open and close times for the specified period
    schedule = cal.schedule(start_date=start_date, end_date=end_date)
 
    # Only need a list of dates when it's open (not open and close times)
    isBusinessDay = np.array(schedule.market_open.dt.strftime('%Y-%m-%d'))
 
-    # Go over all days:
+    # Going over all days:
    delta = datetime.timedelta(days=1)
    start_date = datetime.datetime.strptime(start_date,"%Y-%m-%d")  # datetime.date(2015, 7, 16)
    end_date = datetime.datetime.strptime(end_date,"%Y-%m-%d")  # datetime.date(2023, 1, 4)
 
-    # Extract days from the timedelta object
+    # Extracting the number of days from the timedelta object
    num_days = (end_date - start_date).days + 1
 
-    # Create boolean array for days being open (1) and closed (0)
+    # Creating a boolean array for days being open (1) and closed (0)
    is_open = np.zeros(num_days)
 
    # iterate over range of dates
@@ -131,6 +132,7 @@ def extract_business_day(start_date,end_date):
     return isBusinessDay, is_open
 
 # %%
+# Defining a function to clean the column names
 def clean_column_name(name):
     # Remove all non-letter characters
     cleaned_name = re.sub(r'[^a-zA-Z]', '', name)
@@ -150,15 +152,12 @@ data.reset_index(inplace=True)
 data.head()
 
 # %%
-data
-
-# %%
-# Define the date range you're interested in
+# Defining the date range we're interested in
 yesterday = datetime.now() - timedelta(days=1)
 two_years_back = yesterday - timedelta(days=684)
 
 # %%
-# Filter the DataFrame to this range
+# Filtering the DataFrame to this range
 filtered_df = data[(data['date'] >= two_years_back) & (data['date'] <= yesterday)]
 
 # %%
@@ -171,7 +170,4 @@ print(filtered_df['date'].max())
 # %%
 filtered_df.shape
 
-# %%
-
-
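One thing worth flagging in this file: "import datetime" followed by "from datetime import datetime, timedelta" rebinds the name datetime from the module to the class, so the later calls datetime.timedelta(...) and datetime.datetime.strptime(...) would raise AttributeError at runtime. A minimal disambiguation, assuming both usages should keep working:

    import datetime as dt                      # module: dt.timedelta, dt.datetime
    from datetime import datetime, timedelta   # class/function: datetime.now(), timedelta(days=1)

    yesterday = datetime.now() - timedelta(days=1)           # class usage
    start = dt.datetime.strptime("2022-06-22", "%Y-%m-%d")   # module usage
    delta = dt.timedelta(days=1)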
Stocks news prediction/SML/training_pipeline.py ADDED
@@ -0,0 +1,256 @@
+ # %%
+ # Importing necessary libraries
+ import hopsworks
+ import hsfs
+ from dotenv import load_dotenv
+ import os
+ import pandas as pd
+ import numpy as np
+ import tensorflow as tf  # needed for the LeakyReLU layer used below
+ from sklearn.preprocessing import OneHotEncoder
+ from sklearn.preprocessing import MinMaxScaler
+ from sklearn.preprocessing import StandardScaler
+ from sklearn.metrics import mean_squared_error
+ from hsml.schema import Schema
+ from hsml.model_schema import ModelSchema
+ from tensorflow.keras.models import Sequential
+ from tensorflow.keras.layers import Input, LSTM, Dense, Dropout
+ import joblib
+
+ load_dotenv()
+
+ # Connecting to Hopsworks
+ api_key = os.environ.get('hopsworks_api')
+ project = hopsworks.login(api_key_value=api_key)
+ fs = project.get_feature_store()
+
+ # A second connection through hsfs, used for the feature view below
+ api_key = os.getenv('hopsworks_api')
+ connection = hsfs.connection()
+ fs = connection.get_feature_store()
+
+ # %%
+ # Getting the feature view
+ feature_view = fs.get_feature_view(
+     name='tesla_stocks_fv',
+     version=1
+ )
+
+ # %%
+ # Setting up the train & test split dates
+ train_start = "2022-06-22"
+ train_end = "2023-12-31"
+
+ test_start = '2024-01-01'
+ test_end = "2024-05-03"
+
+ # %%
+ # Creating the train/test split on the feature view with the split dates
+ feature_view.create_train_test_split(
+     train_start=train_start,
+     train_end=train_end,
+     test_start=test_start,
+     test_end=test_end,
+     data_format='csv',
+     coalesce=True,
+     statistics_config={'histogram': True, 'correlations': True})
+
+ # %%
+ # Collecting the split from the feature view
+ X_train, X_test, y_train, y_test = feature_view.get_train_test_split(6)
+
+ # %%
+ # Inspecting X_train
+ X_train
+
+ # %%
+ # Converting the date column to datetime (round-tripping through .dt.date strips the time component)
+ X_train['date'] = pd.to_datetime(X_train['date']).dt.date
+ X_test['date'] = pd.to_datetime(X_test['date']).dt.date
+ X_train['date'] = pd.to_datetime(X_train['date'])
+ X_test['date'] = pd.to_datetime(X_test['date'])
+
+ # %%
+ X_train.head()
+
+ # %%
+ # Extracting the 'ticker' column
+ tickers = X_train[['ticker']]
+
+ # Initializing the OneHotEncoder
+ encoder = OneHotEncoder()
+
+ # Fitting and transforming the 'ticker' column
+ ticker_encoded = encoder.fit_transform(tickers)
+
+ # Converting the encoded column into a DataFrame
+ ticker_encoded_df = pd.DataFrame(ticker_encoded.toarray(), columns=encoder.get_feature_names_out(['ticker']))
+
+ # Concatenating the encoded DataFrame with the original DataFrame
+ X_train = pd.concat([X_train, ticker_encoded_df], axis=1)
+
+ # Dropping the original 'ticker' column
+ X_train.drop('ticker', axis=1, inplace=True)
+
+ # %%
+ # Inspecting X_train after one-hot encoding 'ticker'
+ X_train.head()
+
+ # %%
+ # Doing the same for X_test; note that the encoder fitted on the training data
+ # is reused here (transform, not fit_transform) so train and test share the
+ # same category columns
+ tickers = X_test[['ticker']]
+
+ ticker_encoded_test = encoder.transform(tickers)
+
+ # Converting the encoded column into a DataFrame
+ ticker_encoded_df_test = pd.DataFrame(ticker_encoded_test.toarray(), columns=encoder.get_feature_names_out(['ticker']))
+
+ # Concatenating the encoded DataFrame with the original DataFrame
+ X_test = pd.concat([X_test, ticker_encoded_df_test], axis=1)
+
+ # Dropping the original 'ticker' column
+ X_test.drop('ticker', axis=1, inplace=True)
+
+ # %%
+ # Loading the MinMaxScaler to be used on the target variable 'open'
+ scaler = MinMaxScaler()
+
+ # Fitting and transforming the 'open' column
+ y_train['open_scaled'] = scaler.fit_transform(y_train[['open']])
+ y_train.drop('open', axis=1, inplace=True)
+
+ # %%
+ # Scaling y_test with the scaler fitted on y_train (transform only, to avoid test-set leakage)
+ y_test['open_scaled'] = scaler.transform(y_test[['open']])
+ y_test.drop('open', axis=1, inplace=True)
+
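Because the target is trained on the MinMax-scaled scale, predictions come back in [0, 1] and must be mapped back to dollars before they are reported; a minimal self-contained sketch of that inverse mapping (the toy prices are made up):

    import numpy as np
    from sklearn.preprocessing import MinMaxScaler

    # Toy stand-in for y_train[['open']] and a scaled model prediction
    scaler = MinMaxScaler()
    train_open = np.array([[150.0], [180.0], [210.0]])
    scaler.fit(train_open)
    y_pred_scaled = np.array([[0.25], [0.90]])
    # inverse_transform undoes the MinMax mapping fitted on the training target
    print(scaler.inverse_transform(y_pred_scaled))  # [[165.], [204.]]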
+ # %%
+ # Defining the function that builds the LSTM model
+ def create_model(input_shape,
+                  LSTM_filters=64,
+                  dropout=0.1,
+                  recurrent_dropout=0.1,
+                  dense_dropout=0.5,
+                  activation='relu',
+                  depth=1):
+
+     model = Sequential()
+
+     # Input layer
+     model.add(Input(shape=input_shape))
+
+     if depth > 1:
+         for _ in range(1, depth):
+             # Stacked recurrent layer (returns the full sequence for the next LSTM)
+             model.add(LSTM(LSTM_filters, return_sequences=True, dropout=dropout, recurrent_dropout=recurrent_dropout))
+
+     # Final recurrent layer
+     model.add(LSTM(LSTM_filters, return_sequences=False, dropout=dropout, recurrent_dropout=recurrent_dropout))
+
+     # Fully connected layer
+     if activation == 'relu':
+         model.add(Dense(LSTM_filters, activation='relu'))
+     elif activation == 'leaky_relu':
+         model.add(Dense(LSTM_filters))
+         model.add(tf.keras.layers.LeakyReLU(alpha=0.1))
+
+     # Dropout for regularization
+     model.add(Dropout(dense_dropout))
+
+     # Output layer for predicting one day forward
+     model.add(Dense(1, activation='linear'))
+
+     # Compile the model
+     model.compile(optimizer='adam', loss='mse')
+
+     return model
+
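For instance, a hypothetical two-layer variant with the LeakyReLU head would be built as follows (the (1, 8) input shape is an arbitrary illustration, not the pipeline's actual feature count):

    # depth=2 stacks one extra LSTM layer before the final one
    model = create_model(input_shape=(1, 8), depth=2, activation='leaky_relu')
    model.summary()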
+ # %%
+ # As X_train['date'] exists and is in datetime format, we split it into numeric parts
+ X_train['year'] = X_train['date'].dt.year
+ X_train['month'] = X_train['date'].dt.month
+ X_train['day'] = X_train['date'].dt.day
+
+ # Dropping the original date column
+ X_train.drop(columns=['date'], inplace=True)
+
+ # Converting the DataFrame to a numpy array
+ X_train_array = X_train.to_numpy()
+
+ # %%
+ # Reshaping X_train_array to add a time-step dimension for the LSTM
+ X_train_reshaped = X_train_array.reshape(X_train_array.shape[0], 1, X_train_array.shape[1])
+
+ # X_train_reshaped now has shape (n_samples, 1, n_features)
+ input_shape = X_train_reshaped.shape[1:]
+
+ # Create the model
+ model = create_model(input_shape=input_shape)
+
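With a single time step per sample the LSTM effectively sees no history; if one wanted the network to look back over several days, a sliding-window reshape is the usual alternative. A minimal sketch, not part of the committed pipeline (the array sizes and lookback are arbitrary):

    import numpy as np

    def make_windows(features, targets, lookback=7):
        # Stack 'lookback' consecutive rows as one sample: (n, lookback, n_features)
        X = np.stack([features[i:i + lookback] for i in range(len(features) - lookback)])
        y = targets[lookback:]
        return X, y

    feats = np.random.rand(100, 8)   # hypothetical feature matrix
    targs = np.random.rand(100, 1)   # hypothetical targets
    X_win, y_win = make_windows(feats, targs)
    print(X_win.shape, y_win.shape)  # (93, 7, 8) (93, 1)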
+ # %%
+ # Fitting the model on the training dataset
+ model.fit(X_train_reshaped, y_train)
+
+ # %%
+ # As X_test['date'] exists and is in datetime format, we split it the same way
+ X_test['year'] = X_test['date'].dt.year
+ X_test['month'] = X_test['date'].dt.month
+ X_test['day'] = X_test['date'].dt.day
+
+ # Dropping the original date column
+ X_test.drop(columns=['date'], inplace=True)
+
+ # Converting the DataFrame to a numpy array
+ X_test_array = X_test.to_numpy()
+
+ # Reshaping the array to match the LSTM input shape
+ X_test_array = np.expand_dims(X_test_array, axis=1)
+
+ # %%
+ # Predicting y_pred from X_test
+ y_pred = model.predict(X_test_array)
+
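Note that Keras trains for a single epoch by default, so the fit call above makes exactly one pass over the data; a longer, validated run would look something like this sketch (the epoch count and split fraction are arbitrary choices, not values from the pipeline):

    model.fit(X_train_reshaped, y_train,
              epochs=50,             # multiple passes over the training data
              validation_split=0.1,  # hold out 10% of the training rows for validation
              verbose=1)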
+ # %%
+ # Connecting to the Hopsworks model registry
+ mr = project.get_model_registry()
+
+ # %%
+ # Computing the RMSE metric to register alongside the model
+ # (both y_test and y_pred are on the MinMax-scaled scale here)
+ rmse = np.sqrt(mean_squared_error(y_test, y_pred))
+ rmse_metrics = {"RMSE": rmse}
+ rmse_metrics
+
+ # %%
+ # Setting up the model schema
+ input_schema = Schema(X_train)
+ output_schema = Schema(y_train)
+ model_schema = ModelSchema(input_schema, output_schema)
+
+ # %%
+ # Creating a directory called 'stock_model' and saving the trained network
+ # into it (otherwise the registry upload below would contain no model file)
+ model_dir = "stock_model"
+ if not os.path.isdir(model_dir):
+     os.mkdir(model_dir)
+ model.save(model_dir + "/stock_model.keras")
+
+ # %%
+ # Registering the model in the Hopsworks model registry
+ stock_pred_model = mr.tensorflow.create_model(
+     name="stock_pred_model",
+     metrics=rmse_metrics,
+     model_schema=model_schema,
+     description="Stock Market TSLA Predictor from News Sentiment",
+ )
+
+ stock_pred_model.save(model_dir)
+
TSLA_stock_price.csv β†’ Stocks news prediction/TSLA_stock_price.csv RENAMED
File without changes
news_articles.csv β†’ Stocks news prediction/news_articles.csv RENAMED
File without changes
news_articles_ema.csv β†’ Stocks news prediction/news_articles_ema.csv RENAMED
File without changes
feature_engineering.ipynb DELETED
@@ -1,73 +0,0 @@
- {
-  "cells": [
-   {
-    "cell_type": "code",
-    "execution_count": 1,
-    "metadata": {},
-    "outputs": [],
-    "source": [
-     "import requests\n",
-     "import pandas as pd\n",
-     "import json\n",
-     "import datetime\n",
-     "import numpy as np\n",
-     "from datetime import timedelta "
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 3,
-    "metadata": {},
-    "outputs": [],
-    "source": [
-     "def getNews(api_key,endpoint,ticker,from_date,to_date,num=1000):\n",
-     "    # Set the parameters for the request\n",
-     "    params = {\n",
-     "        \"api_token\": api_key,\n",
-     "        \"s\": ticker,\n",
-     "        \"from\": from_date, \n",
-     "        \"to\": to_date,\n",
-     "        \"limit\": num,\n",
-     "    }\n",
-     "    \n",
-     "    # Make the request to the API\n",
-     "    response = requests.get(endpoint, params=params)\n",
-     "    \n",
-     "    # Print the response from the API\n",
-     "    #print(response.json())\n",
-     "\n",
-     "    # Return a Pandas dataframe from the response\n",
-     "    return pd.DataFrame(response.json())"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": null,
-    "metadata": {},
-    "outputs": [],
-    "source": []
-   }
-  ],
-  "metadata": {
-   "kernelspec": {
-    "display_name": "base",
-    "language": "python",
-    "name": "python3"
-   },
-   "language_info": {
-    "codemirror_mode": {
-     "name": "ipython",
-     "version": 3
-    },
-    "file_extension": ".py",
-    "mimetype": "text/x-python",
-    "name": "python",
-    "nbconvert_exporter": "python",
-    "pygments_lexer": "ipython3",
-    "version": "3.11.9"
-   },
-   "orig_nbformat": 4
-  },
-  "nbformat": 4,
-  "nbformat_minor": 2
- }
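The deleted getNews helper above is a thin wrapper around a news REST endpoint; a hypothetical call would look like the sketch below (the endpoint URL is a placeholder, and the api_token / s parameter names are whatever the chosen provider expects):

    news_df = getNews(api_key="YOUR_KEY",
                      endpoint="https://example.com/api/news",  # placeholder endpoint
                      ticker="TSLA",
                      from_date="2024-01-01",
                      to_date="2024-01-31",
                      num=100)
    print(news_df.head())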
feature_pipeline.ipynb DELETED
@@ -1,775 +0,0 @@
- {
-  "cells": [
-   {
-    "cell_type": "code",
-    "execution_count": 1,
-    "metadata": {},
-    "outputs": [],
-    "source": [
-     "from dotenv import load_dotenv\n",
-     "import os "
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 2,
-    "metadata": {},
-    "outputs": [
-     {
-      "name": "stdout",
-      "output_type": "stream",
-      "text": [
-       "[pip output omitted: all great_expectations==0.18.12 dependencies were already satisfied]\n"
-      ]
-     }
-    ],
-    "source": [
-     "!pip install great_expectations==0.18.12"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 3,
-    "metadata": {},
-    "outputs": [],
-    "source": [
-     "# Import necessary libraries\n",
-     "import pandas as pd # For data manipulation using DataFrames\n",
-     "import numpy as np # For numerical operations\n",
-     "import matplotlib.pyplot as plt # For data visualization\n",
-     "import os # For operating system-related tasks\n",
-     "import joblib # For saving and loading models\n",
-     "import hopsworks # For getting access to hopsworks\n",
-     "\n",
-     "\n",
-     "\n",
-     "# Import specific modules from scikit-learn\n",
-     "from sklearn.preprocessing import StandardScaler, OneHotEncoder # For data preprocessing\n",
-     "from sklearn.metrics import accuracy_score # For evaluating model accuracy"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 4,
-    "metadata": {},
-    "outputs": [
-     {
-      "name": "stdout",
-      "output_type": "stream",
-      "text": [
-       "[pip output omitted: all modal dependencies were already satisfied]\n"
-      ]
-     }
-    ],
-    "source": [
-     "!pip install modal"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 5,
-    "metadata": {},
-    "outputs": [],
-    "source": [
-     "#from alpha_vantage.timeseries import TimeSeries\n",
-     "#import pandas as pd\n",
-     "\n",
-     "#load_dotenv()\n",
-     "\n",
-     "#api_key = os.environ.get('stocks_api') # Replace this with your actual API key\n",
-     "#ts = TimeSeries(key=api_key, output_format='pandas')\n",
-     "\n",
-     "# Fetch daily adjusted stock prices; adjust the symbol as needed\n",
-     "#data, meta_data = ts.get_daily(symbol='TSLA', outputsize='full')\n",
-     "\n",
-     "#print(data.head())"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 6,
-    "metadata": {},
-    "outputs": [],
-    "source": [
-     "#data.info()"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 7,
-    "metadata": {},
-    "outputs": [],
-    "source": [
-     "#meta_data"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 8,
-    "metadata": {},
-    "outputs": [],
-    "source": [
-     "# Define your file path and name\n",
-     "#file_path = 'TSLA_stock_price.csv' # Customize the path and filename\n",
-     "\n",
-     "# Save the DataFrame to CSV\n",
-     "#stock_data.to_csv(file_path)\n",
-     "\n",
-     "#print(f\"Data saved to {file_path}\")\n"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 9,
-    "metadata": {},
-    "outputs": [
-     {
-      "name": "stdout",
-      "output_type": "stream",
-      "text": [
-       "         date  1. open  2. high    3. low  4. close    5. volume ticker\n",
-       "0  2024-05-03   182.10   184.78  178.4200    181.19   75491539.0   TSLA\n",
-       "1  2024-05-02   182.86   184.60  176.0200    180.01   89148041.0   TSLA\n",
-       "2  2024-05-01   182.00   185.86  179.0100    179.99   92829719.0   TSLA\n",
-       "3  2024-04-30   186.98   190.95  182.8401    183.28  127031787.0   TSLA\n",
-       "4  2024-04-29   188.42   198.87  184.5400    194.05  243869678.0   TSLA\n"
-      ]
-     }
-    ],
-    "source": [
-     "# Load and display the data from CSV to confirm\n",
-     "tsla_df = pd.read_csv('TSLA_stock_price.csv')\n",
-     "print(tsla_df.head())\n",
-     " "
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 10,
-    "metadata": {},
-    "outputs": [
-     {
-      "name": "stdout",
-      "output_type": "stream",
-      "text": [
-       "Connected. Call `.close()` to terminate connection gracefully.\n",
-       "\n",
-       "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/693399\n",
-       "Connected. Call `.close()` to terminate connection gracefully.\n"
-      ]
-     }
-    ],
-    "source": [
-     "api_key = os.environ.get('hopsworks_api')\n",
-     "project = hopsworks.login(api_key_value=api_key)\n",
-     "fs = project.get_feature_store()"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 11,
-    "metadata": {},
-    "outputs": [],
-    "source": [
-     "import re "
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 12,
-    "metadata": {},
-    "outputs": [],
-    "source": [
-     "def clean_column_name(name):\n",
-     "    # Remove all non-letter characters\n",
-     "    cleaned_name = re.sub(r'[^a-zA-Z]', '', name)\n",
-     "    return cleaned_name\n"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 13,
-    "metadata": {},
-    "outputs": [
-     {
-      "data": {
-       "text/html": [
-        "<div>\n",
-        "<style scoped>\n",
-        "    .dataframe tbody tr th:only-of-type {\n",
-        "        vertical-align: middle;\n",
-        "    }\n",
-        "\n",
-        "    .dataframe tbody tr th {\n",
-        "        vertical-align: top;\n",
-        "    }\n",
-        "\n",
-        "    .dataframe thead th {\n",
-        "        text-align: right;\n",
-        "    }\n",
-        "</style>\n",
-        "<table border=\"1\" class=\"dataframe\">\n",
-        "  <thead>\n",
-        "    <tr style=\"text-align: right;\">\n",
380
- " <th></th>\n",
381
- " <th>date</th>\n",
382
- " <th>1. open</th>\n",
383
- " <th>2. high</th>\n",
384
- " <th>3. low</th>\n",
385
- " <th>4. close</th>\n",
386
- " <th>5. volume</th>\n",
387
- " <th>ticker</th>\n",
388
- " </tr>\n",
389
- " </thead>\n",
390
- " <tbody>\n",
391
- " <tr>\n",
392
- " <th>0</th>\n",
393
- " <td>2024-05-03</td>\n",
394
- " <td>182.10</td>\n",
395
- " <td>184.7800</td>\n",
396
- " <td>178.4200</td>\n",
397
- " <td>181.19</td>\n",
398
- " <td>75491539.0</td>\n",
399
- " <td>TSLA</td>\n",
400
- " </tr>\n",
401
- " <tr>\n",
402
- " <th>1</th>\n",
403
- " <td>2024-05-02</td>\n",
404
- " <td>182.86</td>\n",
405
- " <td>184.6000</td>\n",
406
- " <td>176.0200</td>\n",
407
- " <td>180.01</td>\n",
408
- " <td>89148041.0</td>\n",
409
- " <td>TSLA</td>\n",
410
- " </tr>\n",
411
- " <tr>\n",
412
- " <th>2</th>\n",
413
- " <td>2024-05-01</td>\n",
414
- " <td>182.00</td>\n",
415
- " <td>185.8600</td>\n",
416
- " <td>179.0100</td>\n",
417
- " <td>179.99</td>\n",
418
- " <td>92829719.0</td>\n",
419
- " <td>TSLA</td>\n",
420
- " </tr>\n",
421
- " <tr>\n",
422
- " <th>3</th>\n",
423
- " <td>2024-04-30</td>\n",
424
- " <td>186.98</td>\n",
425
- " <td>190.9500</td>\n",
426
- " <td>182.8401</td>\n",
427
- " <td>183.28</td>\n",
428
- " <td>127031787.0</td>\n",
429
- " <td>TSLA</td>\n",
430
- " </tr>\n",
431
- " <tr>\n",
432
- " <th>4</th>\n",
433
- " <td>2024-04-29</td>\n",
434
- " <td>188.42</td>\n",
435
- " <td>198.8700</td>\n",
436
- " <td>184.5400</td>\n",
437
- " <td>194.05</td>\n",
438
- " <td>243869678.0</td>\n",
439
- " <td>TSLA</td>\n",
440
- " </tr>\n",
441
- " <tr>\n",
442
- " <th>...</th>\n",
443
- " <td>...</td>\n",
444
- " <td>...</td>\n",
445
- " <td>...</td>\n",
446
- " <td>...</td>\n",
447
- " <td>...</td>\n",
448
- " <td>...</td>\n",
449
- " <td>...</td>\n",
450
- " </tr>\n",
451
- " <tr>\n",
452
- " <th>3481</th>\n",
453
- " <td>2010-07-06</td>\n",
454
- " <td>20.00</td>\n",
455
- " <td>20.0000</td>\n",
456
- " <td>15.8300</td>\n",
457
- " <td>16.11</td>\n",
458
- " <td>6866900.0</td>\n",
459
- " <td>TSLA</td>\n",
460
- " </tr>\n",
461
- " <tr>\n",
462
- " <th>3482</th>\n",
463
- " <td>2010-07-02</td>\n",
464
- " <td>23.00</td>\n",
465
- " <td>23.1000</td>\n",
466
- " <td>18.7100</td>\n",
467
- " <td>19.20</td>\n",
468
- " <td>5139800.0</td>\n",
469
- " <td>TSLA</td>\n",
470
- " </tr>\n",
471
- " <tr>\n",
472
- " <th>3483</th>\n",
473
- " <td>2010-07-01</td>\n",
474
- " <td>25.00</td>\n",
475
- " <td>25.9200</td>\n",
476
- " <td>20.2700</td>\n",
477
- " <td>21.96</td>\n",
478
- " <td>8218800.0</td>\n",
479
- " <td>TSLA</td>\n",
480
- " </tr>\n",
481
- " <tr>\n",
482
- " <th>3484</th>\n",
483
- " <td>2010-06-30</td>\n",
484
- " <td>25.79</td>\n",
485
- " <td>30.4192</td>\n",
486
- " <td>23.3000</td>\n",
487
- " <td>23.83</td>\n",
488
- " <td>17187100.0</td>\n",
489
- " <td>TSLA</td>\n",
490
- " </tr>\n",
491
- " <tr>\n",
492
- " <th>3485</th>\n",
493
- " <td>2010-06-29</td>\n",
494
- " <td>19.00</td>\n",
495
- " <td>25.0000</td>\n",
496
- " <td>17.5400</td>\n",
497
- " <td>23.89</td>\n",
498
- " <td>18766300.0</td>\n",
499
- " <td>TSLA</td>\n",
500
- " </tr>\n",
501
- " </tbody>\n",
502
- "</table>\n",
503
- "<p>3486 rows Γ— 7 columns</p>\n",
504
- "</div>"
505
- ],
506
- "text/plain": [
507
- " date 1. open 2. high 3. low 4. close 5. volume ticker\n",
508
- "0 2024-05-03 182.10 184.7800 178.4200 181.19 75491539.0 TSLA\n",
509
- "1 2024-05-02 182.86 184.6000 176.0200 180.01 89148041.0 TSLA\n",
510
- "2 2024-05-01 182.00 185.8600 179.0100 179.99 92829719.0 TSLA\n",
511
- "3 2024-04-30 186.98 190.9500 182.8401 183.28 127031787.0 TSLA\n",
512
- "4 2024-04-29 188.42 198.8700 184.5400 194.05 243869678.0 TSLA\n",
513
- "... ... ... ... ... ... ... ...\n",
514
- "3481 2010-07-06 20.00 20.0000 15.8300 16.11 6866900.0 TSLA\n",
515
- "3482 2010-07-02 23.00 23.1000 18.7100 19.20 5139800.0 TSLA\n",
516
- "3483 2010-07-01 25.00 25.9200 20.2700 21.96 8218800.0 TSLA\n",
517
- "3484 2010-06-30 25.79 30.4192 23.3000 23.83 17187100.0 TSLA\n",
518
- "3485 2010-06-29 19.00 25.0000 17.5400 23.89 18766300.0 TSLA\n",
519
- "\n",
520
- "[3486 rows x 7 columns]"
521
- ]
522
- },
523
- "execution_count": 13,
524
- "metadata": {},
525
- "output_type": "execute_result"
526
- }
527
- ],
528
- "source": [
529
- "tsla_df"
530
- ]
531
- },
532
- {
533
- "cell_type": "code",
534
- "execution_count": 14,
535
- "metadata": {},
536
- "outputs": [],
537
- "source": [
538
- "# Assuming 'tsla_df' is your DataFrame\n",
539
- "tsla_df.columns = [clean_column_name(col) for col in tsla_df.columns]\n"
540
- ]
541
- },
542
- {
543
- "cell_type": "code",
544
- "execution_count": 15,
545
- "metadata": {},
546
- "outputs": [
547
- {
548
- "name": "stdout",
549
- "output_type": "stream",
550
- "text": [
551
- "Index(['date', 'open', 'high', 'low', 'close', 'volume', 'ticker'], dtype='object')\n"
552
- ]
553
- }
554
- ],
555
- "source": [
556
- "print(tsla_df.columns)\n"
557
- ]
558
- },
559
- {
560
- "cell_type": "code",
561
- "execution_count": 16,
562
- "metadata": {},
563
- "outputs": [],
564
- "source": [
565
- "import pandas as pd\n",
566
- "\n",
567
- "# Assuming tsla_df is your pandas DataFrame\n",
568
- "# Convert the \"date\" column to timestamp\n",
569
- "tsla_df['date'] = pd.to_datetime(tsla_df['date'])\n"
570
- ]
571
- },
572
- {
573
- "cell_type": "code",
574
- "execution_count": 17,
575
- "metadata": {},
576
- "outputs": [
577
- {
578
- "name": "stdout",
579
- "output_type": "stream",
580
- "text": [
581
- "2024-05-06 13:43:00,985 WARNING: DeprecationWarning: Providing event_time as a single-element list is deprecated and will be dropped in future versions. Provide the feature_name string instead.\n",
582
- "\n"
583
- ]
584
- }
585
- ],
586
- "source": [
587
- "# Define a feature group\n",
588
- "tesla_fg = fs.get_or_create_feature_group(\n",
589
- " name=\"tesla_stock\",\n",
590
- " description=\"Tesla stock dataset from alpha vantage\",\n",
591
- " version=1,\n",
592
- " primary_key=[\"ticker\"],\n",
593
- " event_time=['date'],\n",
594
- " online_enabled=False,\n",
595
- ")"
596
- ]
597
- },
598
- {
599
- "cell_type": "code",
600
- "execution_count": 18,
601
- "metadata": {},
602
- "outputs": [
603
- {
604
- "name": "stdout",
605
- "output_type": "stream",
606
- "text": [
607
- "Feature Group created successfully, explore it at \n",
608
- "https://c.app.hopsworks.ai:443/p/693399/fs/689222/fg/786781\n"
609
- ]
610
- },
611
- {
612
- "data": {
613
- "application/vnd.jupyter.widget-view+json": {
614
- "model_id": "b3248b9d522a467db9ce202ef5815fe9",
615
- "version_major": 2,
616
- "version_minor": 0
617
- },
618
- "text/plain": [
619
- "Uploading Dataframe: 0.00% | | Rows 0/3486 | Elapsed Time: 00:00 | Remaining Time: ?"
620
- ]
621
- },
622
- "metadata": {},
623
- "output_type": "display_data"
624
- },
625
- {
626
- "name": "stdout",
627
- "output_type": "stream",
628
- "text": [
629
- "Launching job: tesla_stock_1_offline_fg_materialization\n",
630
- "Job started successfully, you can follow the progress at \n",
631
- "https://c.app.hopsworks.ai/p/693399/jobs/named/tesla_stock_1_offline_fg_materialization/executions\n"
632
- ]
633
- },
634
- {
635
- "data": {
636
- "text/plain": [
637
- "(<hsfs.core.job.Job at 0x19cffe27490>, None)"
638
- ]
639
- },
640
- "execution_count": 18,
641
- "metadata": {},
642
- "output_type": "execute_result"
643
- }
644
- ],
645
- "source": [
646
- "tesla_fg.insert(tsla_df, write_options={\"wait_for_job\" : False})"
647
- ]
648
- },
649
- {
650
- "cell_type": "code",
651
- "execution_count": 19,
652
- "metadata": {},
653
- "outputs": [],
654
- "source": [
655
- "news_df = pd.read_csv('news_articles_ema.csv')\n"
656
- ]
657
- },
658
- {
659
- "cell_type": "code",
660
- "execution_count": 20,
661
- "metadata": {},
662
- "outputs": [],
663
- "source": [
664
- "news_df_updated = news_df.drop(columns=['exp_mean_7_days'])"
665
- ]
666
- },
667
- {
668
- "cell_type": "code",
669
- "execution_count": 21,
670
- "metadata": {},
671
- "outputs": [],
672
- "source": [
673
- "news_df_updated['date'] = pd.to_datetime(news_df_updated['date'])"
674
- ]
675
- },
676
- {
677
- "cell_type": "code",
678
- "execution_count": 22,
679
- "metadata": {},
680
- "outputs": [
681
- {
682
- "name": "stdout",
683
- "output_type": "stream",
684
- "text": [
685
- "2024-05-06 13:43:12,343 WARNING: DeprecationWarning: Providing event_time as a single-element list is deprecated and will be dropped in future versions. Provide the feature_name string instead.\n",
686
- "\n"
687
- ]
688
- }
689
- ],
690
- "source": [
691
- "news_sentiment_fg = fs.get_or_create_feature_group(\n",
692
- " name='news_sentiment_updated',\n",
693
- " description='News sentiment from Polygon',\n",
694
- " version=1,\n",
695
- " primary_key=['ticker'],\n",
696
- " event_time=['date'],\n",
697
- " online_enabled=False,\n",
698
- ")"
699
- ]
700
- },
701
- {
702
- "cell_type": "code",
703
- "execution_count": 23,
704
- "metadata": {},
705
- "outputs": [
706
- {
707
- "name": "stdout",
708
- "output_type": "stream",
709
- "text": [
710
- "Feature Group created successfully, explore it at \n",
711
- "https://c.app.hopsworks.ai:443/p/693399/fs/689222/fg/787796\n"
712
- ]
713
- },
714
- {
715
- "data": {
716
- "application/vnd.jupyter.widget-view+json": {
717
- "model_id": "524bb5481c34441ba708a4c14edac44b",
718
- "version_major": 2,
719
- "version_minor": 0
720
- },
721
- "text/plain": [
722
- "Uploading Dataframe: 0.00% | | Rows 0/66 | Elapsed Time: 00:00 | Remaining Time: ?"
723
- ]
724
- },
725
- "metadata": {},
726
- "output_type": "display_data"
727
- },
728
- {
729
- "name": "stdout",
730
- "output_type": "stream",
731
- "text": [
732
- "Launching job: news_sentiment_updated_1_offline_fg_materialization\n",
733
- "Job started successfully, you can follow the progress at \n",
734
- "https://c.app.hopsworks.ai/p/693399/jobs/named/news_sentiment_updated_1_offline_fg_materialization/executions\n"
735
- ]
736
- },
737
- {
738
- "data": {
739
- "text/plain": [
740
- "(<hsfs.core.job.Job at 0x19c811c2e90>, None)"
741
- ]
742
- },
743
- "execution_count": 23,
744
- "metadata": {},
745
- "output_type": "execute_result"
746
- }
747
- ],
748
- "source": [
749
- "news_sentiment_fg.insert(news_df_updated)"
750
- ]
751
- }
752
- ],
753
- "metadata": {
754
- "kernelspec": {
755
- "display_name": "base",
756
- "language": "python",
757
- "name": "python3"
758
- },
759
- "language_info": {
760
- "codemirror_mode": {
761
- "name": "ipython",
762
- "version": 3
763
- },
764
- "file_extension": ".py",
765
- "mimetype": "text/x-python",
766
- "name": "python",
767
- "nbconvert_exporter": "python",
768
- "pygments_lexer": "ipython3",
769
- "version": "3.11.9"
770
- },
771
- "orig_nbformat": 4
772
- },
773
- "nbformat": 4,
774
- "nbformat_minor": 2
775
- }
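The notebook deleted above now lives at `Stocks news prediction/SML/feature_pipeline.py` (see the file list at the top of this commit). Its core cleanup reduces Alpha Vantage headers such as `1. open` to plain `open` before the frame reaches the feature store; a minimal sketch, assuming `TSLA_stock_price.csv` sits in the working directory:

```python
import re
import pandas as pd

def clean_column_name(name: str) -> str:
    # Drop every character that is not a letter, e.g. "1. open" -> "open"
    return re.sub(r'[^a-zA-Z]', '', name)

tsla_df = pd.read_csv('TSLA_stock_price.csv')               # columns: date, 1. open, ...
tsla_df.columns = [clean_column_name(c) for c in tsla_df.columns]
tsla_df['date'] = pd.to_datetime(tsla_df['date'])           # event-time column as timestamps
```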
 
 
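The cell outputs above log a `DeprecationWarning` about passing `event_time` as a single-element list. A sketch of the same feature-group definition with that warning addressed, assuming the `hopsworks_api` environment variable is set as in the notebook:

```python
import os
import hopsworks

project = hopsworks.login(api_key_value=os.environ.get('hopsworks_api'))
fs = project.get_feature_store()

tesla_fg = fs.get_or_create_feature_group(
    name="tesla_stock",
    description="Tesla stock dataset from alpha vantage",
    version=1,
    primary_key=["ticker"],
    event_time="date",          # plain string instead of ['date'] silences the warning
    online_enabled=False,
)
```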
 
 
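The news-sentiment branch follows the same pattern; a standalone sketch of loading `news_articles_ema.csv`, dropping the precomputed EMA column as the deleted cells did, and inserting into the existing feature group:

```python
import os
import pandas as pd
import hopsworks

project = hopsworks.login(api_key_value=os.environ.get('hopsworks_api'))
fs = project.get_feature_store()
news_sentiment_fg = fs.get_feature_group('news_sentiment_updated', version=1)

news_df = pd.read_csv('news_articles_ema.csv')        # CSV committed in this repo
news_df = news_df.drop(columns=['exp_mean_7_days'])   # dropped, as in the deleted cell
news_df['date'] = pd.to_datetime(news_df['date'])     # event-time must be a timestamp
news_sentiment_fg.insert(news_df)                     # launches the materialization job
```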
feature_view_freddie.py DELETED
@@ -1,95 +0,0 @@
- # %%
- # Import necessary libraries
- import pandas as pd # For data manipulation using DataFrames
- import numpy as np # For numerical operations
- import matplotlib.pyplot as plt # For data visualization
- import os # For operating system-related tasks
- import joblib # For saving and loading models
- import hopsworks # For getting access to hopsworks
-
-
-
- # Import specific modules from scikit-learn
- from sklearn.preprocessing import StandardScaler, OneHotEncoder # For data preprocessing
- from sklearn.metrics import accuracy_score # For evaluating model accuracy
-
- # %%
- from feature_pipeline import tesla_fg
- from feature_pipeline import news_sentiment_fg
-
- # %%
- from dotenv import load_dotenv
- import os
-
- load_dotenv()
-
- # %%
- api_key = os.environ.get('hopsworks_api')
- project = hopsworks.login(api_key_value=api_key)
- fs = project.get_feature_store()
-
- # %%
- def create_stocks_feature_view(fs, version):
-
-     # Loading in the feature groups
-     tesla_fg = fs.get_feature_group('tesla_stock', version=1)
-     news_sentiment_fg = fs.get_feature_group('news_sentiment_updated', version=1)
-
-     # Define the query
-     ds_query = tesla_fg.select(['date', 'open', 'ticker'])\
-         .join(news_sentiment_fg.select(['date','sentiment']))
-
-     # Create the feature view
-     feature_view = fs.create_feature_view(
-         name='tesla_stocks_fv',
-         query=ds_query,
-         labels=['ticker']
-     )
-
-     return feature_view, tesla_fg
-
- # %%
- try:
-     feature_view = fs.get_feature_view("tesla_stocks_fv", version=1)
-     tesla_fg = fs.get_feature_group('tesla_stock', version=1)
- except:
-     feature_view, tesla_fg = create_stocks_feature_view(fs, 1)
-
- # %%
- def fix_data_from_feature_view(df,start_date,end_date):
-     df = df.sort_values("date")
-     df = df.reset_index()
-     df = df.drop(columns=["index"])
-
-     # Create a boolean mask for rows that fall within the date range
-     mask = (pd.to_datetime(df['date']) >= pd.to_datetime(start_date)) & (pd.to_datetime(df['date']) <= pd.to_datetime(end_date))
-     len_df = np.shape(df)
-     df = df[mask] # Use the boolean mask to filter the DataFrame
-     print('From shape {} to {} after cropping to given date range: {} to {}'.format(len_df,np.shape(df),start_date,end_date))
-
-     return df
-
- # %%
- #def create_stocks_feature_view(fs, version):
-
-     #Loading in the feature groups
- #    tesla_fg = fs.get_feature_group('tesla_stock', version = 3)
- #    news_sentiment_fg = fs.get_feature_group('news_sentiment_updated', version = 2)
-
- #    ds_query = tesla_fg.select(['date','open', 'ticker'])\
- #        .join(news_sentiment_fg.select_except(['ticker','time', 'amp_url', 'image_url']))
-
- #    return (fs.create_tesla_feature_view(
- #        name = 'tsla_stocks_fv',
- #        query = ds_query,
- #        labels=['ticker']
- #    ), tesla_fg)
-
- # %%
- #try:
- #    feature_view = fs.get_feature_view("tsla_stocks_fv", version=1)
- #    tesla_fg = fs.get_feature_group('tesla_stock', version=3)
- #except:
- #    feature_view, tesla_fg = create_stocks_feature_view(fs, 1)
-
-
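The get-or-create logic in the removed script moved into `Stocks news prediction/SML/feature_view.py`. One way to express it with a narrower exception than the bare `except:` above (a sketch, not the moved file's exact contents):

```python
# Fetch the feature view if it exists; otherwise build it from the two
# feature groups, joining stock prices to daily news sentiment on date.
def get_or_create_stocks_feature_view(fs, version=1):
    try:
        return fs.get_feature_view("tesla_stocks_fv", version=version)
    except Exception:
        tesla_fg = fs.get_feature_group('tesla_stock', version=1)
        news_fg = fs.get_feature_group('news_sentiment_updated', version=1)
        query = tesla_fg.select(['date', 'open', 'ticker']) \
                        .join(news_fg.select(['date', 'sentiment']))
        return fs.create_feature_view(
            name='tesla_stocks_fv',
            query=query,
            labels=['ticker'],
        )
```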
 
 
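`fix_data_from_feature_view` crops a frame to an inclusive date window with a boolean mask; a self-contained usage sketch with toy rows (values are illustrative only):

```python
import pandas as pd

df = pd.DataFrame({'date': ['2023-01-02', '2022-12-30', '2023-02-01'],
                   'open': [108.1, 123.2, 181.4]})
df = df.sort_values('date').reset_index(drop=True)

# Keep only rows inside the inclusive [start, end] window, as the helper does.
dates = pd.to_datetime(df['date'])
mask = (dates >= pd.to_datetime('2023-01-01')) & (dates <= pd.to_datetime('2023-01-31'))
print(df[mask])   # only the 2023-01-02 row survives
```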
 
historical_stock.ipynb DELETED
@@ -1,257 +0,0 @@
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [
- {
- "ename": "ModuleNotFoundError",
- "evalue": "No module named 'modal'",
- "output_type": "error",
- "traceback": [
- "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
- "Cell \u001b[1;32mIn[1], line 7\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mhopsworks\u001b[39;00m\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mre\u001b[39;00m \n\u001b[1;32m----> 7\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mmodal\u001b[39;00m \n\u001b[0;32m 8\u001b[0m \u001b[38;5;66;03m#prepocessing\u001b[39;00m\n\u001b[0;32m 9\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mrequests\u001b[39;00m\n",
- "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'modal'"
- ]
- }
- ],
- "source": [
- "from dotenv import load_dotenv\n",
- "import os \n",
- "from alpha_vantage.timeseries import TimeSeries\n",
- "import pandas as pd\n",
- "import hopsworks\n",
- "import re \n",
- "import modal \n",
- "#prepocessing\n",
- "import requests\n",
- "import pandas as pd\n",
- "import json\n",
- "#import pandas_market_calendars as mcal\n",
- "import datetime\n",
- "import numpy as np\n",
- "from datetime import timedelta \n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " 1. open 2. high 3. low 4. close 5. volume ticker\n",
- "date \n",
- "2024-05-03 182.10 184.78 178.4200 181.19 75491539.0 TSLA\n",
- "2024-05-02 182.86 184.60 176.0200 180.01 89148041.0 TSLA\n",
- "2024-05-01 182.00 185.86 179.0100 179.99 92829719.0 TSLA\n",
- "2024-04-30 186.98 190.95 182.8401 183.28 127031787.0 TSLA\n",
- "2024-04-29 188.42 198.87 184.5400 194.05 243869678.0 TSLA\n"
- ]
- }
- ],
- "source": [
- "\n",
- "\n",
- "load_dotenv()\n",
- "\n",
- "api_key = os.environ.get('stocks_api') # Replace this with your actual API key\n",
- "ts = TimeSeries(key=api_key, output_format='pandas')\n",
- "\n",
- "def fetch_stock_prices(symbol):\n",
- "    # Fetch daily adjusted stock prices; adjust the symbol as needed\n",
- "    data, meta_data = ts.get_daily(symbol=symbol, outputsize='full')\n",
- "    \n",
- "    # Add a new column named 'ticker' and fill it with the ticker name\n",
- "    data['ticker'] = symbol\n",
- "    \n",
- "    return data\n",
- "\n",
- "# Example usage\n",
- "symbol = 'TSLA'\n",
- "stock_data = fetch_stock_prices(symbol)\n",
- "print(stock_data.head())\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "def create_tsla_history():\n",
- "\n",
- "    start_date = datetime.datetime.strptime('2015-07-16',\"%Y-%m-%d\")\n",
- "    end_date = datetime.datetime.strptime('2023-01-05',\"%Y-%m-%d\")\n",
- "\n",
- "    # Get the TSLA stock data from yfinance\n",
- "    tsla = Ticker(\"TSLA\")\n",
- "    # info = tsla.info\n",
- "\n",
- "    # get historical market data\n",
- "    data = tsla.history(start=start_date, end=end_date)\n",
- "\n",
- "    # drop some columns\n",
- "    tesla_df = data.drop(columns=['Dividends','Stock Splits'])\n",
- "    tesla_df.index = tesla_df.index.strftime('%Y-%m-%d')\n",
- "    \n",
- "    print('Number of business days included in data set: ',np.shape(tesla_df))\n",
- "\n",
- "    # Create an array of all dates in the specified period\n",
- "    all_dates = np.array([start_date + datetime.timedelta(days=i) for i in range((end_date - start_date).days)])\n",
- "    all_dates = [d.strftime('%Y-%m-%d') for d in all_dates]\n",
- "\n",
- "    # Use setdiff1d() to find the non-business days\n",
- "    isBusinessDay, _ = extract_business_day(start_date='2015-07-16',end_date='2023-01-04')\n",
- "    non_business_days = np.setdiff1d(all_dates, isBusinessDay)\n",
- "\n",
- "    # Add nan-values to the non-business days\n",
- "    print('Add {} non business days with NaN-values'.format(len(non_business_days)))\n",
- "    for d in non_business_days:\n",
- "        tesla_df.loc[d,:] = [np.nan,np.nan,np.nan,np.nan,np.nan]\n",
- "\n",
- "    # sort index (dates)\n",
- "    tesla_df = tesla_df.sort_index()\n",
- "    \n",
- "    # move \"date\"-index into its own column\n",
- "    tesla_df = tesla_df.reset_index()\n",
- "    \n",
- "    # Rename column 'Date' to 'date'\n",
- "    tesla_df = tesla_df.rename(columns={'Date': 'date'})\n",
- "    print('Final size of dataframe',np.shape(tesla_df))\n",
- "    \n",
- "    # Write the merged dataframe to a CSV file\n",
- "    start_date ='2022-04-01'\n",
- "    end_date = '2024-04-01'\n",
- "    save_path = \"data/stock/tesla_{}-{}.csv\".format(start_date,end_date)\n",
- "    \n",
- "    print('Save at :',save_path)\n",
- "    tesla_df.to_csv(save_path, index=False)\n",
- "    \n",
- "    return tesla_df"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "def extract_business_day(start_date,end_date):\n",
- "    \"\"\"\n",
- "    Given a start_date and end_date.\n",
- "    \n",
- "    `Returns`:\n",
- "    \n",
- "    isBusinessDay: list of str (with all dates being business days)\n",
- "    is_open: boolean list\n",
- "    e.g is_open = [1,0,...,1] means that start_date = open, day after start_date = closed, and end_date = open\n",
- "    \"\"\"\n",
- "    \n",
- "    # Save for later\n",
- "    end_date_save = end_date\n",
- "    \n",
- "    # Get the NYSE calendar\n",
- "    cal = mcal.get_calendar('NYSE')\n",
- "\n",
- "    # Get the NYSE calendar's open and close times for the specified period\n",
- "    schedule = cal.schedule(start_date=start_date, end_date=end_date)\n",
- "    \n",
- "    # Only need a list of dates when it's open (not open and close times)\n",
- "    isBusinessDay = np.array(schedule.market_open.dt.strftime('%Y-%m-%d')) \n",
- "    \n",
- "    # Go over all days: \n",
- "    delta = datetime.timedelta(days=1)\n",
- "    start_date = datetime.datetime.strptime(start_date,\"%Y-%m-%d\") #datetime.date(2015, 7, 16)\n",
- "    end_date = datetime.datetime.strptime(end_date,\"%Y-%m-%d\") #datetime.date(2023, 1, 4)\n",
- "    \n",
- "    # Extract days from the timedelta object\n",
- "    num_days = (end_date - start_date).days + 1\n",
- "    \n",
- "    # Create boolean array for days being open (1) and closed (0) \n",
- "    is_open = np.zeros(num_days)\n",
- "    \n",
- "    # iterate over range of dates\n",
- "    current_BusinessDay = isBusinessDay[0]\n",
- "    count_dates = 0\n",
- "    next_BusinessDay = 0\n",
- "    \n",
- "    while (start_date <= end_date):\n",
- "    \n",
- "        if start_date.strftime('%Y-%m-%d') == current_BusinessDay:\n",
- "            is_open[count_dates] = True\n",
- "\n",
- "            if current_BusinessDay == end_date_save or current_BusinessDay==isBusinessDay[-1]:\n",
- "                break\n",
- "            else:\n",
- "                next_BusinessDay += 1\n",
- "                current_BusinessDay = isBusinessDay[next_BusinessDay]\n",
- "        else:\n",
- "            is_open[count_dates] = False\n",
- "\n",
- "        count_dates += 1 \n",
- "        start_date += delta\n",
- "    \n",
- "    print(np.shape(is_open))\n",
- "    \n",
- "    return isBusinessDay, is_open"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Data saved to TSLA_stock_price.csv\n"
- ]
- }
- ],
- "source": [
- "# Define your file path and name\n",
- "file_path = 'TSLA_stock_price.csv' # Customize the path and filename\n",
- "\n",
- "# Save the DataFrame to CSV\n",
- "stock_data.to_csv(file_path)\n",
- "\n",
- "print(f\"Data saved to {file_path}\")"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "base",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.9"
- },
- "orig_nbformat": 4
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
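The `extract_business_day` helper above walks the range day by day to flag trading days. With `pandas_market_calendars` (now pinned in requirements.txt) the same flags fall out of the exchange schedule directly; a sketch over the notebook's own date range:

```python
import numpy as np
import pandas as pd
import pandas_market_calendars as mcal

start, end = '2015-07-16', '2023-01-04'

# NYSE schedule: one row per trading day, indexed by date.
schedule = mcal.get_calendar('NYSE').schedule(start_date=start, end_date=end)
business_days = schedule.index.strftime('%Y-%m-%d').to_numpy()

# Membership test replaces the manual while-loop over calendar days.
all_days = pd.date_range(start, end, freq='D').strftime('%Y-%m-%d').to_numpy()
is_open = np.isin(all_days, business_days).astype(int)   # 1 = trading day, 0 = closed
```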
 
 
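The NaN padding for non-trading days in `create_tsla_history` can likewise collapse into a single `reindex`; a sketch that assumes `tesla_df` (indexed work frame from that function) and the `start`/`end` strings from the snippet above:

```python
import pandas as pd

# Reindexing to the full calendar inserts NaN rows for every non-trading
# date, replacing the explicit loop over non_business_days.
full_index = pd.date_range(start, end, freq='D').strftime('%Y-%m-%d')
tesla_df = (tesla_df.set_index('date')
                    .reindex(full_index)
                    .rename_axis('date')
                    .reset_index())
```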
 
requirements.txt CHANGED
@@ -19,3 +19,4 @@ textblob
  great_expectations==0.18.12
  prophet
  tensorflow
+ pandas_market_calendars
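requirements.txt gains `pandas_market_calendars`, which backs the NYSE-calendar logic in the stock scripts. A quick sanity check that the new dependency resolves after install:

```python
from importlib.metadata import version

import pandas_market_calendars as mcal

print(version('pandas_market_calendars'))   # the newly added dependency is installed
print(mcal.get_calendar('NYSE').tz)         # and the NYSE calendar loads (America/New_York)
```
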
training_pipeline.ipynb DELETED
@@ -1,167 +0,0 @@
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "True"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "import hopsworks\n",
- "from dotenv import load_dotenv\n",
- "import os\n",
- "\n",
- "load_dotenv()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Connected. Call `.close()` to terminate connection gracefully.\n",
- "\n",
- "Sample data from the feature view:\n",
- "<class 'tuple'>\n",
- "( date open sentiment\n",
- "0 2023-06-26T00:00:00.000Z 250.065 0.119444\n",
- "1 2023-07-25T00:00:00.000Z 272.380 0.119444\n",
- "2 2023-01-10T00:00:00.000Z 121.070 0.102207\n",
- "3 2023-05-11T00:00:00.000Z 168.700 0.141296\n",
- "4 2023-08-01T00:00:00.000Z 266.260 0.011111\n",
- ".. ... ... ...\n",
- "464 2022-12-22T00:00:00.000Z 136.000 0.102207\n",
- "465 2023-08-23T00:00:00.000Z 229.340 0.024046\n",
- "466 2022-09-08T00:00:00.000Z 281.300 0.087306\n",
- "467 2023-07-06T00:00:00.000Z 278.090 0.119444\n",
- "468 2023-10-27T00:00:00.000Z 210.600 0.164868\n",
- "\n",
- "[469 rows x 3 columns], ticker\n",
- "0 TSLA\n",
- "1 TSLA\n",
- "2 TSLA\n",
- "3 TSLA\n",
- "4 TSLA\n",
- ".. ...\n",
- "464 TSLA\n",
- "465 TSLA\n",
- "466 TSLA\n",
- "467 TSLA\n",
- "468 TSLA\n",
- "\n",
- "[469 rows x 1 columns])\n"
- ]
- }
- ],
- "source": [
- "import hsfs\n",
- "\n",
- "# Connection setup\n",
- "# Connect to Hopsworks\n",
- "api_key = os.getenv('hopsworks_api')\n",
- "connection = hsfs.connection()\n",
- "fs = connection.get_feature_store()\n",
- "\n",
- "# Get feature view\n",
- "feature_view = fs.get_feature_view(\n",
- "    name='tesla_stocks_fv',\n",
- "    version=1\n",
- ")\n",
- "td_version, td_job = feature_view.create_train_test_split(\n",
- "    description = 'tesla and news sentiment training dataset',\n",
- "    data_format = \"csv\",\n",
- "    test_size = 0.2,\n",
- "    coalesce = True,\n",
- "    statistics_config={\n",
- "        \"enabled\": True,\n",
- "        \"histograms\": False,\n",
- "        \"correlations\": False\n",
- "    } \n",
- ")\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "( date open sentiment\n",
- " 0 2023-06-26T00:00:00.000Z 250.065 0.119444\n",
- " 1 2023-07-25T00:00:00.000Z 272.380 0.119444\n",
- " 2 2023-01-10T00:00:00.000Z 121.070 0.102207\n",
- " 3 2023-05-11T00:00:00.000Z 168.700 0.141296\n",
- " 4 2023-08-01T00:00:00.000Z 266.260 0.011111\n",
- " .. ... ... ...\n",
- " 464 2022-12-22T00:00:00.000Z 136.000 0.102207\n",
- " 465 2023-08-23T00:00:00.000Z 229.340 0.024046\n",
- " 466 2022-09-08T00:00:00.000Z 281.300 0.087306\n",
- " 467 2023-07-06T00:00:00.000Z 278.090 0.119444\n",
- " 468 2023-10-27T00:00:00.000Z 210.600 0.164868\n",
- " \n",
- " [469 rows x 3 columns],\n",
- " ticker\n",
- " 0 TSLA\n",
- " 1 TSLA\n",
- " 2 TSLA\n",
- " 3 TSLA\n",
- " 4 TSLA\n",
- " .. ...\n",
- " 464 TSLA\n",
- " 465 TSLA\n",
- " 466 TSLA\n",
- " 467 TSLA\n",
- " 468 TSLA\n",
- " \n",
- " [469 rows x 1 columns])"
- ]
- },
- "execution_count": 22,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sample_data"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "base",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.4"
- },
- "orig_nbformat": 4
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
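The split created in the deleted notebook above (whose logic now lives in `Stocks news prediction/SML/training_pipeline.py` and `Notebooks/7_training_pipeline.ipynb`) is read back during training with `get_train_test_split`; a sketch that reuses the notebook's `feature_view` and `td_version` variables:

```python
# Reads the materialized split back as pandas frames; the label column
# ('ticker') comes from the feature view's `labels` definition.
X_train, X_test, y_train, y_test = feature_view.get_train_test_split(
    training_dataset_version=td_version
)
print(X_train.shape, X_test.shape)
```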