mtzeve commited on
Commit
1f66d66
·
1 Parent(s): 92a94bb
__pycache__/news_preprocessing.cpython-311.pyc ADDED
Binary file (1.86 kB). View file
 
feature_engineering.py DELETED
@@ -1,32 +0,0 @@
1
- # %%
2
- import requests
3
- import pandas as pd
4
- import json
5
- import datetime
6
- import numpy as np
7
- from datetime import timedelta
8
-
9
- # %%
10
- def getNews(api_key,endpoint,ticker,from_date,to_date,num=1000):
11
- # Set the parameters for the request
12
- params = {
13
- "api_token": api_key,
14
- "s": ticker,
15
- "from": from_date,
16
- "to": to_date,
17
- "limit": num,
18
- }
19
-
20
- # Make the request to the API
21
- response = requests.get(endpoint, params=params)
22
-
23
- # Print the response from the API
24
- print(response.json())
25
-
26
- #Return a Pandas dataframe from the response
27
- return pd.DataFrame(response.json())
28
-
29
- # %%
30
-
31
-
32
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
feature_pipeline.ipynb CHANGED
@@ -2,7 +2,7 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 17,
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
@@ -12,20 +12,20 @@
12
  },
13
  {
14
  "cell_type": "code",
15
- "execution_count": 18,
16
  "metadata": {},
17
  "outputs": [
18
  {
19
  "name": "stdout",
20
  "output_type": "stream",
21
  "text": [
22
- " 1. open 2. high 3. low 4. close 5. volume\n",
23
- "date \n",
24
- "2024-04-26 168.85 172.12 166.3700 168.29 109815725.0\n",
25
- "2024-04-25 158.96 170.88 158.3600 170.18 126427521.0\n",
26
- "2024-04-24 162.84 167.97 157.5100 162.13 181178020.0\n",
27
- "2024-04-23 143.33 147.26 141.1100 144.68 124545104.0\n",
28
- "2024-04-22 140.56 144.44 138.8025 142.05 107097564.0\n"
29
  ]
30
  }
31
  ],
@@ -48,53 +48,12 @@
48
  "cell_type": "code",
49
  "execution_count": null,
50
  "metadata": {},
51
- "outputs": [
52
- {
53
- "name": "stdout",
54
- "output_type": "stream",
55
- "text": [
56
- "Feature Group created successfully, explore it at \n",
57
- "https://c.app.hopsworks.ai:443/p/549016/fs/544838/fg/752979\n"
58
- ]
59
- },
60
- {
61
- "data": {
62
- "application/vnd.jupyter.widget-view+json": {
63
- "model_id": "394c6ab7da624ed388df0b9b8bff469a",
64
- "version_major": 2,
65
- "version_minor": 0
66
- },
67
- "text/plain": [
68
- "Uploading Dataframe: 0.00% | | Rows 0/3479 | Elapsed Time: 00:00 | Remaining Time: ?"
69
- ]
70
- },
71
- "metadata": {},
72
- "output_type": "display_data"
73
- },
74
- {
75
- "name": "stdout",
76
- "output_type": "stream",
77
- "text": [
78
- "Launching job: tsla_stock_1_offline_fg_materialization\n",
79
- "Job started successfully, you can follow the progress at \n",
80
- "https://c.app.hopsworks.ai/p/549016/jobs/named/tsla_stock_1_offline_fg_materialization/executions\n"
81
- ]
82
- },
83
- {
84
- "data": {
85
- "text/plain": [
86
- "(<hsfs.core.job.Job at 0x158c80fd0>, None)"
87
- ]
88
- },
89
- "metadata": {},
90
- "output_type": "display_data"
91
- }
92
- ],
93
  "source": []
94
  },
95
  {
96
  "cell_type": "code",
97
- "execution_count": 3,
98
  "metadata": {},
99
  "outputs": [
100
  {
@@ -102,17 +61,17 @@
102
  "output_type": "stream",
103
  "text": [
104
  "<class 'pandas.core.frame.DataFrame'>\n",
105
- "DatetimeIndex: 3479 entries, 2024-04-24 to 2010-06-29\n",
106
  "Data columns (total 5 columns):\n",
107
  " # Column Non-Null Count Dtype \n",
108
  "--- ------ -------------- ----- \n",
109
- " 0 1. open 3479 non-null float64\n",
110
- " 1 2. high 3479 non-null float64\n",
111
- " 2 3. low 3479 non-null float64\n",
112
- " 3 4. close 3479 non-null float64\n",
113
- " 4 5. volume 3479 non-null float64\n",
114
  "dtypes: float64(5)\n",
115
- "memory usage: 163.1 KB\n"
116
  ]
117
  }
118
  ],
@@ -122,7 +81,7 @@
122
  },
123
  {
124
  "cell_type": "code",
125
- "execution_count": 4,
126
  "metadata": {},
127
  "outputs": [
128
  {
@@ -130,12 +89,12 @@
130
  "text/plain": [
131
  "{'1. Information': 'Daily Prices (open, high, low, close) and Volumes',\n",
132
  " '2. Symbol': 'TSLA',\n",
133
- " '3. Last Refreshed': '2024-04-24',\n",
134
  " '4. Output Size': 'Full size',\n",
135
  " '5. Time Zone': 'US/Eastern'}"
136
  ]
137
  },
138
- "execution_count": 4,
139
  "metadata": {},
140
  "output_type": "execute_result"
141
  }
@@ -146,7 +105,7 @@
146
  },
147
  {
148
  "cell_type": "code",
149
- "execution_count": 5,
150
  "metadata": {},
151
  "outputs": [
152
  {
@@ -169,19 +128,19 @@
169
  },
170
  {
171
  "cell_type": "code",
172
- "execution_count": 6,
173
  "metadata": {},
174
  "outputs": [
175
  {
176
  "name": "stdout",
177
  "output_type": "stream",
178
  "text": [
179
- " date 1. open 2. high 3. low 4. close 5. volume\n",
180
- "0 2024-04-24 162.84 167.97 157.5100 162.13 181178020.0\n",
181
- "1 2024-04-23 143.33 147.26 141.1100 144.68 124545104.0\n",
182
- "2 2024-04-22 140.56 144.44 138.8025 142.05 107097564.0\n",
183
- "3 2024-04-19 148.97 150.94 146.2200 147.05 87074500.0\n",
184
- "4 2024-04-18 151.25 152.20 148.7000 149.93 96098830.0\n"
185
  ]
186
  }
187
  ],
@@ -193,7 +152,7 @@
193
  },
194
  {
195
  "cell_type": "code",
196
- "execution_count": 7,
197
  "metadata": {},
198
  "outputs": [
199
  {
@@ -232,7 +191,7 @@
232
  },
233
  {
234
  "cell_type": "code",
235
- "execution_count": 8,
236
  "metadata": {},
237
  "outputs": [],
238
  "source": [
@@ -241,7 +200,7 @@
241
  },
242
  {
243
  "cell_type": "code",
244
- "execution_count": 9,
245
  "metadata": {},
246
  "outputs": [],
247
  "source": [
@@ -253,7 +212,7 @@
253
  },
254
  {
255
  "cell_type": "code",
256
- "execution_count": 10,
257
  "metadata": {},
258
  "outputs": [
259
  {
@@ -288,48 +247,48 @@
288
  " <tbody>\n",
289
  " <tr>\n",
290
  " <th>0</th>\n",
291
- " <td>2024-04-24</td>\n",
292
- " <td>162.84</td>\n",
293
- " <td>167.9700</td>\n",
294
- " <td>157.5100</td>\n",
295
- " <td>162.13</td>\n",
296
- " <td>181178020.0</td>\n",
297
  " </tr>\n",
298
  " <tr>\n",
299
  " <th>1</th>\n",
300
- " <td>2024-04-23</td>\n",
301
- " <td>143.33</td>\n",
302
- " <td>147.2600</td>\n",
303
- " <td>141.1100</td>\n",
304
- " <td>144.68</td>\n",
305
- " <td>124545104.0</td>\n",
306
  " </tr>\n",
307
  " <tr>\n",
308
  " <th>2</th>\n",
309
- " <td>2024-04-22</td>\n",
310
- " <td>140.56</td>\n",
311
- " <td>144.4400</td>\n",
312
- " <td>138.8025</td>\n",
313
- " <td>142.05</td>\n",
314
- " <td>107097564.0</td>\n",
315
  " </tr>\n",
316
  " <tr>\n",
317
  " <th>3</th>\n",
318
- " <td>2024-04-19</td>\n",
319
- " <td>148.97</td>\n",
320
- " <td>150.9400</td>\n",
321
- " <td>146.2200</td>\n",
322
- " <td>147.05</td>\n",
323
- " <td>87074500.0</td>\n",
324
  " </tr>\n",
325
  " <tr>\n",
326
  " <th>4</th>\n",
327
- " <td>2024-04-18</td>\n",
328
- " <td>151.25</td>\n",
329
- " <td>152.2000</td>\n",
330
- " <td>148.7000</td>\n",
331
- " <td>149.93</td>\n",
332
- " <td>96098830.0</td>\n",
333
  " </tr>\n",
334
  " <tr>\n",
335
  " <th>...</th>\n",
@@ -341,73 +300,73 @@
341
  " <td>...</td>\n",
342
  " </tr>\n",
343
  " <tr>\n",
344
- " <th>3474</th>\n",
345
  " <td>2010-07-06</td>\n",
346
  " <td>20.00</td>\n",
347
  " <td>20.0000</td>\n",
348
- " <td>15.8300</td>\n",
349
  " <td>16.11</td>\n",
350
  " <td>6866900.0</td>\n",
351
  " </tr>\n",
352
  " <tr>\n",
353
- " <th>3475</th>\n",
354
  " <td>2010-07-02</td>\n",
355
  " <td>23.00</td>\n",
356
  " <td>23.1000</td>\n",
357
- " <td>18.7100</td>\n",
358
  " <td>19.20</td>\n",
359
  " <td>5139800.0</td>\n",
360
  " </tr>\n",
361
  " <tr>\n",
362
- " <th>3476</th>\n",
363
  " <td>2010-07-01</td>\n",
364
  " <td>25.00</td>\n",
365
  " <td>25.9200</td>\n",
366
- " <td>20.2700</td>\n",
367
  " <td>21.96</td>\n",
368
  " <td>8218800.0</td>\n",
369
  " </tr>\n",
370
  " <tr>\n",
371
- " <th>3477</th>\n",
372
  " <td>2010-06-30</td>\n",
373
  " <td>25.79</td>\n",
374
  " <td>30.4192</td>\n",
375
- " <td>23.3000</td>\n",
376
  " <td>23.83</td>\n",
377
  " <td>17187100.0</td>\n",
378
  " </tr>\n",
379
  " <tr>\n",
380
- " <th>3478</th>\n",
381
  " <td>2010-06-29</td>\n",
382
  " <td>19.00</td>\n",
383
  " <td>25.0000</td>\n",
384
- " <td>17.5400</td>\n",
385
  " <td>23.89</td>\n",
386
  " <td>18766300.0</td>\n",
387
  " </tr>\n",
388
  " </tbody>\n",
389
  "</table>\n",
390
- "<p>3479 rows × 6 columns</p>\n",
391
  "</div>"
392
  ],
393
  "text/plain": [
394
- " date 1. open 2. high 3. low 4. close 5. volume\n",
395
- "0 2024-04-24 162.84 167.9700 157.5100 162.13 181178020.0\n",
396
- "1 2024-04-23 143.33 147.2600 141.1100 144.68 124545104.0\n",
397
- "2 2024-04-22 140.56 144.4400 138.8025 142.05 107097564.0\n",
398
- "3 2024-04-19 148.97 150.9400 146.2200 147.05 87074500.0\n",
399
- "4 2024-04-18 151.25 152.2000 148.7000 149.93 96098830.0\n",
400
- "... ... ... ... ... ... ...\n",
401
- "3474 2010-07-06 20.00 20.0000 15.8300 16.11 6866900.0\n",
402
- "3475 2010-07-02 23.00 23.1000 18.7100 19.20 5139800.0\n",
403
- "3476 2010-07-01 25.00 25.9200 20.2700 21.96 8218800.0\n",
404
- "3477 2010-06-30 25.79 30.4192 23.3000 23.83 17187100.0\n",
405
- "3478 2010-06-29 19.00 25.0000 17.5400 23.89 18766300.0\n",
406
  "\n",
407
- "[3479 rows x 6 columns]"
408
  ]
409
  },
410
- "execution_count": 10,
411
  "metadata": {},
412
  "output_type": "execute_result"
413
  }
@@ -418,7 +377,7 @@
418
  },
419
  {
420
  "cell_type": "code",
421
- "execution_count": 11,
422
  "metadata": {},
423
  "outputs": [],
424
  "source": [
@@ -428,7 +387,7 @@
428
  },
429
  {
430
  "cell_type": "code",
431
- "execution_count": 12,
432
  "metadata": {},
433
  "outputs": [
434
  {
@@ -445,7 +404,7 @@
445
  },
446
  {
447
  "cell_type": "code",
448
- "execution_count": 13,
449
  "metadata": {},
450
  "outputs": [],
451
  "source": [
@@ -461,18 +420,18 @@
461
  },
462
  {
463
  "cell_type": "code",
464
- "execution_count": 14,
465
  "metadata": {},
466
  "outputs": [
467
  {
468
  "data": {
469
  "application/vnd.jupyter.widget-view+json": {
470
- "model_id": "91ef74ded4714a1492bdc24b176c4f1e",
471
  "version_major": 2,
472
  "version_minor": 0
473
  },
474
  "text/plain": [
475
- "Uploading Dataframe: 0.00% | | Rows 0/3479 | Elapsed Time: 00:00 | Remaining Time: ?"
476
  ]
477
  },
478
  "metadata": {},
@@ -490,10 +449,10 @@
490
  {
491
  "data": {
492
  "text/plain": [
493
- "(<hsfs.core.job.Job at 0x177b01510>, None)"
494
  ]
495
  },
496
- "execution_count": 14,
497
  "metadata": {},
498
  "output_type": "execute_result"
499
  }
@@ -504,10 +463,65 @@
504
  },
505
  {
506
  "cell_type": "code",
507
- "execution_count": null,
508
  "metadata": {},
509
- "outputs": [],
510
- "source": []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
511
  }
512
  ],
513
  "metadata": {
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 2,
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
 
12
  },
13
  {
14
  "cell_type": "code",
15
+ "execution_count": 3,
16
  "metadata": {},
17
  "outputs": [
18
  {
19
  "name": "stdout",
20
  "output_type": "stream",
21
  "text": [
22
+ " 1. open 2. high 3. low 4. close 5. volume\n",
23
+ "date \n",
24
+ "2024-04-29 188.42 198.87 184.54 194.05 243869678.0\n",
25
+ "2024-04-26 168.85 172.12 166.37 168.29 109815725.0\n",
26
+ "2024-04-25 158.96 170.88 158.36 170.18 126427521.0\n",
27
+ "2024-04-24 162.84 167.97 157.51 162.13 181178020.0\n",
28
+ "2024-04-23 143.33 147.26 141.11 144.68 124545104.0\n"
29
  ]
30
  }
31
  ],
 
48
  "cell_type": "code",
49
  "execution_count": null,
50
  "metadata": {},
51
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  "source": []
53
  },
54
  {
55
  "cell_type": "code",
56
+ "execution_count": 4,
57
  "metadata": {},
58
  "outputs": [
59
  {
 
61
  "output_type": "stream",
62
  "text": [
63
  "<class 'pandas.core.frame.DataFrame'>\n",
64
+ "DatetimeIndex: 3482 entries, 2024-04-29 to 2010-06-29\n",
65
  "Data columns (total 5 columns):\n",
66
  " # Column Non-Null Count Dtype \n",
67
  "--- ------ -------------- ----- \n",
68
+ " 0 1. open 3482 non-null float64\n",
69
+ " 1 2. high 3482 non-null float64\n",
70
+ " 2 3. low 3482 non-null float64\n",
71
+ " 3 4. close 3482 non-null float64\n",
72
+ " 4 5. volume 3482 non-null float64\n",
73
  "dtypes: float64(5)\n",
74
+ "memory usage: 163.2 KB\n"
75
  ]
76
  }
77
  ],
 
81
  },
82
  {
83
  "cell_type": "code",
84
+ "execution_count": 5,
85
  "metadata": {},
86
  "outputs": [
87
  {
 
89
  "text/plain": [
90
  "{'1. Information': 'Daily Prices (open, high, low, close) and Volumes',\n",
91
  " '2. Symbol': 'TSLA',\n",
92
+ " '3. Last Refreshed': '2024-04-29',\n",
93
  " '4. Output Size': 'Full size',\n",
94
  " '5. Time Zone': 'US/Eastern'}"
95
  ]
96
  },
97
+ "execution_count": 5,
98
  "metadata": {},
99
  "output_type": "execute_result"
100
  }
 
105
  },
106
  {
107
  "cell_type": "code",
108
+ "execution_count": 6,
109
  "metadata": {},
110
  "outputs": [
111
  {
 
128
  },
129
  {
130
  "cell_type": "code",
131
+ "execution_count": 7,
132
  "metadata": {},
133
  "outputs": [
134
  {
135
  "name": "stdout",
136
  "output_type": "stream",
137
  "text": [
138
+ " date 1. open 2. high 3. low 4. close 5. volume\n",
139
+ "0 2024-04-29 188.42 198.87 184.54 194.05 243869678.0\n",
140
+ "1 2024-04-26 168.85 172.12 166.37 168.29 109815725.0\n",
141
+ "2 2024-04-25 158.96 170.88 158.36 170.18 126427521.0\n",
142
+ "3 2024-04-24 162.84 167.97 157.51 162.13 181178020.0\n",
143
+ "4 2024-04-23 143.33 147.26 141.11 144.68 124545104.0\n"
144
  ]
145
  }
146
  ],
 
152
  },
153
  {
154
  "cell_type": "code",
155
+ "execution_count": 8,
156
  "metadata": {},
157
  "outputs": [
158
  {
 
191
  },
192
  {
193
  "cell_type": "code",
194
+ "execution_count": 9,
195
  "metadata": {},
196
  "outputs": [],
197
  "source": [
 
200
  },
201
  {
202
  "cell_type": "code",
203
+ "execution_count": 10,
204
  "metadata": {},
205
  "outputs": [],
206
  "source": [
 
212
  },
213
  {
214
  "cell_type": "code",
215
+ "execution_count": 11,
216
  "metadata": {},
217
  "outputs": [
218
  {
 
247
  " <tbody>\n",
248
  " <tr>\n",
249
  " <th>0</th>\n",
250
+ " <td>2024-04-29</td>\n",
251
+ " <td>188.42</td>\n",
252
+ " <td>198.8700</td>\n",
253
+ " <td>184.54</td>\n",
254
+ " <td>194.05</td>\n",
255
+ " <td>243869678.0</td>\n",
256
  " </tr>\n",
257
  " <tr>\n",
258
  " <th>1</th>\n",
259
+ " <td>2024-04-26</td>\n",
260
+ " <td>168.85</td>\n",
261
+ " <td>172.1200</td>\n",
262
+ " <td>166.37</td>\n",
263
+ " <td>168.29</td>\n",
264
+ " <td>109815725.0</td>\n",
265
  " </tr>\n",
266
  " <tr>\n",
267
  " <th>2</th>\n",
268
+ " <td>2024-04-25</td>\n",
269
+ " <td>158.96</td>\n",
270
+ " <td>170.8800</td>\n",
271
+ " <td>158.36</td>\n",
272
+ " <td>170.18</td>\n",
273
+ " <td>126427521.0</td>\n",
274
  " </tr>\n",
275
  " <tr>\n",
276
  " <th>3</th>\n",
277
+ " <td>2024-04-24</td>\n",
278
+ " <td>162.84</td>\n",
279
+ " <td>167.9700</td>\n",
280
+ " <td>157.51</td>\n",
281
+ " <td>162.13</td>\n",
282
+ " <td>181178020.0</td>\n",
283
  " </tr>\n",
284
  " <tr>\n",
285
  " <th>4</th>\n",
286
+ " <td>2024-04-23</td>\n",
287
+ " <td>143.33</td>\n",
288
+ " <td>147.2600</td>\n",
289
+ " <td>141.11</td>\n",
290
+ " <td>144.68</td>\n",
291
+ " <td>124545104.0</td>\n",
292
  " </tr>\n",
293
  " <tr>\n",
294
  " <th>...</th>\n",
 
300
  " <td>...</td>\n",
301
  " </tr>\n",
302
  " <tr>\n",
303
+ " <th>3477</th>\n",
304
  " <td>2010-07-06</td>\n",
305
  " <td>20.00</td>\n",
306
  " <td>20.0000</td>\n",
307
+ " <td>15.83</td>\n",
308
  " <td>16.11</td>\n",
309
  " <td>6866900.0</td>\n",
310
  " </tr>\n",
311
  " <tr>\n",
312
+ " <th>3478</th>\n",
313
  " <td>2010-07-02</td>\n",
314
  " <td>23.00</td>\n",
315
  " <td>23.1000</td>\n",
316
+ " <td>18.71</td>\n",
317
  " <td>19.20</td>\n",
318
  " <td>5139800.0</td>\n",
319
  " </tr>\n",
320
  " <tr>\n",
321
+ " <th>3479</th>\n",
322
  " <td>2010-07-01</td>\n",
323
  " <td>25.00</td>\n",
324
  " <td>25.9200</td>\n",
325
+ " <td>20.27</td>\n",
326
  " <td>21.96</td>\n",
327
  " <td>8218800.0</td>\n",
328
  " </tr>\n",
329
  " <tr>\n",
330
+ " <th>3480</th>\n",
331
  " <td>2010-06-30</td>\n",
332
  " <td>25.79</td>\n",
333
  " <td>30.4192</td>\n",
334
+ " <td>23.30</td>\n",
335
  " <td>23.83</td>\n",
336
  " <td>17187100.0</td>\n",
337
  " </tr>\n",
338
  " <tr>\n",
339
+ " <th>3481</th>\n",
340
  " <td>2010-06-29</td>\n",
341
  " <td>19.00</td>\n",
342
  " <td>25.0000</td>\n",
343
+ " <td>17.54</td>\n",
344
  " <td>23.89</td>\n",
345
  " <td>18766300.0</td>\n",
346
  " </tr>\n",
347
  " </tbody>\n",
348
  "</table>\n",
349
+ "<p>3482 rows × 6 columns</p>\n",
350
  "</div>"
351
  ],
352
  "text/plain": [
353
+ " date 1. open 2. high 3. low 4. close 5. volume\n",
354
+ "0 2024-04-29 188.42 198.8700 184.54 194.05 243869678.0\n",
355
+ "1 2024-04-26 168.85 172.1200 166.37 168.29 109815725.0\n",
356
+ "2 2024-04-25 158.96 170.8800 158.36 170.18 126427521.0\n",
357
+ "3 2024-04-24 162.84 167.9700 157.51 162.13 181178020.0\n",
358
+ "4 2024-04-23 143.33 147.2600 141.11 144.68 124545104.0\n",
359
+ "... ... ... ... ... ... ...\n",
360
+ "3477 2010-07-06 20.00 20.0000 15.83 16.11 6866900.0\n",
361
+ "3478 2010-07-02 23.00 23.1000 18.71 19.20 5139800.0\n",
362
+ "3479 2010-07-01 25.00 25.9200 20.27 21.96 8218800.0\n",
363
+ "3480 2010-06-30 25.79 30.4192 23.30 23.83 17187100.0\n",
364
+ "3481 2010-06-29 19.00 25.0000 17.54 23.89 18766300.0\n",
365
  "\n",
366
+ "[3482 rows x 6 columns]"
367
  ]
368
  },
369
+ "execution_count": 11,
370
  "metadata": {},
371
  "output_type": "execute_result"
372
  }
 
377
  },
378
  {
379
  "cell_type": "code",
380
+ "execution_count": 12,
381
  "metadata": {},
382
  "outputs": [],
383
  "source": [
 
387
  },
388
  {
389
  "cell_type": "code",
390
+ "execution_count": 13,
391
  "metadata": {},
392
  "outputs": [
393
  {
 
404
  },
405
  {
406
  "cell_type": "code",
407
+ "execution_count": 14,
408
  "metadata": {},
409
  "outputs": [],
410
  "source": [
 
420
  },
421
  {
422
  "cell_type": "code",
423
+ "execution_count": 15,
424
  "metadata": {},
425
  "outputs": [
426
  {
427
  "data": {
428
  "application/vnd.jupyter.widget-view+json": {
429
+ "model_id": "ae6a0214d34943cabcdd66d70198ae3a",
430
  "version_major": 2,
431
  "version_minor": 0
432
  },
433
  "text/plain": [
434
+ "Uploading Dataframe: 0.00% | | Rows 0/3482 | Elapsed Time: 00:00 | Remaining Time: ?"
435
  ]
436
  },
437
  "metadata": {},
 
449
  {
450
  "data": {
451
  "text/plain": [
452
+ "(<hsfs.core.job.Job at 0x162ac3e50>, None)"
453
  ]
454
  },
455
+ "execution_count": 15,
456
  "metadata": {},
457
  "output_type": "execute_result"
458
  }
 
463
  },
464
  {
465
  "cell_type": "code",
466
+ "execution_count": 18,
467
  "metadata": {},
468
+ "outputs": [
469
+ {
470
+ "name": "stdout",
471
+ "output_type": "stream",
472
+ "text": [
473
+ "Feature Group created successfully, explore it at \n",
474
+ "https://c.app.hopsworks.ai:443/p/549016/fs/544838/fg/766341\n"
475
+ ]
476
+ },
477
+ {
478
+ "data": {
479
+ "application/vnd.jupyter.widget-view+json": {
480
+ "model_id": "74f0d70aeb3942c093321c530120434e",
481
+ "version_major": 2,
482
+ "version_minor": 0
483
+ },
484
+ "text/plain": [
485
+ "Uploading Dataframe: 0.00% | | Rows 0/712 | Elapsed Time: 00:00 | Remaining Time: ?"
486
+ ]
487
+ },
488
+ "metadata": {},
489
+ "output_type": "display_data"
490
+ },
491
+ {
492
+ "name": "stdout",
493
+ "output_type": "stream",
494
+ "text": [
495
+ "Launching job: news_sentiment_1_offline_fg_materialization\n",
496
+ "Job started successfully, you can follow the progress at \n",
497
+ "https://c.app.hopsworks.ai/p/549016/jobs/named/news_sentiment_1_offline_fg_materialization/executions\n"
498
+ ]
499
+ },
500
+ {
501
+ "data": {
502
+ "text/plain": [
503
+ "(<hsfs.core.job.Job at 0x164180710>, None)"
504
+ ]
505
+ },
506
+ "execution_count": 18,
507
+ "metadata": {},
508
+ "output_type": "execute_result"
509
+ }
510
+ ],
511
+ "source": [
512
+ "# Create feature group for historical news data\n",
513
+ "news_df = pd.read_csv('/Users/manos/Documents/BDS/MLops_mod/news_articles.csv')\n",
514
+ "\n",
515
+ "news_sentiment_fg = fs.get_or_create_feature_group(\n",
516
+ " name='news_sentiment',\n",
517
+ " description='News sentiment from Polygon',\n",
518
+ " version=1,\n",
519
+ " primary_key=['date'],\n",
520
+ " online_enabled=True,\n",
521
+ ")\n",
522
+ "\n",
523
+ "news_sentiment_fg.insert(news_df)"
524
+ ]
525
  }
526
  ],
527
  "metadata": {
feature_pipeline.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # %%
2
+ from dotenv import load_dotenv
3
+ import os
4
+
5
+ # %%
6
+ from alpha_vantage.timeseries import TimeSeries
7
+ import pandas as pd
8
+
9
+ load_dotenv()
10
+
11
+ api_key = os.environ.get('stocks_api') # Replace this with your actual API key
12
+ ts = TimeSeries(key=api_key, output_format='pandas')
13
+
14
+ # Fetch daily adjusted stock prices; adjust the symbol as needed
15
+ data, meta_data = ts.get_daily(symbol='TSLA', outputsize='full')
16
+
17
+ print(data.head())
18
+
19
+ # %%
20
+
21
+
22
+ # %%
23
+ data.info()
24
+
25
+ # %%
26
+ meta_data
27
+
28
+ # %%
29
+ # Define your file path and name
30
+ file_path = '/Users/manos/Documents/BDS/MLops_mod/TSLA_stock_price.csv' # Customize the path and filename
31
+
32
+ # Save the DataFrame to CSV
33
+ data.to_csv(file_path)
34
+
35
+ print(f"Data saved to {file_path}")
36
+
37
+
38
+ # %%
39
+ # Load and display the data from CSV to confirm
40
+ tsla_df = pd.read_csv(file_path)
41
+ print(tsla_df.head())
42
+
43
+
44
+ # %%
45
+ import hopsworks
46
+
47
+ project = hopsworks.login()
48
+ fs = project.get_feature_store()
49
+
50
+
51
+ # %%
52
+ import re
53
+
54
+ # %%
55
+ def clean_column_name(name):
56
+ # Remove all non-letter characters
57
+ cleaned_name = re.sub(r'[^a-zA-Z]', '', name)
58
+ return cleaned_name
59
+
60
+
61
+ # %%
62
+ tsla_df
63
+
64
+ # %%
65
+ # Assuming 'tsla_df' is your DataFrame
66
+ tsla_df.columns = [clean_column_name(col) for col in tsla_df.columns]
67
+
68
+
69
+ # %%
70
+ print(tsla_df.columns)
71
+
72
+
73
+ # %%
74
+ # Define a feature group
75
+ tesla_fg = fs.get_or_create_feature_group(
76
+ name="tsla_stock",
77
+ description="Tesla stock dataset from alpha vantage",
78
+ version=1,
79
+ primary_key=["date"],
80
+ online_enabled=True,
81
+ )
82
+
83
+ # %%
84
+ tesla_fg.insert(tsla_df, write_options={"wait_for_job" : False})
85
+
86
+ # %%
87
+ # Create feature group for historical news data
88
+ news_df = pd.read_csv('/Users/manos/Documents/BDS/MLops_mod/news_articles.csv')
89
+
90
+ news_sentiment_fg = fs.get_or_create_feature_group(
91
+ name='news_sentiment',
92
+ description='News sentiment from Polygon',
93
+ version=1,
94
+ primary_key=['date'],
95
+ online_enabled=True,
96
+ )
97
+
98
+ news_sentiment_fg.insert(news_df)
99
+
100
+
feature_preprocessing.ipynb DELETED
@@ -1,116 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 44,
6
- "metadata": {},
7
- "outputs": [],
8
- "source": [
9
- "from calendar import monthrange\n",
10
- "from feature_engineering import *\n",
11
- "import glob\n",
12
- "import pandas as pd\n",
13
- "from dotenv import load_dotenv\n",
14
- "import os\n",
15
- "\n",
16
- "load_dotenv()\n",
17
- "\n",
18
- "# Set the API endpoint and your API key\n",
19
- "endpoint = \"https://api.marketaux.com/v1/news/all?symbols=TSLA&filter_entities=true&published_after=2021&language=en&api_token=iy6rRX4oxFrouZocXr8JNpOzaxZLk3UvMfoMGxYs\"\n",
20
- "api_key = os.environ.get('news_api')\n",
21
- "\n",
22
- "# Set the ticker symbol\n",
23
- "ticker = \"TSLA\" #TSLA"
24
- ]
25
- },
26
- {
27
- "cell_type": "code",
28
- "execution_count": 34,
29
- "metadata": {},
30
- "outputs": [],
31
- "source": [
32
- "def getNews_historical(api_key,endpoint,ticker,year,month,num=1000):\n",
33
- " \n",
34
- " for start,end in zip([1,15],[16,monthrange(year, month)[1]]):\n",
35
- " \n",
36
- " from_date = '{}-{:02d}-{:02d}'.format(year,month,start)\n",
37
- " to_date = '{}-{:02d}-{:02d}'.format(year,month,end)\n",
38
- " \n",
39
- " print('Grabbing News data between {}-{}'.format(from_date,to_date)) \n",
40
- " news = getNews(api_key,endpoint,ticker,from_date,to_date)\n",
41
- " \n",
42
- " print('Number of articles: ',len(news.index))\n",
43
- " news.head(n=num)\n",
44
- "\n",
45
- " # Store the dataframe as a CSV file\n",
46
- " news.to_csv(\"/Users/manos/Documents/BDS/MLops_mod/TSLA_news_{}_to_{}.csv\".format(from_date,to_date))"
47
- ]
48
- },
49
- {
50
- "cell_type": "code",
51
- "execution_count": 36,
52
- "metadata": {},
53
- "outputs": [
54
- {
55
- "name": "stdout",
56
- "output_type": "stream",
57
- "text": [
58
- "{'meta': {'found': 58203, 'returned': 3, 'limit': 3, 'page': 1}, 'data': [{'uuid': 'a2f5f0e0-937a-4333-9aa7-da32fb0ede1f', 'title': \"What's next for Big Tech? See what SA analysts have to say\", 'description': 'Technology stocks have dropped over the past couple of weeks. See what SA analysts have to say about the overall state of tech and the economy.', 'keywords': '', 'snippet': 'Technology stocks have dropped over the past couple of weeks, and it was further seen with the selloff in Meta Platforms (META) and weak GDP data, as the two ac...', 'url': 'https://seekingalpha.com/news/4094186-tech-stocks-dive-see-what-sa-analysts-have-to-say', 'image_url': 'https://static.seekingalpha.com/cdn/s3/uploads/getty_images/184997191/image_184997191.jpg?io=getty-c-w750', 'language': 'en', 'published_at': '2024-04-26T12:20:54.000000Z', 'source': 'seekingalpha.com', 'relevance_score': None, 'entities': [{'symbol': 'TSLA', 'name': 'Tesla, Inc.', 'exchange': None, 'exchange_long': None, 'country': 'us', 'type': 'equity', 'industry': 'Consumer Cyclical', 'match_score': 11.309888, 'sentiment_score': 0, 'highlights': [{'highlight': '<em>Tesla</em> (TSLA) -3.5% .\\n\\nTech ETFs', 'sentiment': 0, 'highlighted_in': 'main_text'}]}], 'similar': []}, {'uuid': '650adf2f-d62f-478d-9322-05d3e7d7532d', 'title': 'Stellantis And Tesla: Combine These Stocks For The Ultimate Automotive Portfolio (STLA)', 'description': 'Tesla and Stellantis are two automakers that complement each other. Find out why I see both STLA and TSLA stocks as currently undervalued.', 'keywords': '', 'snippet': 'Tramino/iStock Unreleased via Getty Images\\n\\nStellantis N.V. (NYSE:STLA) and Tesla, Inc. 
(TSLA) are two very distinct automakers that, in my view, perfectly comp...', 'url': 'https://seekingalpha.com/article/4686610-stellantis-tesla-combine-these-stocks-for-ultimate-automotive-portfolio', 'image_url': 'https://static.seekingalpha.com/cdn/s3/uploads/getty_images/1305717707/image_1305717707.jpg?io=getty-c-w1536', 'language': 'en', 'published_at': '2024-04-26T10:58:06.000000Z', 'source': 'seekingalpha.com', 'relevance_score': None, 'entities': [{'symbol': 'TSLA', 'name': 'Tesla, Inc.', 'exchange': None, 'exchange_long': None, 'country': 'us', 'type': 'equity', 'industry': 'Consumer Cyclical', 'match_score': 22.866589, 'sentiment_score': 0.173982, 'highlights': [{'highlight': '(NYSE:STLA) and <em>Tesla</em>, <em>Inc</em>. (TSLA) are two very distinct automakers that, in my view, perfectly complement each other. By entering a 50/50 balanced position in the two companies, you can create an “artificial” automaker in your portfolio that is bound to dominate the industry and provide superior returns for shareholders.', 'sentiment': 0.8519, 'highlighted_in': 'main_text'}, {'highlight': 'The brands of “TESSA” include:\\n\\n<em>Tesla</em>, the leading global EV brand and #1 most valuable car brand in the world. Because of Tesla’s aggressive price policy lately, I believe it makes almost no economic sense to buy an EV that is not a <em>Tesla</em>, for the majority of consumers. More on this shortly.', 'sentiment': 0.2089, 'highlighted_in': 'main_text'}, {'highlight': 'The two overall car brand portfolios encompass all market segments\\n\\nGoing beyond EVs, I see “TESSA’s” car portfolio to cover all segments, again because of the complementarity of <em>Tesla</em> and Stellantis. 
The below chart outlines how all car segments are covered by either <em>Tesla</em> or Stellantis.', 'sentiment': 0, 'highlighted_in': 'main_text'}, {'highlight': 'To be fair, both Stellantis and <em>Tesla</em> margins declined in 2023, and in the case of <em>Tesla</em>, the company just reported that margins are now down to 5.5% after Q1 price cuts.\\n\\nHowever, I believe that Tesla’s margins at the moment do not tell the full story.', 'sentiment': 0.0258, 'highlighted_in': 'main_text'}, {'highlight': 'It is precisely because it enjoyed a 25%+ operating margin back in 2021 that <em>Tesla</em> was able to grow its company with aggressive pricing in the past 2 years.\\n\\nToday, for the majority of use cases, I believe buying an EV that is not a <em>Tesla</em> does not make rational sense.', 'sentiment': 0.4019, 'highlighted_in': 'main_text'}, {'highlight': 'These are cars that have starting prices that are significantly higher than <em>Tesla</em>, but with worse reviews, worse technology and limited access to Tesla’s SuperCharger system. 
Even EV-native car brands, such as Rivian and Polestar, have difficulty in competing with <em>Tesla</em>.', 'sentiment': -0.9294, 'highlighted_in': 'main_text'}, {'highlight': 'A Rivian R2 starts at $45,000, which is almost $7,000 more than the base <em>Tesla</em> Model 3.\\n\\nI believe that <em>Tesla</em> is using its margins to grow the EV category, converting ICE consumers, and simultaneously gain the monster share of that growing market.', 'sentiment': 0.743, 'highlighted_in': 'main_text'}, {'highlight': \"Key Financial Metrics for <em>Tesla</em>, Q1 24 (Tesla's Q1 Shareholders Presentation)\\n\\nKey Financial Metrics for Stellantis, Q1 24 (Stellantis' Q1 Shareholder Presentation)\\n\\nThis financial data tells the same story: <em>Tesla</em> and Stellantis complement each other.\", 'sentiment': 0, 'highlighted_in': 'main_text'}, {'highlight': 'VOO since 2021 (Seeking Alpha)\\n\\nWhat matters for my thesis is that <em>Tesla</em> and Stellantis are complementary in how they reward shareholders and how the market prices their stocks. 
<em>Tesla</em> is a tech company, looking at the long term, and shareholders need to be patient to see returns.', 'sentiment': 0.5859, 'highlighted_in': 'main_text'}, {'highlight': 'In that case, <em>Tesla</em> might generate returns significantly higher than Stellantis, to the point that it would have seemed silly to “dilute” a <em>Tesla</em> investment with another stock.', 'sentiment': 0.0258, 'highlighted_in': 'main_text'}, {'highlight': 'Stellantis And <em>Tesla</em>: Combine These Stocks For The Ultimate Automotive Portfolio (STLA)', 'sentiment': 0, 'highlighted_in': 'title'}]}], 'similar': []}, {'uuid': '47a58bd4-3a8d-40fe-8a89-934d0d695ea4', 'title': 'Tesla is being investigated by the NHTSA for Autopilot software fix (NASDAQ:TSLA)', 'description': \"The National Highway Traffic Safety Administration is investigating whether Tesla's recall of 2 million vehicles for Autopilot safeguards is sufficient.\", 'keywords': '', 'snippet': \"The National Highway Traffic Safety Administration confirmed on Friday that the safety regulator has opened an investigation into whether Tesla's (NASDAQ:TSLA) ...\", 'url': 'https://seekingalpha.com/news/4094754-tesla-is-being-investigated-by-the-nhtsa-for-autopilot-software-fix', 'image_url': 'https://static.seekingalpha.com/cdn/s3/uploads/getty_images/1415090444/image_1415090444.jpg?io=getty-c-w750', 'language': 'en', 'published_at': '2024-04-26T10:50:20.000000Z', 'source': 'seekingalpha.com', 'relevance_score': None, 'entities': [{'symbol': 'TSLA', 'name': 'Tesla, Inc.', 'exchange': None, 'exchange_long': None, 'country': 'us', 'type': 'equity', 'industry': 'Consumer Cyclical', 'match_score': 51.845444, 'sentiment_score': 0.42985, 'highlights': [{'highlight': \"The National Highway Traffic Safety Administration confirmed on Friday that the safety regulator has opened an investigation into whether Tesla's (<em>NASDAQ:TSLA</em>) recall of more than 2 million vehicles announced in December to install new Autopilot safeguards is 
adequate.\", 'sentiment': 0.836, 'highlighted_in': 'main_text'}, {'highlight': \"While <em>Tesla</em> has released software updates to address potential issues, NHTSA cited Tesla's statement that a portion of the remedy both requires the owner to opt in and allows a driver to readily reverse it.\", 'sentiment': 0, 'highlighted_in': 'main_text'}, {'highlight': 'In December, <em>Tesla</em> (TSLA) said its largest-ever recall was to better ensure drivers pay attention when using its advanced driver assistance system.\\n\\nShares of <em>Tesla</em> (TSLA) rose 1.17% in premarket trading on Friday to $172.17. The EV stock is down 31.51% on a year-to-date basis. Short interest stands at 3.84% of the total float.', 'sentiment': 0.8834, 'highlighted_in': 'main_text'}, {'highlight': '<em>Tesla</em> is being investigated by the NHTSA for Autopilot software fix (<em>NASDAQ:TSLA</em>)', 'sentiment': 0, 'highlighted_in': 'title'}]}], 'similar': [{'uuid': 'b269d18a-6ea0-4554-a20e-047c623513f9', 'title': 'US probes Tesla recall of 2 million vehicles over Autopilot, citing concerns By Reuters', 'description': 'US probes Tesla recall of 2 million vehicles over Autopilot, citing concerns', 'keywords': '', 'snippet': \"WASHINGTON (Reuters) - U.S. 
auto safety regulators said Friday they have opened an investigation into whether Tesla (NASDAQ: )'s recall of more than 2 million v...\", 'url': 'https://www.investing.com/news/stock-market-news/us-probes-tesla-recall-of-2-million-vehicles-over-autopilot-citing-concerns-3400236', 'image_url': 'https://i-invdn-com.investing.com/news/moved_LYNXMPEJ580NE_L.jpg', 'language': 'en', 'published_at': '2024-04-26T09:51:10.000000Z', 'source': 'investing.com', 'relevance_score': None, 'entities': [{'symbol': 'TSLA', 'name': 'Tesla, Inc.', 'exchange': None, 'exchange_long': None, 'country': 'us', 'type': 'equity', 'industry': 'Consumer Cyclical', 'match_score': 25.2132, 'sentiment_score': 0.432933, 'highlights': [{'highlight': \"WASHINGTON (Reuters) - U.S. auto safety regulators said Friday they have opened an investigation into whether <em>Tesla</em> (NASDAQ: )'s recall of more than 2 million vehicles announced in December to install new Autopilot safeguards is adequate.\", 'sentiment': 0.7269, 'highlighted_in': 'main_text'}, {'highlight': 'The agency said <em>Tesla</em> has issued software updates to address issues that appear related to its concerns but has not made them \"a part of the recall or otherwise determined to remedy a defect that poses an unreasonable safety risk.\"', 'sentiment': 0.5719, 'highlighted_in': 'main_text'}, {'highlight': 'US probes <em>Tesla</em> recall of 2 million vehicles over Autopilot, citing concerns By Reuters', 'sentiment': 0, 'highlighted_in': 'title'}]}]}]}]}\n"
59
- ]
60
- }
61
- ],
62
- "source": [
63
- "response = requests.get(endpoint)\n",
64
- "data = response.json()\n",
65
- "print(data) # See what the data looks like\n"
66
- ]
67
- },
68
- {
69
- "cell_type": "code",
70
- "execution_count": 39,
71
- "metadata": {},
72
- "outputs": [
73
- {
74
- "name": "stdout",
75
- "output_type": "stream",
76
- "text": [
77
- "Grabbing News data between 2022-01-01-2022-01-16\n"
78
- ]
79
- },
80
- {
81
- "ename": "ValueError",
82
- "evalue": "All arrays must be of the same length",
83
- "output_type": "error",
84
- "traceback": [
85
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
86
- "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
87
- "\u001b[1;32m/Users/manos/Documents/BDS/MLops_mod/feature_preprocessing.ipynb Cell 4\u001b[0m line \u001b[0;36m4\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/manos/Documents/BDS/MLops_mod/feature_preprocessing.ipynb#X14sZmlsZQ%3D%3D?line=1'>2</a>\u001b[0m \u001b[39mfor\u001b[39;00m year \u001b[39min\u001b[39;00m \u001b[39mrange\u001b[39m(\u001b[39m2022\u001b[39m,\u001b[39m2023\u001b[39m):\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/manos/Documents/BDS/MLops_mod/feature_preprocessing.ipynb#X14sZmlsZQ%3D%3D?line=2'>3</a>\u001b[0m \u001b[39mfor\u001b[39;00m month \u001b[39min\u001b[39;00m \u001b[39mrange\u001b[39m(\u001b[39m1\u001b[39m,\u001b[39m13\u001b[39m):\n\u001b[0;32m----> <a href='vscode-notebook-cell:/Users/manos/Documents/BDS/MLops_mod/feature_preprocessing.ipynb#X14sZmlsZQ%3D%3D?line=3'>4</a>\u001b[0m getNews_historical(api_key,endpoint,ticker,year,month)\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/manos/Documents/BDS/MLops_mod/feature_preprocessing.ipynb#X14sZmlsZQ%3D%3D?line=4'>5</a>\u001b[0m \u001b[39mif\u001b[39;00m year \u001b[39m==\u001b[39m \u001b[39m2023\u001b[39m \u001b[39mand\u001b[39;00m month \u001b[39m==\u001b[39m \u001b[39m1\u001b[39m:\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/manos/Documents/BDS/MLops_mod/feature_preprocessing.ipynb#X14sZmlsZQ%3D%3D?line=5'>6</a>\u001b[0m \u001b[39mbreak\u001b[39;00m\n",
88
- "\u001b[1;32m/Users/manos/Documents/BDS/MLops_mod/feature_preprocessing.ipynb Cell 4\u001b[0m line \u001b[0;36m9\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/manos/Documents/BDS/MLops_mod/feature_preprocessing.ipynb#X14sZmlsZQ%3D%3D?line=5'>6</a>\u001b[0m to_date \u001b[39m=\u001b[39m \u001b[39m'\u001b[39m\u001b[39m{}\u001b[39;00m\u001b[39m-\u001b[39m\u001b[39m{:02d}\u001b[39;00m\u001b[39m-\u001b[39m\u001b[39m{:02d}\u001b[39;00m\u001b[39m'\u001b[39m\u001b[39m.\u001b[39mformat(year,month,end)\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/manos/Documents/BDS/MLops_mod/feature_preprocessing.ipynb#X14sZmlsZQ%3D%3D?line=7'>8</a>\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m'\u001b[39m\u001b[39mGrabbing News data between \u001b[39m\u001b[39m{}\u001b[39;00m\u001b[39m-\u001b[39m\u001b[39m{}\u001b[39;00m\u001b[39m'\u001b[39m\u001b[39m.\u001b[39mformat(from_date,to_date)) \n\u001b[0;32m----> <a href='vscode-notebook-cell:/Users/manos/Documents/BDS/MLops_mod/feature_preprocessing.ipynb#X14sZmlsZQ%3D%3D?line=8'>9</a>\u001b[0m news \u001b[39m=\u001b[39m getNews(api_key,endpoint,ticker,from_date,to_date)\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/manos/Documents/BDS/MLops_mod/feature_preprocessing.ipynb#X14sZmlsZQ%3D%3D?line=10'>11</a>\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m'\u001b[39m\u001b[39mNumber of articles: \u001b[39m\u001b[39m'\u001b[39m,\u001b[39mlen\u001b[39m(news\u001b[39m.\u001b[39mindex))\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/manos/Documents/BDS/MLops_mod/feature_preprocessing.ipynb#X14sZmlsZQ%3D%3D?line=11'>12</a>\u001b[0m news\u001b[39m.\u001b[39mhead(n\u001b[39m=\u001b[39mnum)\n",
89
- "File \u001b[0;32m~/Documents/BDS/MLops_mod/feature_engineering.py:27\u001b[0m, in \u001b[0;36mgetNews\u001b[0;34m(api_key, endpoint, ticker, from_date, to_date, num)\u001b[0m\n\u001b[1;32m 21\u001b[0m response \u001b[39m=\u001b[39m requests\u001b[39m.\u001b[39mget(endpoint, params\u001b[39m=\u001b[39mparams)\n\u001b[1;32m 23\u001b[0m \u001b[39m# Print the response from the API\u001b[39;00m\n\u001b[1;32m 24\u001b[0m \u001b[39m#print(response.json())\u001b[39;00m\n\u001b[1;32m 25\u001b[0m \n\u001b[1;32m 26\u001b[0m \u001b[39m#Return a Pandas dataframe from the response\u001b[39;00m\n\u001b[0;32m---> 27\u001b[0m \u001b[39mreturn\u001b[39;00m pd\u001b[39m.\u001b[39mDataFrame(response\u001b[39m.\u001b[39mjson())\n",
90
- "File \u001b[0;32m/Applications/anaconda3/lib/python3.11/site-packages/pandas/core/frame.py:662\u001b[0m, in \u001b[0;36mDataFrame.__init__\u001b[0;34m(self, data, index, columns, dtype, copy)\u001b[0m\n\u001b[1;32m 656\u001b[0m mgr \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_init_mgr(\n\u001b[1;32m 657\u001b[0m data, axes\u001b[39m=\u001b[39m{\u001b[39m\"\u001b[39m\u001b[39mindex\u001b[39m\u001b[39m\"\u001b[39m: index, \u001b[39m\"\u001b[39m\u001b[39mcolumns\u001b[39m\u001b[39m\"\u001b[39m: columns}, dtype\u001b[39m=\u001b[39mdtype, copy\u001b[39m=\u001b[39mcopy\n\u001b[1;32m 658\u001b[0m )\n\u001b[1;32m 660\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39misinstance\u001b[39m(data, \u001b[39mdict\u001b[39m):\n\u001b[1;32m 661\u001b[0m \u001b[39m# GH#38939 de facto copy defaults to False only in non-dict cases\u001b[39;00m\n\u001b[0;32m--> 662\u001b[0m mgr \u001b[39m=\u001b[39m dict_to_mgr(data, index, columns, dtype\u001b[39m=\u001b[39mdtype, copy\u001b[39m=\u001b[39mcopy, typ\u001b[39m=\u001b[39mmanager)\n\u001b[1;32m 663\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39misinstance\u001b[39m(data, ma\u001b[39m.\u001b[39mMaskedArray):\n\u001b[1;32m 664\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mnumpy\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mma\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mmrecords\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39mmrecords\u001b[39;00m\n",
91
- "File \u001b[0;32m/Applications/anaconda3/lib/python3.11/site-packages/pandas/core/internals/construction.py:493\u001b[0m, in \u001b[0;36mdict_to_mgr\u001b[0;34m(data, index, columns, dtype, typ, copy)\u001b[0m\n\u001b[1;32m 489\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 490\u001b[0m \u001b[39m# dtype check to exclude e.g. range objects, scalars\u001b[39;00m\n\u001b[1;32m 491\u001b[0m arrays \u001b[39m=\u001b[39m [x\u001b[39m.\u001b[39mcopy() \u001b[39mif\u001b[39;00m \u001b[39mhasattr\u001b[39m(x, \u001b[39m\"\u001b[39m\u001b[39mdtype\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39melse\u001b[39;00m x \u001b[39mfor\u001b[39;00m x \u001b[39min\u001b[39;00m arrays]\n\u001b[0;32m--> 493\u001b[0m \u001b[39mreturn\u001b[39;00m arrays_to_mgr(arrays, columns, index, dtype\u001b[39m=\u001b[39mdtype, typ\u001b[39m=\u001b[39mtyp, consolidate\u001b[39m=\u001b[39mcopy)\n",
92
- "File \u001b[0;32m/Applications/anaconda3/lib/python3.11/site-packages/pandas/core/internals/construction.py:118\u001b[0m, in \u001b[0;36marrays_to_mgr\u001b[0;34m(arrays, columns, index, dtype, verify_integrity, typ, consolidate)\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[39mif\u001b[39;00m verify_integrity:\n\u001b[1;32m 116\u001b[0m \u001b[39m# figure out the index, if necessary\u001b[39;00m\n\u001b[1;32m 117\u001b[0m \u001b[39mif\u001b[39;00m index \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m--> 118\u001b[0m index \u001b[39m=\u001b[39m _extract_index(arrays)\n\u001b[1;32m 119\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 120\u001b[0m index \u001b[39m=\u001b[39m ensure_index(index)\n",
93
- "File \u001b[0;32m/Applications/anaconda3/lib/python3.11/site-packages/pandas/core/internals/construction.py:666\u001b[0m, in \u001b[0;36m_extract_index\u001b[0;34m(data)\u001b[0m\n\u001b[1;32m 664\u001b[0m lengths \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(\u001b[39mset\u001b[39m(raw_lengths))\n\u001b[1;32m 665\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(lengths) \u001b[39m>\u001b[39m \u001b[39m1\u001b[39m:\n\u001b[0;32m--> 666\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\u001b[39m\"\u001b[39m\u001b[39mAll arrays must be of the same length\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 668\u001b[0m \u001b[39mif\u001b[39;00m have_dicts:\n\u001b[1;32m 669\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m 670\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mMixing dicts with non-Series may lead to ambiguous ordering.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 671\u001b[0m )\n",
94
- "\u001b[0;31mValueError\u001b[0m: All arrays must be of the same length"
95
- ]
96
- }
97
- ],
98
- "source": [
99
- "# Grab old data\n",
100
- "for year in range(2022,2023):\n",
101
- " for month in range(1,13):\n",
102
- " getNews_historical(api_key,endpoint,ticker,year,month)\n",
103
- " if year == 2023 and month == 1:\n",
104
- " break"
105
- ]
106
- }
107
- ],
108
- "metadata": {
109
- "language_info": {
110
- "name": "python"
111
- },
112
- "orig_nbformat": 4
113
- },
114
- "nbformat": 4,
115
- "nbformat_minor": 2
116
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
feature_view.ipynb CHANGED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 10,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "import modal\n",
11
+ "import hopsworks\n",
12
+ "import pandas as pd\n",
13
+ "import numpy as np\n",
14
+ "from calendar import monthrange\n",
15
+ "from feature_engineering import *\n",
16
+ "import glob\n",
17
+ "import pandas as pd\n",
18
+ "from dotenv import load_dotenv\n",
19
+ "import os"
20
+ ]
21
+ }
22
+ ],
23
+ "metadata": {
24
+ "kernelspec": {
25
+ "display_name": "base",
26
+ "language": "python",
27
+ "name": "python3"
28
+ },
29
+ "language_info": {
30
+ "codemirror_mode": {
31
+ "name": "ipython",
32
+ "version": 3
33
+ },
34
+ "file_extension": ".py",
35
+ "mimetype": "text/x-python",
36
+ "name": "python",
37
+ "nbconvert_exporter": "python",
38
+ "pygments_lexer": "ipython3",
39
+ "version": "3.11.4"
40
+ },
41
+ "orig_nbformat": 4
42
+ },
43
+ "nbformat": 4,
44
+ "nbformat_minor": 2
45
+ }
historical_news.ipynb ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": []
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 2,
13
+ "metadata": {},
14
+ "outputs": [],
15
+ "source": [
16
+ "from dotenv import load_dotenv\n",
17
+ "from datetime import datetime, timedelta\n",
18
+ "import requests\n",
19
+ "import os\n",
20
+ "import time\n",
21
+ "import pandas as pd \n",
22
+ "from news_preprocessing import *"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": 5,
28
+ "metadata": {},
29
+ "outputs": [
30
+ {
31
+ "name": "stdout",
32
+ "output_type": "stream",
33
+ "text": [
34
+ "Fetched 50 articles from 2022-04-01 to 2022-05-21\n",
35
+ "Fetched 50 articles from 2022-05-22 to 2022-07-11\n",
36
+ "Fetched 50 articles from 2022-07-12 to 2022-08-31\n",
37
+ "Fetched 50 articles from 2022-09-01 to 2022-10-21\n",
38
+ "Fetched 50 articles from 2022-10-22 to 2022-12-11\n",
39
+ "Rate limit reached. Waiting to retry...\n",
40
+ "Fetched 50 articles from 2022-12-12 to 2023-01-31\n",
41
+ "Fetched 50 articles from 2023-02-01 to 2023-03-23\n",
42
+ "Fetched 50 articles from 2023-03-24 to 2023-05-13\n",
43
+ "Fetched 50 articles from 2023-05-14 to 2023-07-03\n",
44
+ "Fetched 50 articles from 2023-07-04 to 2023-08-23\n",
45
+ "Rate limit reached. Waiting to retry...\n",
46
+ "Fetched 50 articles from 2023-08-24 to 2023-10-13\n",
47
+ "Fetched 50 articles from 2023-10-14 to 2023-12-03\n",
48
+ "Fetched 50 articles from 2023-12-04 to 2024-01-23\n",
49
+ "Fetched 50 articles from 2024-01-24 to 2024-03-14\n",
50
+ "Fetched 50 articles from 2024-03-15 to 2024-04-01\n",
51
+ "Total articles fetched: 750\n"
52
+ ]
53
+ }
54
+ ],
55
+ "source": [
56
+ "def fetch_news(api_key, ticker, start_date, end_date):\n",
57
+ " base_url = os.environ.get(\"endpointnewsp\")\n",
58
+ " headers = {\"Authorization\": f\"Bearer {api_key}\"}\n",
59
+ " all_news = []\n",
60
+ " \n",
61
+ " current_date = start_date\n",
62
+ "\n",
63
+ " while current_date <= end_date:\n",
64
+ " batch_end_date = current_date + timedelta(days=50)\n",
65
+ " if batch_end_date > end_date:\n",
66
+ " batch_end_date = end_date\n",
67
+ "\n",
68
+ " params = {\n",
69
+ " \"ticker\": ticker,\n",
70
+ " \"published_utc.gte\": current_date.strftime('%Y-%m-%d'),\n",
71
+ " \"published_utc.lte\": batch_end_date.strftime('%Y-%m-%d'),\n",
72
+ " \"limit\": 50,\n",
73
+ " \"sort\": \"published_utc\"\n",
74
+ " }\n",
75
+ "\n",
76
+ " try:\n",
77
+ " response = requests.get(base_url, headers=headers, params=params)\n",
78
+ " if response.status_code == 200:\n",
79
+ " data = response.json()\n",
80
+ " articles = data.get('results', [])\n",
81
+ " all_news.extend(articles)\n",
82
+ " print(f\"Fetched {len(articles)} articles from {current_date.strftime('%Y-%m-%d')} to {batch_end_date.strftime('%Y-%m-%d')}\")\n",
83
+ " current_date = batch_end_date + timedelta(days=1)\n",
84
+ " elif response.status_code == 429:\n",
85
+ " print(\"Rate limit reached. Waiting to retry...\")\n",
86
+ " time.sleep(60) # Wait for 60 seconds or as recommended by the API\n",
87
+ " continue # Retry the current request\n",
88
+ " else:\n",
89
+ " print(f\"Failed to fetch data: {response.status_code}, {response.text}\")\n",
90
+ " break\n",
91
+ " except Exception as e:\n",
92
+ " print(f\"An error occurred: {e}\")\n",
93
+ " break\n",
94
+ "\n",
95
+ " return all_news\n",
96
+ "\n",
97
+ "# Example usage\n",
98
+ "api_key = os.environ.get('newsp_api')\n",
99
+ "ticker = 'TSLA'\n",
100
+ "start_date = datetime(2022, 4, 1) # start date\n",
101
+ "end_date = datetime(2024, 4, 1)\n",
102
+ "news_articles = fetch_news(api_key, ticker, start_date, end_date)\n",
103
+ "print(f\"Total articles fetched: {len(news_articles)}\")\n"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": 6,
109
+ "metadata": {},
110
+ "outputs": [],
111
+ "source": [
112
+ "# Process the news articles\n",
113
+ "df = process_news_articles(news_articles)"
114
+ ]
115
+ },
116
+ {
117
+ "cell_type": "code",
118
+ "execution_count": 8,
119
+ "metadata": {},
120
+ "outputs": [],
121
+ "source": [
122
+ "df.to_csv('news_articles.csv', index=False)\n"
123
+ ]
124
+ }
125
+ ],
126
+ "metadata": {
127
+ "kernelspec": {
128
+ "display_name": "base",
129
+ "language": "python",
130
+ "name": "python3"
131
+ },
132
+ "language_info": {
133
+ "codemirror_mode": {
134
+ "name": "ipython",
135
+ "version": 3
136
+ },
137
+ "file_extension": ".py",
138
+ "mimetype": "text/x-python",
139
+ "name": "python",
140
+ "nbconvert_exporter": "python",
141
+ "pygments_lexer": "ipython3",
142
+ "version": "3.11.4"
143
+ },
144
+ "orig_nbformat": 4
145
+ },
146
+ "nbformat": 4,
147
+ "nbformat_minor": 2
148
+ }
historical_stock.ipynb ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from dotenv import load_dotenv\n",
10
+ "import os \n",
11
+ "from alpha_vantage.timeseries import TimeSeries\n",
12
+ "import pandas as pd\n",
13
+ "import hopsworks\n",
14
+ "import re \n",
15
+ "import modal \n"
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "code",
20
+ "execution_count": 2,
21
+ "metadata": {},
22
+ "outputs": [
23
+ {
24
+ "name": "stdout",
25
+ "output_type": "stream",
26
+ "text": [
27
+ " 1. open 2. high 3. low 4. close 5. volume\n",
28
+ "date \n",
29
+ "2024-04-30 186.98 190.95 182.8401 183.28 127031787.0\n",
30
+ "2024-04-29 188.42 198.87 184.5400 194.05 243869678.0\n",
31
+ "2024-04-26 168.85 172.12 166.3700 168.29 109815725.0\n",
32
+ "2024-04-25 158.96 170.88 158.3600 170.18 126427521.0\n",
33
+ "2024-04-24 162.84 167.97 157.5100 162.13 181178020.0\n"
34
+ ]
35
+ }
36
+ ],
37
+ "source": [
38
+ "load_dotenv()\n",
39
+ "\n",
40
+ "api_key = os.environ.get('stocks_api') # Replace this with your actual API key\n",
41
+ "ts = TimeSeries(key=api_key, output_format='pandas')\n",
42
+ "\n",
43
+ "# Fetch daily adjusted stock prices; adjust the symbol as needed\n",
44
+ "data, meta_data = ts.get_daily(symbol='TSLA', outputsize='full')\n",
45
+ "\n",
46
+ "print(data.head())"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "execution_count": null,
52
+ "metadata": {},
53
+ "outputs": [],
54
+ "source": [
55
+ "# Define your file path and name\n",
56
+ "file_path = '/Users/manos/Documents/BDS/MLops_mod/TSLA_stock_price.csv' # Customize the path and filename\n",
57
+ "\n",
58
+ "# Save the DataFrame to CSV\n",
59
+ "data.to_csv(file_path)\n",
60
+ "\n",
61
+ "print(f\"Data saved to {file_path}\")"
62
+ ]
63
+ }
64
+ ],
65
+ "metadata": {
66
+ "kernelspec": {
67
+ "display_name": "base",
68
+ "language": "python",
69
+ "name": "python3"
70
+ },
71
+ "language_info": {
72
+ "codemirror_mode": {
73
+ "name": "ipython",
74
+ "version": 3
75
+ },
76
+ "file_extension": ".py",
77
+ "mimetype": "text/x-python",
78
+ "name": "python",
79
+ "nbconvert_exporter": "python",
80
+ "pygments_lexer": "ipython3",
81
+ "version": "3.11.4"
82
+ },
83
+ "orig_nbformat": 4
84
+ },
85
+ "nbformat": 4,
86
+ "nbformat_minor": 2
87
+ }
news_articles.csv CHANGED
The diff for this file is too large to render. See raw diff
 
news_exp.ipynb DELETED
The diff for this file is too large to render. See raw diff
 
news_preprocessing.ipynb ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 4,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from dotenv import load_dotenv\n",
10
+ "from datetime import datetime, timedelta\n",
11
+ "import requests\n",
12
+ "import os\n",
13
+ "import time\n",
14
+ "import pandas as pd \n",
15
+ "from textblob import TextBlob"
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "code",
20
+ "execution_count": 5,
21
+ "metadata": {},
22
+ "outputs": [],
23
+ "source": [
24
+ "def process_news_articles(news_articles):\n",
25
+ " # Convert list of dictionaries to DataFrame\n",
26
+ " df = pd.DataFrame(news_articles)\n",
27
+ "\n",
28
+ " # Drop rows where the description is NaN\n",
29
+ " df = df.dropna(subset=['description'])\n",
30
+ "\n",
31
+ " # Fill missing 'amp_url' and 'keywords' with specific placeholders\n",
32
+ " df['amp_url'] = df['amp_url'].fillna('No URL provided')\n",
33
+ " df['keywords'] = df['keywords'].fillna('No keywords')\n",
34
+ "\n",
35
+ " # Sentiment analysis on descriptions\n",
36
+ " df['sentiment'] = df['description'].apply(lambda text: TextBlob(text).sentiment.polarity)\n",
37
+ "\n",
38
+ " # Convert 'published_utc' to datetime and extract date and time\n",
39
+ " df['published_utc'] = pd.to_datetime(df['published_utc'])\n",
40
+ " df['date'] = df['published_utc'].dt.date\n",
41
+ " df['time'] = df['published_utc'].dt.time\n",
42
+ "\n",
43
+ " # Drop unnecessary columns\n",
44
+ " df.drop(['published_utc'], axis=1, inplace=True)\n",
45
+ " # set date to index\n",
46
+ " df = df.set_index(\"date\")\n",
47
+ " df.index = pd.to_datetime(df.index)\n",
48
+ "\n",
49
+ " return df\n",
50
+ "\n"
51
+ ]
52
+ }
53
+ ],
54
+ "metadata": {
55
+ "kernelspec": {
56
+ "display_name": "base",
57
+ "language": "python",
58
+ "name": "python3"
59
+ },
60
+ "language_info": {
61
+ "codemirror_mode": {
62
+ "name": "ipython",
63
+ "version": 3
64
+ },
65
+ "file_extension": ".py",
66
+ "mimetype": "text/x-python",
67
+ "name": "python",
68
+ "nbconvert_exporter": "python",
69
+ "pygments_lexer": "ipython3",
70
+ "version": "3.11.4"
71
+ },
72
+ "orig_nbformat": 4
73
+ },
74
+ "nbformat": 4,
75
+ "nbformat_minor": 2
76
+ }
news_preprocessing.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # %%
2
+ from dotenv import load_dotenv
3
+ from datetime import datetime, timedelta
4
+ import requests
5
+ import os
6
+ import time
7
+ import pandas as pd
8
+ from textblob import TextBlob
9
+
10
+ # %%
11
def process_news_articles(news_articles):
    """Normalize raw news-API article dicts into a date-indexed DataFrame.

    Parameters
    ----------
    news_articles : list[dict]
        Article records as returned by the news API. Each record may
        contain 'description', 'amp_url', 'keywords' and 'published_utc'
        keys; records are not guaranteed to share the same keys.

    Returns
    -------
    pandas.DataFrame
        One row per article that has a non-null description, with a
        'sentiment' column (TextBlob polarity in [-1, 1]), a 'time'
        column, and a DatetimeIndex built from the publication date.
        An empty input (or input with no descriptions at all) yields
        an empty DataFrame instead of raising KeyError.
    """
    df = pd.DataFrame(news_articles)

    # Guard: an empty list (or a batch where no article carried a
    # 'description' key) produces a frame without that column, and the
    # original dropna(subset=...) would raise KeyError.
    if df.empty or 'description' not in df.columns:
        return pd.DataFrame()

    # Drop rows where the description is NaN — sentiment needs text.
    df = df.dropna(subset=['description'])

    # Fill missing optional fields with placeholders. The API omits
    # keys no article carried, so the column itself may be absent.
    for col, placeholder in (('amp_url', 'No URL provided'),
                             ('keywords', 'No keywords')):
        if col in df.columns:
            df[col] = df[col].fillna(placeholder)
        else:
            df[col] = placeholder

    # Sentiment analysis on descriptions (polarity in [-1, 1]).
    df['sentiment'] = df['description'].apply(
        lambda text: TextBlob(text).sentiment.polarity)

    # Convert 'published_utc' to datetime and split into date and time.
    df['published_utc'] = pd.to_datetime(df['published_utc'])
    df['date'] = df['published_utc'].dt.date
    df['time'] = df['published_utc'].dt.time

    # The raw timestamp is redundant once date/time are extracted.
    df.drop(['published_utc'], axis=1, inplace=True)

    # Use the article date as a DatetimeIndex for time-based joins
    # against the stock-price feature group.
    df = df.set_index("date")
    df.index = pd.to_datetime(df.index)

    return df
37
+
38
+
39
+
40
+
requirements.txt CHANGED
@@ -12,3 +12,7 @@ pandas==1.5.1
12
  #Pillow==10.2.0
13
  scikit-learn==1.4.0
14
  seaborn==0.13.2
 
 
 
 
 
12
  #Pillow==10.2.0
13
  scikit-learn==1.4.0
14
  seaborn==0.13.2
15
+ python-dotenv
16
+ requests
17
+ alpha_vantage
18
+ textblob