Updated_11
Changed files:
- LSTM_model.keras +0 -0
- historical_news.ipynb → Stocks news prediction/Notebooks/1_historical_news.ipynb +25 -47
- Stocks news prediction/Notebooks/2_historical_stock.ipynb +127 -0
- news_preprocessing.ipynb → Stocks news prediction/Notebooks/3_news_preprocessing.ipynb +5 -1
- stock_preprocessing.ipynb → Stocks news prediction/Notebooks/4_stock_preprocessing.ipynb +31 -348
- Stocks news prediction/Notebooks/5_feature_pipeline.ipynb +493 -0
- feature_view.ipynb → Stocks news prediction/Notebooks/6_feature_view.ipynb +38 -126
- Stocks news prediction/Notebooks/7_training_pipeline.ipynb +839 -0
- Stocks news prediction/Notebooks/8_inference_pipeline.ipynb +315 -0
- Stocks news prediction/SML/__pycache__/feature_pipeline.cpython-311.pyc +0 -0
- Stocks news prediction/SML/__pycache__/news_preprocessing.cpython-311.pyc +0 -0
- feature_pipeline.py → Stocks news prediction/SML/feature_pipeline.py +20 -61
- feature_view.py → Stocks news prediction/SML/feature_view.py +11 -37
- Stocks news prediction/SML/historical_news.py +120 -0
- Stocks news prediction/SML/historical_stock.py +51 -0
- news_preprocessing.py → Stocks news prediction/SML/news_preprocessing.py +7 -3
- stock_preprocessing.py → Stocks news prediction/SML/stock_preprocessing.py +17 -21
- Stocks news prediction/SML/training_pipeline.py +256 -0
- TSLA_stock_price.csv → Stocks news prediction/TSLA_stock_price.csv +0 -0
- news_articles.csv → Stocks news prediction/news_articles.csv +0 -0
- news_articles_ema.csv → Stocks news prediction/news_articles_ema.csv +0 -0
- feature_engineering.ipynb +0 -73
- feature_pipeline.ipynb +0 -775
- feature_view_freddie.py +0 -95
- historical_stock.ipynb +0 -257
- requirements.txt +1 -0
- training_pipeline.ipynb +0 -167
LSTM_model.keras
DELETED
Binary file (291 kB)
historical_news.ipynb → Stocks news prediction/Notebooks/1_historical_news.ipynb
RENAMED

@@ -1,20 +1,5 @@
 {
  "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from dotenv import load_dotenv\n",
-    "from datetime import datetime, timedelta\n",
-    "import requests\n",
-    "import os\n",
-    "import time\n",
-    "import pandas as pd \n",
-    "from news_preprocessing import *"
-   ]
-  },
  {
   "cell_type": "code",
   "execution_count": 2,
@@ -32,6 +17,14 @@
    }
   ],
   "source": [
+   "#Importing necessary libraries\n",
+   "from dotenv import load_dotenv\n",
+   "from datetime import datetime, timedelta\n",
+   "import requests\n",
+   "import os\n",
+   "import time\n",
+   "import pandas as pd \n",
+   "from SML import news_preprocessing #Importing everything from 'news_preprocessing'\n",
    "load_dotenv()"
   ]
  },
@@ -44,32 +37,23 @@
    "name": "stdout",
    "output_type": "stream",
    "text": [
-    "Fetched 50 articles from 2022-05-
-    "Fetched 50 articles from 2022-06-
-    "Fetched 50 articles from 2022-08-
-    "Fetched 50 articles from 2022-10-
-    "Fetched 50 articles from 2022-11-
+    "Fetched 50 articles from 2022-05-07 to 2022-06-26\n",
+    "Fetched 50 articles from 2022-06-27 to 2022-08-16\n",
+    "Fetched 50 articles from 2022-08-17 to 2022-10-06\n",
+    "Fetched 50 articles from 2022-10-07 to 2022-11-26\n",
+    "Fetched 50 articles from 2022-11-27 to 2023-01-16\n",
     "Rate limit reached. Waiting to retry...\n",
-    "Fetched 50 articles from 2023-01-16 to 2023-03-07\n",
-    "Fetched 50 articles from 2023-03-08 to 2023-04-27\n",
-    "Fetched 50 articles from 2023-04-28 to 2023-06-17\n",
-    "Fetched 50 articles from 2023-06-18 to 2023-08-07\n",
-    "Fetched 50 articles from 2023-08-08 to 2023-09-27\n",
-    "Fetched 50 articles from 2023-
-    "Fetched 50 articles from 2023-
-    "Fetched 50 articles from
-    "Fetched 50 articles from
-    "Fetched 50 articles from
-    "
+    "Fetched 50 articles from 2023-01-17 to 2023-03-08\n",
+    "Fetched 50 articles from 2023-03-09 to 2023-04-28\n",
+    "Fetched 50 articles from 2023-04-29 to 2023-06-18\n",
+    "Fetched 50 articles from 2023-06-19 to 2023-08-08\n",
+    "Fetched 50 articles from 2023-08-09 to 2023-09-28\n",
+    "Rate limit reached. Waiting to retry...\n"
    ]
   }
  ],
  "source": [
-   "
-   "import requests\n",
-   "from datetime import datetime, timedelta\n",
-   "import pandas as pd\n",
+   "#Defining a function for fetching news\n",
    "\n",
    "def fetch_news(api_key, ticker, start_date, end_date):\n",
    "    base_url = os.environ.get(\"endpointnewsp\")\n",
@@ -97,10 +81,10 @@
    "    data = response.json()\n",
    "    articles = data.get('results', [])\n",
    "    \n",
-   "    #
+   "    # Creating a DataFrame from articles\n",
    "    df = pd.DataFrame(articles)\n",
    "    \n",
-   "    #
+   "    # Adding primary_key column if ticker is found\n",
    "    df['ticker'] = df['tickers'].apply(lambda x: ticker if ticker in x else None)\n",
    "    \n",
    "    all_news.append(df) # Append DataFrame to the list\n",
@@ -119,7 +103,7 @@
    "\n",
    "    return pd.concat(all_news, ignore_index=True)\n",
    "\n",
-   "#
+   "#Usage\n",
    "api_key = os.environ.get('newsp_api')\n",
    "ticker = 'TSLA'\n",
    "end_date = datetime.now() - timedelta(days=1) # Yesterday's date\n",
@@ -263,7 +247,8 @@
   "metadata": {},
   "outputs": [],
   "source": [
-   "
+   "#Putting the news articles into a csv\n",
+   "df.to_csv('news_articles.csv', index=False)"
   ]
  },
  {
@@ -638,13 +623,6 @@
   "source": [
    "df_processed.head()"
   ]
- },
- {
-  "cell_type": "code",
-  "execution_count": null,
-  "metadata": {},
-  "outputs": [],
-  "source": []
  }
 ],
 "metadata": {
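For reference, below is a minimal, self-contained sketch of the windowed fetch-and-retry loop the rewritten fetch_news cell implements. It assumes a Polygon-style news endpoint URL stored in the endpointnewsp environment variable and Polygon-style query parameters (published_utc.gte/.lte, limit, apiKey); the 50-day window is an assumption inferred from the date ranges in the logged output, not confirmed by the diff.

    import os
    import time
    from datetime import datetime, timedelta

    import pandas as pd
    import requests

    def fetch_news(api_key, ticker, start_date, end_date):
        base_url = os.environ.get("endpointnewsp")  # assumed Polygon-style news endpoint
        all_news = []
        current = start_date
        while current < end_date:
            window_end = min(current + timedelta(days=50), end_date)  # assumed window size
            params = {
                "ticker": ticker,
                "published_utc.gte": current.strftime("%Y-%m-%d"),
                "published_utc.lte": window_end.strftime("%Y-%m-%d"),
                "limit": 50,
                "apiKey": api_key,
            }
            response = requests.get(base_url, params=params)
            if response.status_code == 429:
                # Rate limit reached: wait, then retry the same window
                print("Rate limit reached. Waiting to retry...")
                time.sleep(60)
                continue
            articles = response.json().get("results", [])
            if articles:
                df = pd.DataFrame(articles)
                # Keep a plain ticker column for use as a primary key downstream
                df["ticker"] = df["tickers"].apply(lambda x: ticker if ticker in x else None)
                all_news.append(df)
            print(f"Fetched {len(articles)} articles from {params['published_utc.gte']} "
                  f"to {params['published_utc.lte']}")
            current = window_end + timedelta(days=1)
        return pd.concat(all_news, ignore_index=True)

    api_key = os.environ.get("newsp_api")
    end_date = datetime.now() - timedelta(days=1)   # yesterday, as in the notebook
    start_date = end_date - timedelta(days=730)     # roughly two years back
    news_df = fetch_news(api_key, "TSLA", start_date, end_date)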
Stocks news prediction/Notebooks/2_historical_stock.ipynb
ADDED

@@ -0,0 +1,127 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "#Importing necessary libraries\n",
+    "from dotenv import load_dotenv\n",
+    "import os \n",
+    "from alpha_vantage.timeseries import TimeSeries\n",
+    "import pandas as pd\n",
+    "import hopsworks\n",
+    "import re \n",
+    "import modal \n",
+    "#preprocessing\n",
+    "import requests\n",
+    "import pandas as pd\n",
+    "import json\n",
+    "#import pandas_market_calendars as mcal\n",
+    "import datetime\n",
+    "import numpy as np\n",
+    "from datetime import timedelta\n",
+    "load_dotenv() #Making the .env file work"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "            1. open  2. high    3. low  4. close    5. volume ticker\n",
+      "date\n",
+      "2024-05-03   182.10   184.78  178.4200    181.19   75491539.0   TSLA\n",
+      "2024-05-02   182.86   184.60  176.0200    180.01   89148041.0   TSLA\n",
+      "2024-05-01   182.00   185.86  179.0100    179.99   92829719.0   TSLA\n",
+      "2024-04-30   186.98   190.95  182.8401    183.28  127031787.0   TSLA\n",
+      "2024-04-29   188.42   198.87  184.5400    194.05  243869678.0   TSLA\n"
+     ]
+    }
+   ],
+   "source": [
+    "#Setting up the API key to be able to fetch stocks from Alpha Vantage\n",
+    "\n",
+    "api_key = os.environ.get('stocks_api') \n",
+    "ts = TimeSeries(key=api_key, output_format='pandas')\n",
+    "\n",
+    "#Defining a function to fetch stocks\n",
+    "\n",
+    "def fetch_stock_prices(symbol):\n",
+    "    # Fetch daily adjusted stock prices; adjust the symbol as needed\n",
+    "    data, meta_data = ts.get_daily(symbol=symbol, outputsize='full')\n",
+    "    \n",
+    "    # Add a new column named 'ticker' and fill it with the ticker name\n",
+    "    data['ticker'] = symbol\n",
+    "    \n",
+    "    return data\n",
+    "\n",
+    "#Usage\n",
+    "symbol = 'TSLA'\n",
+    "stock_data = fetch_stock_prices(symbol)\n",
+    "print(stock_data.head())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Data saved to TSLA_stock_price.csv\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Defining the file path and name\n",
+    "file_path = 'TSLA_stock_price.csv' \n",
+    "\n",
+    "# Saving the DataFrame to CSV\n",
+    "stock_data.to_csv(file_path)\n",
+    "\n",
+    "print(f\"Data saved to {file_path}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
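The Alpha Vantage call above is the whole fetch path for stock prices. Condensed out of the notebook JSON, the working core looks like this (a sketch using only the calls shown in the diff; with output_format='pandas', get_daily returns a (DataFrame, metadata) pair indexed by date):

    import os
    from dotenv import load_dotenv
    from alpha_vantage.timeseries import TimeSeries

    load_dotenv()
    ts = TimeSeries(key=os.environ.get("stocks_api"), output_format="pandas")

    def fetch_stock_prices(symbol: str):
        # 'full' returns the complete daily history rather than the last 100 points
        data, meta_data = ts.get_daily(symbol=symbol, outputsize="full")
        data["ticker"] = symbol  # tag rows so the ticker can serve as a key later
        return data

    stock_data = fetch_stock_prices("TSLA")
    stock_data.to_csv("TSLA_stock_price.csv")  # the date index is written as the first column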
news_preprocessing.ipynb → Stocks news prediction/Notebooks/3_news_preprocessing.ipynb
RENAMED

@@ -6,6 +6,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
+   "#Importing necessary libraries\n",
    "from dotenv import load_dotenv\n",
    "from datetime import datetime, timedelta\n",
    "import requests\n",
@@ -21,6 +22,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
+   "#Defining a function to process news articles\n",
    "def process_news_articles(news_articles):\n",
    "    # Convert list of dictionaries to DataFrame\n",
    "    df = pd.DataFrame(news_articles)\n",
@@ -40,7 +42,7 @@
    "    df['date'] = df['published_utc'].dt.date\n",
    "    df['time'] = df['published_utc'].dt.time\n",
    "\n",
-   "    #
+   "    # Dropping unnecessary columns\n",
    "    df.drop(['published_utc'], axis=1, inplace=True)\n",
    "    # set date to index\n",
    "    df = df.set_index(\"date\")\n",
@@ -57,6 +59,8 @@
   "metadata": {},
   "outputs": [],
   "source": [
+   "#Defining a function for the exponential moving average\n",
+   "\n",
    "def exponential_moving_average(df, window):\n",
    "    # Calculate EMA on the 'sentiment' column\n",
    "    df[f'exp_mean_{window}_days'] = df['sentiment'].ewm(span=window, adjust=False).mean()\n",
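The EMA cell above is worth spelling out: with adjust=False, pandas computes the recursive form y[t] = a*x[t] + (1-a)*y[t-1] with a = 2/(window+1), so each day's smoothed sentiment blends the new score with the previous smoothed value. A runnable sketch (the demo values are illustrative only):

    import pandas as pd

    def exponential_moving_average(df: pd.DataFrame, window: int) -> pd.DataFrame:
        # Smoothing factor a = 2 / (window + 1); adjust=False gives the recursive EMA
        df[f"exp_mean_{window}_days"] = df["sentiment"].ewm(span=window, adjust=False).mean()
        return df

    demo = pd.DataFrame({"sentiment": [0.2, -0.1, 0.4, 0.0]})
    print(exponential_moving_average(demo, window=7))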
stock_preprocessing.ipynb → Stocks news prediction/Notebooks/4_stock_preprocessing.ipynb
RENAMED

@@ -2,10 +2,22 @@
  "cells": [
  {
   "cell_type": "code",
-  "execution_count":
+  "execution_count": 2,
   "metadata": {},
-  "outputs": [
+  "outputs": [
+   {
+    "data": {
+     "text/plain": [
+      "True"
+     ]
+    },
+    "execution_count": 2,
+    "metadata": {},
+    "output_type": "execute_result"
+   }
+  ],
   "source": [
+   "#Importing necessary libraries\n",
    "from dotenv import load_dotenv\n",
    "import os \n",
    "from alpha_vantage.timeseries import TimeSeries\n",
@@ -20,7 +32,8 @@
    "import pandas_market_calendars as mcal\n",
    "import datetime\n",
    "import numpy as np\n",
-   "from datetime import datetime, timedelta\n"
+   "from datetime import datetime, timedelta\n",
+   "load_dotenv()"
   ]
  },
  {
@@ -43,8 +56,7 @@
   }
  ],
  "source": [
-  "
-  "\n",
+  "#Connecting to Alpha vantage using API key\n",
   "api_key = os.environ.get('stocks_api') # Replace this with your actual API key\n",
   "ts = TimeSeries(key=api_key, output_format='pandas')\n",
   "\n",
@@ -54,168 +66,6 @@
   "print(data.head())"
  ]
 },
- {
-  "cell_type": "code",
-  "execution_count": 3,
-  "metadata": {},
-  "outputs": [
-   {
-    "data": {
-     "text/html": [ ... rendered HTML preview of the data DataFrame (3485 rows × 5 columns) ... ],
-     "text/plain": [ ... "[3485 rows x 5 columns]" ... ]
-    },
-    "execution_count": 3,
-    "metadata": {},
-    "output_type": "execute_result"
-   }
-  ],
-  "source": [
-   "data"
-  ]
- },
 {
  "cell_type": "code",
  "execution_count": 4,
@@ -241,6 +91,7 @@
   }
  ],
  "source": [
+  "#Looking at data info\n",
   "data.info()"
  ]
 },
@@ -265,6 +116,7 @@
   }
  ],
  "source": [
+  "#Looking at the meta data\n",
   "meta_data"
  ]
 },
@@ -293,6 +145,7 @@
  "metadata": {},
  "outputs": [],
  "source": [
+  "#Defining a function to find the next business day\n",
   "def next_business_day(today):\n",
   "    \n",
   "    # Real tomorrow\n",
@@ -320,6 +173,7 @@
  "metadata": {},
  "outputs": [],
  "source": [
+  "#Defining a function to extract business day\n",
   "def extract_business_day(start_date,end_date):\n",
   "    \"\"\"\n",
   "    Given a start_date and end_date.\n",
@@ -331,27 +185,27 @@
   "    e.g is_open = [1,0,...,1] means that start_date = open, day after start_date = closed, and end_date = open\n",
   "    \"\"\"\n",
   "    \n",
-  "    #
+  "    # Saving for later\n",
   "    end_date_save = end_date\n",
   "    \n",
-  "    #
+  "    # Getting the NYSE calendar\n",
   "    cal = mcal.get_calendar('NYSE')\n",
   "\n",
-  "    #
+  "    # Getting the NYSE calendar's open and close times for the specified period\n",
   "    schedule = cal.schedule(start_date=start_date, end_date=end_date)\n",
   "    \n",
   "    # Only need a list of dates when it's open (not open and close times)\n",
   "    isBusinessDay = np.array(schedule.market_open.dt.strftime('%Y-%m-%d')) \n",
   "    \n",
-  "    #
+  "    # Going over all days: \n",
   "    delta = datetime.timedelta(days=1)\n",
   "    start_date = datetime.datetime.strptime(start_date,\"%Y-%m-%d\") #datetime.date(2015, 7, 16)\n",
   "    end_date = datetime.datetime.strptime(end_date,\"%Y-%m-%d\") #datetime.date(2023, 1, 4)\n",
   "    \n",
-  "    #
+  "    # Extracting days from the timedelta object\n",
   "    num_days = (end_date - start_date).days + 1\n",
   "    \n",
-  "    #
+  "    # Creating a boolean array for days being open (1) and closed (0) \n",
   "    is_open = np.zeros(num_days)\n",
   "    \n",
   "    # iterate over range of dates\n",
@@ -386,6 +240,7 @@
  "metadata": {},
  "outputs": [],
  "source": [
+  "#Defining a function to clean the column names\n",
   "def clean_column_name(name):\n",
   "    # Remove all non-letter characters\n",
   "    cleaned_name = re.sub(r'[^a-zA-Z]', '', name)\n",
@@ -617,178 +472,13 @@
   "data.head()"
  ]
 },
- {
-  "cell_type": "code",
-  "execution_count": 13,
-  "metadata": {},
-  "outputs": [
-   {
-    "data": {
-     "text/html": [ ... rendered HTML preview of the data DataFrame (3485 rows × 6 columns) ... ],
-     "text/plain": [ ... "[3485 rows x 6 columns]" ... ]
-    },
-    "execution_count": 13,
-    "metadata": {},
-    "output_type": "execute_result"
-   }
-  ],
-  "source": [
-   "data"
-  ]
- },
 {
  "cell_type": "code",
  "execution_count": 42,
  "metadata": {},
  "outputs": [],
  "source": [
-  "# Define the date range
+  "# Define the date range we're interested in\n",
   "yesterday =datetime.now()-timedelta(days=1)\n",
   "two_years_back = yesterday - timedelta(days=684)"
  ]
@@ -799,7 +489,7 @@
  "metadata": {},
  "outputs": [],
  "source": [
-  "#
+  "# Filtering the DataFrame to this range\n",
   "filtered_df = data[(data['date'] >= two_years_back) & (data['date'] <= yesterday)]"
  ]
 },
@@ -943,13 +633,6 @@
  "source": [
   "filtered_df.shape"
  ]
- },
- {
-  "cell_type": "code",
-  "execution_count": null,
-  "metadata": {},
-  "outputs": [],
-  "source": []
  }
 ],
 "metadata": {
@@ -968,7 +651,7 @@
  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
- "version": "3.11.
+ "version": "3.11.9"
 },
 "orig_nbformat": 4
},
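The extract_business_day function above builds a per-day open/closed mask from the NYSE calendar. A compact, runnable sketch of the same idea, using only the pandas_market_calendars calls the diff shows (the set-based lookup is a small simplification of the notebook's loop):

    import datetime
    import numpy as np
    import pandas_market_calendars as mcal

    def extract_business_day(start_date: str, end_date: str) -> np.ndarray:
        # schedule() returns one row per NYSE trading day in [start_date, end_date]
        cal = mcal.get_calendar("NYSE")
        schedule = cal.schedule(start_date=start_date, end_date=end_date)
        open_days = set(schedule.market_open.dt.strftime("%Y-%m-%d"))

        start = datetime.datetime.strptime(start_date, "%Y-%m-%d")
        end = datetime.datetime.strptime(end_date, "%Y-%m-%d")
        num_days = (end - start).days + 1

        # 1 where the exchange was open, 0 where it was closed
        is_open = np.zeros(num_days)
        for i in range(num_days):
            day = (start + datetime.timedelta(days=i)).strftime("%Y-%m-%d")
            if day in open_days:
                is_open[i] = 1
        return is_open

    print(extract_business_day("2024-04-26", "2024-05-03"))  # weekend entries come back as 0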
Stocks news prediction/Notebooks/5_feature_pipeline.ipynb
ADDED

@@ -0,0 +1,493 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Connection closed.\n",
+      "Connected. Call `.close()` to terminate connection gracefully.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/693399\n",
+      "Connected. Call `.close()` to terminate connection gracefully.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Import necessary libraries\n",
+    "import pandas as pd # For data manipulation using DataFrames\n",
+    "import numpy as np # For numerical operations\n",
+    "import matplotlib.pyplot as plt # For data visualization\n",
+    "import os # For operating system-related tasks\n",
+    "import joblib # For saving and loading models\n",
+    "import hopsworks # For getting access to hopsworks\n",
+    "import re\n",
+    "\n",
+    "# Import specific modules from scikit-learn\n",
+    "from sklearn.preprocessing import StandardScaler, OneHotEncoder # For data preprocessing\n",
+    "from sklearn.metrics import accuracy_score # For evaluating model accuracy\n",
+    "\n",
+    "from dotenv import load_dotenv\n",
+    "import os\n",
+    "load_dotenv()\n",
+    "\n",
+    "#Connecting to hopsworks\n",
+    "api_key = os.environ.get('hopsworks_api')\n",
+    "project = hopsworks.login(api_key_value=api_key)\n",
+    "fs = project.get_feature_store()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "         date  1. open  2. high    3. low  4. close    5. volume ticker\n",
+      "0  2024-05-03   182.10   184.78  178.4200    181.19   75491539.0   TSLA\n",
+      "1  2024-05-02   182.86   184.60  176.0200    180.01   89148041.0   TSLA\n",
+      "2  2024-05-01   182.00   185.86  179.0100    179.99   92829719.0   TSLA\n",
+      "3  2024-04-30   186.98   190.95  182.8401    183.28  127031787.0   TSLA\n",
+      "4  2024-04-29   188.42   198.87  184.5400    194.05  243869678.0   TSLA\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Load and display the data from CSV to confirm\n",
+    "tsla_df = pd.read_csv('TSLA_stock_price.csv')\n",
+    "print(tsla_df.head()) "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Defining a function to clean the column names\n",
+    "def clean_column_name(name):\n",
+    "    # Remove all non-letter characters\n",
+    "    cleaned_name = re.sub(r'[^a-zA-Z]', '', name)\n",
+    "    return cleaned_name"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [ ... rendered HTML preview of tsla_df (3486 rows × 7 columns) ... ],
+      "text/plain": [ ... "[3486 rows x 7 columns]" ... ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tsla_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Index(['date', 'open', 'high', 'low', 'close', 'volume', 'ticker'], dtype='object')\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Cleaning up column names for 'tsla_df'\n",
+    "tsla_df.columns = [clean_column_name(col) for col in tsla_df.columns]\n",
+    "print(tsla_df.columns)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Converting the \"date\" column to timestamp\n",
+    "tsla_df['date'] = pd.to_datetime(tsla_df['date'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Defining the stocks feature group\n",
+    "tesla_fg = fs.get_or_create_feature_group(\n",
+    "    name=\"tesla_stock\",\n",
+    "    description=\"Tesla stock dataset from alpha vantage\",\n",
+    "    version=1,\n",
+    "    primary_key=[\"ticker\"],\n",
+    "    event_time=['date'],\n",
+    "    online_enabled=False,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Feature Group created successfully, explore it at \n",
+      "https://c.app.hopsworks.ai:443/p/693399/fs/689222/fg/786781\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b3248b9d522a467db9ce202ef5815fe9",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Uploading Dataframe: 0.00% |          | Rows 0/3486 | Elapsed Time: 00:00 | Remaining Time: ?"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Launching job: tesla_stock_1_offline_fg_materialization\n",
+      "Job started successfully, you can follow the progress at \n",
+      "https://c.app.hopsworks.ai/p/693399/jobs/named/tesla_stock_1_offline_fg_materialization/executions\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "(<hsfs.core.job.Job at 0x19cffe27490>, None)"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "#Inserting the stock data into the stocks feature group\n",
+    "tesla_fg.insert(tsla_df, write_options={\"wait_for_job\" : False})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Collecting news df\n",
+    "news_df = pd.read_csv('news_articles_ema.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Dropping exp mean 7 days\n",
+    "news_df_updated = news_df.drop(columns=['exp_mean_7_days'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Updating date to datetime\n",
+    "news_df_updated['date'] = pd.to_datetime(news_df_updated['date'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2024-05-06 13:43:12,343 WARNING: DeprecationWarning: Providing event_time as a single-element list is deprecated and will be dropped in future versions. Provide the feature_name string instead.\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "#Defining the news feature group\n",
+    "news_sentiment_fg = fs.get_or_create_feature_group(\n",
+    "    name='news_sentiment_updated',\n",
+    "    description='News sentiment from Polygon',\n",
+    "    version=1,\n",
+    "    primary_key=['ticker'],\n",
+    "    event_time=['date'],\n",
+    "    online_enabled=False,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Feature Group created successfully, explore it at \n",
+      "https://c.app.hopsworks.ai:443/p/693399/fs/689222/fg/787796\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "524bb5481c34441ba708a4c14edac44b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Uploading Dataframe: 0.00% |          | Rows 0/66 | Elapsed Time: 00:00 | Remaining Time: ?"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Launching job: news_sentiment_updated_1_offline_fg_materialization\n",
+      "Job started successfully, you can follow the progress at \n",
+      "https://c.app.hopsworks.ai/p/693399/jobs/named/news_sentiment_updated_1_offline_fg_materialization/executions\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "(<hsfs.core.job.Job at 0x19c811c2e90>, None)"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "#Inserting the news data into the news feature group\n",
+    "news_sentiment_fg.insert(news_df_updated)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
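Stripped of notebook JSON, the write path this pipeline follows is short. A sketch using only the Hopsworks calls shown above; note the run logged a DeprecationWarning for passing event_time as a single-element list, so the sketch passes the feature name as a plain string instead:

    import os
    import pandas as pd
    import hopsworks
    from dotenv import load_dotenv

    load_dotenv()
    project = hopsworks.login(api_key_value=os.environ.get("hopsworks_api"))
    fs = project.get_feature_store()

    tsla_df = pd.read_csv("TSLA_stock_price.csv")
    tsla_df["date"] = pd.to_datetime(tsla_df["date"])

    tesla_fg = fs.get_or_create_feature_group(
        name="tesla_stock",
        description="Tesla stock dataset from alpha vantage",
        version=1,
        primary_key=["ticker"],
        event_time="date",  # string form avoids the deprecation warning seen in the log
        online_enabled=False,
    )
    # Returns immediately; the offline materialization job runs on Hopsworks
    tesla_fg.insert(tsla_df, write_options={"wait_for_job": False})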
feature_view.ipynb β Stocks news prediction/Notebooks/6_feature_view.ipynb
RENAMED
@@ -2,55 +2,31 @@
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
-
"execution_count":
|
6 |
-
"metadata": {},
|
7 |
-
"outputs": [],
|
8 |
-
"source": [
|
9 |
-
"# Import necessary libraries\n",
|
10 |
-
"import pandas as pd # For data manipulation using DataFrames\n",
|
11 |
-
"import numpy as np # For numerical operations\n",
|
12 |
-
"import matplotlib.pyplot as plt # For data visualization\n",
|
13 |
-
"import os # For operating system-related tasks\n",
|
14 |
-
"import joblib # For saving and loading models\n",
|
15 |
-
"import hopsworks # For getting access to hopsworks\n",
|
16 |
-
"\n",
|
17 |
-
"\n",
|
18 |
-
"\n",
|
19 |
-
"# Import specific modules from scikit-learn\n",
|
20 |
-
"from sklearn.preprocessing import StandardScaler, OneHotEncoder # For data preprocessing\n",
|
21 |
-
"from sklearn.metrics import accuracy_score # For evaluating model accuracy"
|
22 |
-
]
|
23 |
-
},
|
24 |
-
{
|
25 |
-
"cell_type": "code",
|
26 |
-
"execution_count": 2,
|
27 |
"metadata": {},
|
28 |
"outputs": [
|
29 |
{
|
30 |
"name": "stdout",
|
31 |
"output_type": "stream",
|
32 |
"text": [
|
|
|
|
|
|
|
|
|
|
|
33 |
" date 1. open 2. high 3. low 4. close 5. volume ticker\n",
|
34 |
"0 2024-05-03 182.10 184.78 178.4200 181.19 75491539.0 TSLA\n",
|
35 |
"1 2024-05-02 182.86 184.60 176.0200 180.01 89148041.0 TSLA\n",
|
36 |
"2 2024-05-01 182.00 185.86 179.0100 179.99 92829719.0 TSLA\n",
|
37 |
"3 2024-04-30 186.98 190.95 182.8401 183.28 127031787.0 TSLA\n",
|
38 |
"4 2024-04-29 188.42 198.87 184.5400 194.05 243869678.0 TSLA\n",
|
39 |
-
"
|
40 |
-
"\n",
|
41 |
-
"Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/693399\n",
|
42 |
-
"Connected. Call `.close()` to terminate connection gracefully.\n",
|
43 |
-
"Index(['date', 'open', 'high', 'low', 'close', 'volume', 'ticker'], dtype='object')\n",
|
44 |
-
"2024-05-06 13:44:59,122 WARNING: DeprecationWarning: Providing event_time as a single-element list is deprecated and will be dropped in future versions. Provide the feature_name string instead.\n",
|
45 |
-
"\n",
|
46 |
-
"Feature Group created successfully, explore it at \n",
|
47 |
-
"https://c.app.hopsworks.ai:443/p/693399/fs/689222/fg/787797\n"
|
48 |
]
|
49 |
},
|
50 |
{
|
51 |
"data": {
|
52 |
"application/vnd.jupyter.widget-view+json": {
|
53 |
-
"model_id": "
|
54 |
"version_major": 2,
|
55 |
"version_minor": 0
|
56 |
},
|
@@ -65,19 +41,15 @@
|
|
65 |
"name": "stdout",
|
66 |
"output_type": "stream",
|
67 |
"text": [
|
68 |
-
"Launching job:
|
69 |
"Job started successfully, you can follow the progress at \n",
|
70 |
-
"https://c.app.hopsworks.ai/p/693399/jobs/named/
|
71 |
-
"2024-05-06 13:45:08,516 WARNING: DeprecationWarning: Providing event_time as a single-element list is deprecated and will be dropped in future versions. Provide the feature_name string instead.\n",
|
72 |
-
"\n",
|
73 |
-
"Feature Group created successfully, explore it at \n",
|
74 |
-
"https://c.app.hopsworks.ai:443/p/693399/fs/689222/fg/785786\n"
|
75 |
]
|
76 |
},
|
77 |
{
|
78 |
"data": {
|
79 |
"application/vnd.jupyter.widget-view+json": {
|
80 |
-
"model_id": "
|
81 |
"version_major": 2,
|
82 |
"version_minor": 0
|
83 |
},
|
@@ -92,57 +64,11 @@
|
|
92 |
"name": "stdout",
|
93 |
"output_type": "stream",
|
94 |
"text": [
|
95 |
-
"Launching job:
|
96 |
"Job started successfully, you can follow the progress at \n",
|
97 |
-
"https://c.app.hopsworks.ai/p/693399/jobs/named/
|
98 |
-
]
|
99 |
-
}
|
100 |
-
],
|
101 |
-
"source": [
|
102 |
-
"from feature_pipeline import tesla_fg\n",
|
103 |
-
"from feature_pipeline import news_sentiment_fg"
|
104 |
-
]
|
105 |
-
},
|
106 |
-
{
|
107 |
-
"cell_type": "code",
|
108 |
-
"execution_count": 3,
|
109 |
-
"metadata": {},
|
110 |
-
"outputs": [
|
111 |
-
{
|
112 |
-
"data": {
|
113 |
-
"text/plain": [
|
114 |
-
"True"
|
115 |
-
]
|
116 |
-
},
|
117 |
-
"execution_count": 3,
|
118 |
-
"metadata": {},
|
119 |
-
"output_type": "execute_result"
|
120 |
-
}
|
121 |
-
],
|
122 |
-
"source": [
|
123 |
-
"from dotenv import load_dotenv\n",
|
124 |
-
"import os\n",
|
125 |
-
"\n",
|
126 |
-
"load_dotenv()"
|
127 |
-
]
|
128 |
-
},
|
129 |
-
{
|
130 |
-
"cell_type": "code",
|
131 |
-
"execution_count": 4,
|
132 |
-
"metadata": {},
|
133 |
-
"outputs": [
|
134 |
-
{
|
135 |
-
"name": "stdout",
|
136 |
-
"output_type": "stream",
|
137 |
-
"text": [
|
138 |
"Connection closed.\n",
|
139 |
-
"Connected. Call `.close()` to terminate connection gracefully.\n"
|
140 |
-
]
|
141 |
-
},
|
142 |
-
{
|
143 |
-
"name": "stdout",
|
144 |
-
"output_type": "stream",
|
145 |
-
"text": [
|
146 |
"\n",
|
147 |
"Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/693399\n",
|
148 |
"Connected. Call `.close()` to terminate connection gracefully.\n"
|
@@ -150,6 +76,23 @@
|
|
150 |
}
|
151 |
],
|
152 |
"source": [
|
153 |
"api_key = os.environ.get('hopsworks_api')\n",
|
154 |
"project = hopsworks.login(api_key_value=api_key)\n",
|
155 |
"fs = project.get_feature_store()"
|
@@ -161,17 +104,19 @@
|
|
161 |
"metadata": {},
|
162 |
"outputs": [],
|
163 |
"source": [
|
164 |
"def create_stocks_feature_view(fs, version):\n",
|
165 |
"\n",
|
166 |
" # Loading in the feature groups\n",
|
167 |
" tesla_fg = fs.get_feature_group('tesla_stock', version=1)\n",
|
168 |
" news_sentiment_fg = fs.get_feature_group('news_sentiment_updated', version=1)\n",
|
169 |
"\n",
|
170 |
-
" #
|
171 |
" ds_query = tesla_fg.select(['date', 'open', 'ticker'])\\\n",
|
172 |
" .join(news_sentiment_fg.select(['sentiment']))\n",
|
173 |
"\n",
|
174 |
-
" #
|
175 |
" feature_view = fs.create_feature_view(\n",
|
176 |
" name='tesla_stocks_fv',\n",
|
177 |
" query=ds_query,\n",
|
@@ -196,6 +141,7 @@
|
|
196 |
}
|
197 |
],
|
198 |
"source": [
|
|
|
199 |
"try:\n",
|
200 |
" feature_view = fs.get_feature_view(\"tesla_stocks_fv\", version=1)\n",
|
201 |
" tesla_fg = fs.get_feature_group('tesla_stock', version=1)\n",
|
@@ -209,6 +155,7 @@
|
|
209 |
"metadata": {},
|
210 |
"outputs": [],
|
211 |
"source": [
|
|
|
212 |
"def fix_data_from_feature_view(df,start_date,end_date):\n",
|
213 |
" df = df.sort_values(\"date\")\n",
|
214 |
" df = df.reset_index()\n",
|
@@ -230,41 +177,6 @@
|
|
230 |
" \n",
|
231 |
" return filtered_df"
|
232 |
]
|
233 |
-
},
|
234 |
-
{
|
235 |
-
"cell_type": "code",
|
236 |
-
"execution_count": 7,
|
237 |
-
"metadata": {},
|
238 |
-
"outputs": [],
|
239 |
-
"source": [
|
240 |
-
"#def create_stocks_feature_view(fs, version):\n",
|
241 |
-
"\n",
|
242 |
-
" #Loading in the feature groups\n",
|
243 |
-
"# tesla_fg = fs.get_feature_group('tesla_stock', version = 3)\n",
|
244 |
-
"# news_sentiment_fg = fs.get_feature_group('news_sentiment_updated', version = 2)\n",
|
245 |
-
"\n",
|
246 |
-
"# ds_query = tesla_fg.select(['date','open', 'ticker'])\\\n",
|
247 |
-
"# .join(news_sentiment_fg.select_except(['ticker','time', 'amp_url', 'image_url']))\n",
|
248 |
-
" \n",
|
249 |
-
"# return (fs.create_tesla_feature_view(\n",
|
250 |
-
"# name = 'tsla_stocks_fv',\n",
|
251 |
-
"# query = ds_query,\n",
|
252 |
-
"# labels=['ticker']\n",
|
253 |
-
"# ), tesla_fg)"
|
254 |
-
]
|
255 |
-
},
|
256 |
-
{
|
257 |
-
"cell_type": "code",
|
258 |
-
"execution_count": 8,
|
259 |
-
"metadata": {},
|
260 |
-
"outputs": [],
|
261 |
-
"source": [
|
262 |
-
"#try:\n",
|
263 |
-
"# feature_view = fs.get_feature_view(\"tsla_stocks_fv\", version=1)\n",
|
264 |
-
"# tesla_fg = fs.get_feature_group('tesla_stock', version=3)\n",
|
265 |
-
"#except:\n",
|
266 |
-
"# feature_view, tesla_fg = create_stocks_feature_view(fs, 1)"
|
267 |
-
]
|
268 |
}
|
269 |
],
|
270 |
"metadata": {
|
@@ -283,7 +195,7 @@
|
|
283 |
"name": "python",
|
284 |
"nbconvert_exporter": "python",
|
285 |
"pygments_lexer": "ipython3",
|
286 |
-
"version": "3.11.
|
287 |
}
|
288 |
},
|
289 |
"nbformat": 4,
|
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
+
"execution_count": 3,
|
6 |
"metadata": {},
|
7 |
"outputs": [
|
8 |
{
|
9 |
"name": "stdout",
|
10 |
"output_type": "stream",
|
11 |
"text": [
|
12 |
+
"Connection closed.\n",
|
13 |
+
"Connected. Call `.close()` to terminate connection gracefully.\n",
|
14 |
+
"\n",
|
15 |
+
"Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/693399\n",
|
16 |
+
"Connected. Call `.close()` to terminate connection gracefully.\n",
|
17 |
" date 1. open 2. high 3. low 4. close 5. volume ticker\n",
|
18 |
"0 2024-05-03 182.10 184.78 178.4200 181.19 75491539.0 TSLA\n",
|
19 |
"1 2024-05-02 182.86 184.60 176.0200 180.01 89148041.0 TSLA\n",
|
20 |
"2 2024-05-01 182.00 185.86 179.0100 179.99 92829719.0 TSLA\n",
|
21 |
"3 2024-04-30 186.98 190.95 182.8401 183.28 127031787.0 TSLA\n",
|
22 |
"4 2024-04-29 188.42 198.87 184.5400 194.05 243869678.0 TSLA\n",
|
23 |
+
"Index(['date', 'open', 'high', 'low', 'close', 'volume', 'ticker'], dtype='object')\n"
|
24 |
]
|
25 |
},
|
26 |
{
|
27 |
"data": {
|
28 |
"application/vnd.jupyter.widget-view+json": {
|
29 |
+
"model_id": "db4ef90d03b0464f957c18365d8d636f",
|
30 |
"version_major": 2,
|
31 |
"version_minor": 0
|
32 |
},
|
|
|
41 |
"name": "stdout",
|
42 |
"output_type": "stream",
|
43 |
"text": [
|
44 |
+
"Launching job: tesla_stock_1_offline_fg_materialization\n",
|
45 |
"Job started successfully, you can follow the progress at \n",
|
46 |
+
"https://c.app.hopsworks.ai/p/693399/jobs/named/tesla_stock_1_offline_fg_materialization/executions\n"
|
47 |
]
|
48 |
},
|
49 |
{
|
50 |
"data": {
|
51 |
"application/vnd.jupyter.widget-view+json": {
|
52 |
+
"model_id": "9043e7043c1843288091f7c3a6bbd83e",
|
53 |
"version_major": 2,
|
54 |
"version_minor": 0
|
55 |
},
|
|
|
64 |
"name": "stdout",
|
65 |
"output_type": "stream",
|
66 |
"text": [
|
67 |
+
"Launching job: news_sentiment_updated_1_offline_fg_materialization\n",
|
68 |
"Job started successfully, you can follow the progress at \n",
|
69 |
+
"https://c.app.hopsworks.ai/p/693399/jobs/named/news_sentiment_updated_1_offline_fg_materialization/executions\n",
|
70 |
"Connection closed.\n",
|
71 |
+
"Connected. Call `.close()` to terminate connection gracefully.\n",
|
72 |
"\n",
|
73 |
"Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/693399\n",
|
74 |
"Connected. Call `.close()` to terminate connection gracefully.\n"
|
|
|
76 |
}
|
77 |
],
|
78 |
"source": [
|
79 |
+
"# Importing necessary libraries\n",
|
80 |
+
"import pandas as pd # For data manipulation using DataFrames\n",
|
81 |
+
"import numpy as np # For numerical operations\n",
|
82 |
+
"import matplotlib.pyplot as plt # For data visualization\n",
|
83 |
+
"import os # For operating system-related tasks\n",
|
84 |
+
"import joblib # For saving and loading models\n",
|
85 |
+
"import hopsworks # For getting access to hopsworks\n",
|
86 |
+
"\n",
|
87 |
+
"from SML import feature_pipeline #Loading in the tesla_fg\n",
|
88 |
+
"\n",
|
89 |
+
"#Making the notebook able to fetch from the .env file\n",
|
90 |
+
"from dotenv import load_dotenv\n",
|
91 |
+
"import os\n",
|
92 |
+
"\n",
|
93 |
+
"load_dotenv()\n",
|
94 |
+
"\n",
|
95 |
+
"#Getting connected to hopsworks\n",
|
96 |
"api_key = os.environ.get('hopsworks_api')\n",
|
97 |
"project = hopsworks.login(api_key_value=api_key)\n",
|
98 |
"fs = project.get_feature_store()"
|
|
|
104 |
"metadata": {},
|
105 |
"outputs": [],
|
106 |
"source": [
|
107 |
+
"#Defining the function to create feature view\n",
|
108 |
+
"\n",
|
109 |
"def create_stocks_feature_view(fs, version):\n",
|
110 |
"\n",
|
111 |
" # Loading in the feature groups\n",
|
112 |
" tesla_fg = fs.get_feature_group('tesla_stock', version=1)\n",
|
113 |
" news_sentiment_fg = fs.get_feature_group('news_sentiment_updated', version=1)\n",
|
114 |
"\n",
|
115 |
+
" # Defining the query\n",
|
116 |
" ds_query = tesla_fg.select(['date', 'open', 'ticker'])\\\n",
|
117 |
" .join(news_sentiment_fg.select(['sentiment']))\n",
|
118 |
"\n",
|
119 |
+
" # Creating the feature view\n",
|
120 |
" feature_view = fs.create_feature_view(\n",
|
121 |
" name='tesla_stocks_fv',\n",
|
122 |
" query=ds_query,\n",
|
|
|
141 |
}
|
142 |
],
|
143 |
"source": [
|
144 |
+
"#Creating the feature view\n",
|
145 |
"try:\n",
|
146 |
" feature_view = fs.get_feature_view(\"tesla_stocks_fv\", version=1)\n",
|
147 |
" tesla_fg = fs.get_feature_group('tesla_stock', version=1)\n",
|
|
|
155 |
"metadata": {},
|
156 |
"outputs": [],
|
157 |
"source": [
|
158 |
+
"#Defining a function to get fixed data from the feature view\n",
|
159 |
"def fix_data_from_feature_view(df,start_date,end_date):\n",
|
160 |
" df = df.sort_values(\"date\")\n",
|
161 |
" df = df.reset_index()\n",
|
|
|
177 |
" \n",
|
178 |
" return filtered_df"
|
179 |
]
|
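The hunk above elides the middle rows of fix_data_from_feature_view, so only its head and tail are visible. A minimal sketch of a date-window filter consistent with those visible lines; the helper name and the mask logic are assumptions, not the committed code:

# Sketch under assumptions: 'date' is already datetime and the window is inclusive
def filter_by_date_window(df, start_date, end_date):
    df = df.sort_values("date").reset_index(drop=True)
    mask = (df["date"] >= pd.to_datetime(start_date)) & (df["date"] <= pd.to_datetime(end_date))
    filtered_df = df.loc[mask].reset_index(drop=True)
    return filtered_df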
180 |
}
|
181 |
],
|
182 |
"metadata": {
|
|
|
195 |
"name": "python",
|
196 |
"nbconvert_exporter": "python",
|
197 |
"pygments_lexer": "ipython3",
|
198 |
+
"version": "3.11.9"
|
199 |
}
|
200 |
},
|
201 |
"nbformat": 4,
|
Stocks news prediction/Notebooks/7_training_pipeline.ipynb
ADDED
@@ -0,0 +1,839 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 2,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [
|
8 |
+
{
|
9 |
+
"name": "stdout",
|
10 |
+
"output_type": "stream",
|
11 |
+
"text": [
|
12 |
+
"Connected. Call `.close()` to terminate connection gracefully.\n",
|
13 |
+
"\n",
|
14 |
+
"Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/693399\n",
|
15 |
+
"Connected. Call `.close()` to terminate connection gracefully.\n",
|
16 |
+
"Connected. Call `.close()` to terminate connection gracefully.\n"
|
17 |
+
]
|
18 |
+
}
|
19 |
+
],
|
20 |
+
"source": [
|
21 |
+
"#Importing necessary libraries\n",
|
22 |
+
"import hopsworks\n",
|
23 |
+
"import hsfs\n",
|
24 |
+
"from dotenv import load_dotenv\n",
|
25 |
+
"import os\n",
|
26 |
+
"import pandas as pd\n",
|
27 |
+
"import numpy as np\n",
|
28 |
+
"from sklearn.preprocessing import OneHotEncoder\n",
|
29 |
+
"from sklearn.preprocessing import MinMaxScaler\n",
|
30 |
+
"from sklearn.metrics import mean_squared_error\n",
|
31 |
+
"from hsml.schema import Schema\n",
|
32 |
+
"from hsml.model_schema import ModelSchema\n",
|
33 |
+
"from tensorflow.keras.models import Sequential\n",
|
34 |
+
"from tensorflow.keras.layers import Input, LSTM, Dense, Dropout\n",
|
35 |
+
"from sklearn.preprocessing import StandardScaler # Import StandardScaler from scikit-learn\n",
|
36 |
+
"import joblib\n",
|
37 |
+
"\n",
|
38 |
+
"load_dotenv()\n",
|
39 |
+
"\n",
|
40 |
+
"#Connecting to hopsworks\n",
|
41 |
+
"api_key = os.environ.get('hopsworks_api')\n",
|
42 |
+
"project = hopsworks.login(api_key_value=api_key)\n",
|
43 |
+
"fs = project.get_feature_store()\n",
|
44 |
+
"\n",
|
45 |
+
"#Another connection to hopsworks\n",
|
46 |
+
"api_key = os.getenv('hopsworks_api')\n",
|
47 |
+
"connection = hsfs.connection()\n",
|
48 |
+
"fs = connection.get_feature_store()"
|
49 |
+
]
|
50 |
+
},
|
51 |
+
{
|
52 |
+
"cell_type": "code",
|
53 |
+
"execution_count": 3,
|
54 |
+
"metadata": {},
|
55 |
+
"outputs": [],
|
56 |
+
"source": [
|
57 |
+
"#Getting the feature view\n",
|
58 |
+
"feature_view = fs.get_feature_view(\n",
|
59 |
+
" name='tesla_stocks_fv',\n",
|
60 |
+
" version=1\n",
|
61 |
+
")"
|
62 |
+
]
|
63 |
+
},
|
64 |
+
{
|
65 |
+
"cell_type": "code",
|
66 |
+
"execution_count": 4,
|
67 |
+
"metadata": {},
|
68 |
+
"outputs": [],
|
69 |
+
"source": [
|
70 |
+
"#Setting up train & test split dates\n",
|
71 |
+
"train_start = \"2022-06-22\"\n",
|
72 |
+
"train_end = \"2023-12-31\"\n",
|
73 |
+
"\n",
|
74 |
+
"test_start = '2024-01-01'\n",
|
75 |
+
"test_end = \"2024-05-03\""
|
76 |
+
]
|
77 |
+
},
|
78 |
+
{
|
79 |
+
"cell_type": "code",
|
80 |
+
"execution_count": 5,
|
81 |
+
"metadata": {},
|
82 |
+
"outputs": [
|
83 |
+
{
|
84 |
+
"name": "stdout",
|
85 |
+
"output_type": "stream",
|
86 |
+
"text": [
|
87 |
+
"Training dataset job started successfully, you can follow the progress at \n",
|
88 |
+
"https://c.app.hopsworks.ai/p/693399/jobs/named/tesla_stocks_fv_1_create_fv_td_07052024082715/executions\n",
|
89 |
+
"2024-05-07 10:28:31,852 WARNING: VersionWarning: Incremented version to `6`.\n",
|
90 |
+
"\n"
|
91 |
+
]
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"data": {
|
95 |
+
"text/plain": [
|
96 |
+
"(6, <hsfs.core.job.Job at 0x1c3ac2719d0>)"
|
97 |
+
]
|
98 |
+
},
|
99 |
+
"execution_count": 5,
|
100 |
+
"metadata": {},
|
101 |
+
"output_type": "execute_result"
|
102 |
+
}
|
103 |
+
],
|
104 |
+
"source": [
|
105 |
+
"#Creating the train/test split on the feature view with the split dates\n",
|
106 |
+
"feature_view.create_train_test_split(\n",
|
107 |
+
" train_start=train_start,\n",
|
108 |
+
" train_end=train_end,\n",
|
109 |
+
" test_start=test_start,\n",
|
110 |
+
" test_end=test_end,\n",
|
111 |
+
" data_format='csv',\n",
|
112 |
+
" coalesce= True,\n",
|
113 |
+
" statistics_config={'histogram':True,'correlations':True})"
|
114 |
+
]
|
115 |
+
},
|
116 |
+
{
|
117 |
+
"cell_type": "code",
|
118 |
+
"execution_count": 6,
|
119 |
+
"metadata": {},
|
120 |
+
"outputs": [],
|
121 |
+
"source": [
|
122 |
+
"#Collecting the split from feature view\n",
|
123 |
+
"X_train, X_test, y_train, y_test = feature_view.get_train_test_split(6)"
|
124 |
+
]
|
125 |
+
},
|
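The literal 6 passed to get_train_test_split corresponds to the "Incremented version to `6`" warning above. A small sketch that reuses the version returned by create_train_test_split instead of hard-coding it (variable names are illustrative):

# create_train_test_split returns (version, job), as the (6, <Job>) output above shows
td_version, td_job = feature_view.create_train_test_split(
    train_start=train_start, train_end=train_end,
    test_start=test_start, test_end=test_end,
    data_format='csv', coalesce=True,
)
X_train, X_test, y_train, y_test = feature_view.get_train_test_split(td_version)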
126 |
+
{
|
127 |
+
"cell_type": "code",
|
128 |
+
"execution_count": 7,
|
129 |
+
"metadata": {},
|
130 |
+
"outputs": [
|
131 |
+
{
|
132 |
+
"data": {
|
133 |
+
"text/html": [
|
134 |
+
"<div>\n",
|
135 |
+
"<style scoped>\n",
|
136 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
137 |
+
" vertical-align: middle;\n",
|
138 |
+
" }\n",
|
139 |
+
"\n",
|
140 |
+
" .dataframe tbody tr th {\n",
|
141 |
+
" vertical-align: top;\n",
|
142 |
+
" }\n",
|
143 |
+
"\n",
|
144 |
+
" .dataframe thead th {\n",
|
145 |
+
" text-align: right;\n",
|
146 |
+
" }\n",
|
147 |
+
"</style>\n",
|
148 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
149 |
+
" <thead>\n",
|
150 |
+
" <tr style=\"text-align: right;\">\n",
|
151 |
+
" <th></th>\n",
|
152 |
+
" <th>date</th>\n",
|
153 |
+
" <th>ticker</th>\n",
|
154 |
+
" <th>sentiment</th>\n",
|
155 |
+
" </tr>\n",
|
156 |
+
" </thead>\n",
|
157 |
+
" <tbody>\n",
|
158 |
+
" <tr>\n",
|
159 |
+
" <th>0</th>\n",
|
160 |
+
" <td>2022-12-14T00:00:00.000Z</td>\n",
|
161 |
+
" <td>TSLA</td>\n",
|
162 |
+
" <td>0.102207</td>\n",
|
163 |
+
" </tr>\n",
|
164 |
+
" <tr>\n",
|
165 |
+
" <th>1</th>\n",
|
166 |
+
" <td>2023-02-21T00:00:00.000Z</td>\n",
|
167 |
+
" <td>TSLA</td>\n",
|
168 |
+
" <td>0.155833</td>\n",
|
169 |
+
" </tr>\n",
|
170 |
+
" <tr>\n",
|
171 |
+
" <th>2</th>\n",
|
172 |
+
" <td>2023-08-17T00:00:00.000Z</td>\n",
|
173 |
+
" <td>TSLA</td>\n",
|
174 |
+
" <td>0.024046</td>\n",
|
175 |
+
" </tr>\n",
|
176 |
+
" <tr>\n",
|
177 |
+
" <th>3</th>\n",
|
178 |
+
" <td>2022-09-16T00:00:00.000Z</td>\n",
|
179 |
+
" <td>TSLA</td>\n",
|
180 |
+
" <td>0.087306</td>\n",
|
181 |
+
" </tr>\n",
|
182 |
+
" <tr>\n",
|
183 |
+
" <th>4</th>\n",
|
184 |
+
" <td>2023-08-28T00:00:00.000Z</td>\n",
|
185 |
+
" <td>TSLA</td>\n",
|
186 |
+
" <td>0.024046</td>\n",
|
187 |
+
" </tr>\n",
|
188 |
+
" <tr>\n",
|
189 |
+
" <th>...</th>\n",
|
190 |
+
" <td>...</td>\n",
|
191 |
+
" <td>...</td>\n",
|
192 |
+
" <td>...</td>\n",
|
193 |
+
" </tr>\n",
|
194 |
+
" <tr>\n",
|
195 |
+
" <th>378</th>\n",
|
196 |
+
" <td>2023-02-10T00:00:00.000Z</td>\n",
|
197 |
+
" <td>TSLA</td>\n",
|
198 |
+
" <td>0.155833</td>\n",
|
199 |
+
" </tr>\n",
|
200 |
+
" <tr>\n",
|
201 |
+
" <th>379</th>\n",
|
202 |
+
" <td>2023-05-08T00:00:00.000Z</td>\n",
|
203 |
+
" <td>TSLA</td>\n",
|
204 |
+
" <td>0.141296</td>\n",
|
205 |
+
" </tr>\n",
|
206 |
+
" <tr>\n",
|
207 |
+
" <th>380</th>\n",
|
208 |
+
" <td>2022-09-08T00:00:00.000Z</td>\n",
|
209 |
+
" <td>TSLA</td>\n",
|
210 |
+
" <td>0.087306</td>\n",
|
211 |
+
" </tr>\n",
|
212 |
+
" <tr>\n",
|
213 |
+
" <th>381</th>\n",
|
214 |
+
" <td>2023-07-06T00:00:00.000Z</td>\n",
|
215 |
+
" <td>TSLA</td>\n",
|
216 |
+
" <td>0.119444</td>\n",
|
217 |
+
" </tr>\n",
|
218 |
+
" <tr>\n",
|
219 |
+
" <th>382</th>\n",
|
220 |
+
" <td>2023-10-27T00:00:00.000Z</td>\n",
|
221 |
+
" <td>TSLA</td>\n",
|
222 |
+
" <td>0.164868</td>\n",
|
223 |
+
" </tr>\n",
|
224 |
+
" </tbody>\n",
|
225 |
+
"</table>\n",
|
226 |
+
"<p>383 rows Γ 3 columns</p>\n",
|
227 |
+
"</div>"
|
228 |
+
],
|
229 |
+
"text/plain": [
|
230 |
+
" date ticker sentiment\n",
|
231 |
+
"0 2022-12-14T00:00:00.000Z TSLA 0.102207\n",
|
232 |
+
"1 2023-02-21T00:00:00.000Z TSLA 0.155833\n",
|
233 |
+
"2 2023-08-17T00:00:00.000Z TSLA 0.024046\n",
|
234 |
+
"3 2022-09-16T00:00:00.000Z TSLA 0.087306\n",
|
235 |
+
"4 2023-08-28T00:00:00.000Z TSLA 0.024046\n",
|
236 |
+
".. ... ... ...\n",
|
237 |
+
"378 2023-02-10T00:00:00.000Z TSLA 0.155833\n",
|
238 |
+
"379 2023-05-08T00:00:00.000Z TSLA 0.141296\n",
|
239 |
+
"380 2022-09-08T00:00:00.000Z TSLA 0.087306\n",
|
240 |
+
"381 2023-07-06T00:00:00.000Z TSLA 0.119444\n",
|
241 |
+
"382 2023-10-27T00:00:00.000Z TSLA 0.164868\n",
|
242 |
+
"\n",
|
243 |
+
"[383 rows x 3 columns]"
|
244 |
+
]
|
245 |
+
},
|
246 |
+
"execution_count": 7,
|
247 |
+
"metadata": {},
|
248 |
+
"output_type": "execute_result"
|
249 |
+
}
|
250 |
+
],
|
251 |
+
"source": [
|
252 |
+
"#Inspecting X_train\n",
|
253 |
+
"X_train"
|
254 |
+
]
|
255 |
+
},
|
256 |
+
{
|
257 |
+
"cell_type": "code",
|
258 |
+
"execution_count": 8,
|
259 |
+
"metadata": {},
|
260 |
+
"outputs": [],
|
261 |
+
"source": [
|
262 |
+
"#Converting date into datetime\n",
|
263 |
+
"X_train['date'] = pd.to_datetime(X_train['date']).dt.date\n",
|
264 |
+
"X_test['date'] = pd.to_datetime(X_test['date']).dt.date\n",
|
265 |
+
"X_train['date'] = pd.to_datetime(X_train['date'])\n",
|
266 |
+
"X_test['date'] = pd.to_datetime(X_test['date'])"
|
267 |
+
]
|
268 |
+
},
|
269 |
+
{
|
270 |
+
"cell_type": "code",
|
271 |
+
"execution_count": 9,
|
272 |
+
"metadata": {},
|
273 |
+
"outputs": [
|
274 |
+
{
|
275 |
+
"data": {
|
276 |
+
"text/html": [
|
277 |
+
"<div>\n",
|
278 |
+
"<style scoped>\n",
|
279 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
280 |
+
" vertical-align: middle;\n",
|
281 |
+
" }\n",
|
282 |
+
"\n",
|
283 |
+
" .dataframe tbody tr th {\n",
|
284 |
+
" vertical-align: top;\n",
|
285 |
+
" }\n",
|
286 |
+
"\n",
|
287 |
+
" .dataframe thead th {\n",
|
288 |
+
" text-align: right;\n",
|
289 |
+
" }\n",
|
290 |
+
"</style>\n",
|
291 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
292 |
+
" <thead>\n",
|
293 |
+
" <tr style=\"text-align: right;\">\n",
|
294 |
+
" <th></th>\n",
|
295 |
+
" <th>date</th>\n",
|
296 |
+
" <th>ticker</th>\n",
|
297 |
+
" <th>sentiment</th>\n",
|
298 |
+
" </tr>\n",
|
299 |
+
" </thead>\n",
|
300 |
+
" <tbody>\n",
|
301 |
+
" <tr>\n",
|
302 |
+
" <th>0</th>\n",
|
303 |
+
" <td>2022-12-14</td>\n",
|
304 |
+
" <td>TSLA</td>\n",
|
305 |
+
" <td>0.102207</td>\n",
|
306 |
+
" </tr>\n",
|
307 |
+
" <tr>\n",
|
308 |
+
" <th>1</th>\n",
|
309 |
+
" <td>2023-02-21</td>\n",
|
310 |
+
" <td>TSLA</td>\n",
|
311 |
+
" <td>0.155833</td>\n",
|
312 |
+
" </tr>\n",
|
313 |
+
" <tr>\n",
|
314 |
+
" <th>2</th>\n",
|
315 |
+
" <td>2023-08-17</td>\n",
|
316 |
+
" <td>TSLA</td>\n",
|
317 |
+
" <td>0.024046</td>\n",
|
318 |
+
" </tr>\n",
|
319 |
+
" <tr>\n",
|
320 |
+
" <th>3</th>\n",
|
321 |
+
" <td>2022-09-16</td>\n",
|
322 |
+
" <td>TSLA</td>\n",
|
323 |
+
" <td>0.087306</td>\n",
|
324 |
+
" </tr>\n",
|
325 |
+
" <tr>\n",
|
326 |
+
" <th>4</th>\n",
|
327 |
+
" <td>2023-08-28</td>\n",
|
328 |
+
" <td>TSLA</td>\n",
|
329 |
+
" <td>0.024046</td>\n",
|
330 |
+
" </tr>\n",
|
331 |
+
" </tbody>\n",
|
332 |
+
"</table>\n",
|
333 |
+
"</div>"
|
334 |
+
],
|
335 |
+
"text/plain": [
|
336 |
+
" date ticker sentiment\n",
|
337 |
+
"0 2022-12-14 TSLA 0.102207\n",
|
338 |
+
"1 2023-02-21 TSLA 0.155833\n",
|
339 |
+
"2 2023-08-17 TSLA 0.024046\n",
|
340 |
+
"3 2022-09-16 TSLA 0.087306\n",
|
341 |
+
"4 2023-08-28 TSLA 0.024046"
|
342 |
+
]
|
343 |
+
},
|
344 |
+
"execution_count": 9,
|
345 |
+
"metadata": {},
|
346 |
+
"output_type": "execute_result"
|
347 |
+
}
|
348 |
+
],
|
349 |
+
"source": [
|
350 |
+
"X_train.head()"
|
351 |
+
]
|
352 |
+
},
|
353 |
+
{
|
354 |
+
"cell_type": "code",
|
355 |
+
"execution_count": 10,
|
356 |
+
"metadata": {},
|
357 |
+
"outputs": [],
|
358 |
+
"source": [
|
359 |
+
"# Extracting the 'ticker' column\n",
|
360 |
+
"tickers = X_train[['ticker']]\n",
|
361 |
+
"\n",
|
362 |
+
"# Initializing OneHotEncoder\n",
|
363 |
+
"encoder = OneHotEncoder()\n",
|
364 |
+
"\n",
|
365 |
+
"# Fitting and transforming the 'ticker' column\n",
|
366 |
+
"ticker_encoded = encoder.fit_transform(tickers)\n",
|
367 |
+
"\n",
|
368 |
+
"# Converting the encoded column into a DataFrame\n",
|
369 |
+
"ticker_encoded_df = pd.DataFrame(ticker_encoded.toarray(), columns=encoder.get_feature_names_out(['ticker']))\n",
|
370 |
+
"\n",
|
371 |
+
"# Concatenating the encoded DataFrame with the original DataFrame\n",
|
372 |
+
"X_train = pd.concat([X_train, ticker_encoded_df], axis=1)\n",
|
373 |
+
"\n",
|
374 |
+
"# Dropping the original 'ticker' column\n",
|
375 |
+
"X_train.drop('ticker', axis=1, inplace=True)"
|
376 |
+
]
|
377 |
+
},
|
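With a TSLA-only dataset the encoder yields a single constant ticker_TSLA column, and fitting a second encoder on X_test below only works because both splits contain the same one category. A sketch of the more robust pattern, fitting once and reusing the encoder, shown as it would look before the drop above (handle_unknown is an assumption, not in the original):

# Fit on the training tickers only; reuse the fitted encoder for the test split
encoder = OneHotEncoder(handle_unknown='ignore')
train_ohe = encoder.fit_transform(X_train[['ticker']]).toarray()
test_ohe = encoder.transform(X_test[['ticker']]).toarray()
ohe_cols = encoder.get_feature_names_out(['ticker'])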
378 |
+
{
|
379 |
+
"cell_type": "code",
|
380 |
+
"execution_count": 11,
|
381 |
+
"metadata": {},
|
382 |
+
"outputs": [
|
383 |
+
{
|
384 |
+
"data": {
|
385 |
+
"text/html": [
|
386 |
+
"<div>\n",
|
387 |
+
"<style scoped>\n",
|
388 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
389 |
+
" vertical-align: middle;\n",
|
390 |
+
" }\n",
|
391 |
+
"\n",
|
392 |
+
" .dataframe tbody tr th {\n",
|
393 |
+
" vertical-align: top;\n",
|
394 |
+
" }\n",
|
395 |
+
"\n",
|
396 |
+
" .dataframe thead th {\n",
|
397 |
+
" text-align: right;\n",
|
398 |
+
" }\n",
|
399 |
+
"</style>\n",
|
400 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
401 |
+
" <thead>\n",
|
402 |
+
" <tr style=\"text-align: right;\">\n",
|
403 |
+
" <th></th>\n",
|
404 |
+
" <th>date</th>\n",
|
405 |
+
" <th>sentiment</th>\n",
|
406 |
+
" <th>ticker_TSLA</th>\n",
|
407 |
+
" </tr>\n",
|
408 |
+
" </thead>\n",
|
409 |
+
" <tbody>\n",
|
410 |
+
" <tr>\n",
|
411 |
+
" <th>0</th>\n",
|
412 |
+
" <td>2022-12-14</td>\n",
|
413 |
+
" <td>0.102207</td>\n",
|
414 |
+
" <td>1.0</td>\n",
|
415 |
+
" </tr>\n",
|
416 |
+
" <tr>\n",
|
417 |
+
" <th>1</th>\n",
|
418 |
+
" <td>2023-02-21</td>\n",
|
419 |
+
" <td>0.155833</td>\n",
|
420 |
+
" <td>1.0</td>\n",
|
421 |
+
" </tr>\n",
|
422 |
+
" <tr>\n",
|
423 |
+
" <th>2</th>\n",
|
424 |
+
" <td>2023-08-17</td>\n",
|
425 |
+
" <td>0.024046</td>\n",
|
426 |
+
" <td>1.0</td>\n",
|
427 |
+
" </tr>\n",
|
428 |
+
" <tr>\n",
|
429 |
+
" <th>3</th>\n",
|
430 |
+
" <td>2022-09-16</td>\n",
|
431 |
+
" <td>0.087306</td>\n",
|
432 |
+
" <td>1.0</td>\n",
|
433 |
+
" </tr>\n",
|
434 |
+
" <tr>\n",
|
435 |
+
" <th>4</th>\n",
|
436 |
+
" <td>2023-08-28</td>\n",
|
437 |
+
" <td>0.024046</td>\n",
|
438 |
+
" <td>1.0</td>\n",
|
439 |
+
" </tr>\n",
|
440 |
+
" </tbody>\n",
|
441 |
+
"</table>\n",
|
442 |
+
"</div>"
|
443 |
+
],
|
444 |
+
"text/plain": [
|
445 |
+
" date sentiment ticker_TSLA\n",
|
446 |
+
"0 2022-12-14 0.102207 1.0\n",
|
447 |
+
"1 2023-02-21 0.155833 1.0\n",
|
448 |
+
"2 2023-08-17 0.024046 1.0\n",
|
449 |
+
"3 2022-09-16 0.087306 1.0\n",
|
450 |
+
"4 2023-08-28 0.024046 1.0"
|
451 |
+
]
|
452 |
+
},
|
453 |
+
"execution_count": 11,
|
454 |
+
"metadata": {},
|
455 |
+
"output_type": "execute_result"
|
456 |
+
}
|
457 |
+
],
|
458 |
+
"source": [
|
459 |
+
"#Inspecting X train after onehotencoding 'Ticker'\n",
|
460 |
+
"X_train.head()"
|
461 |
+
]
|
462 |
+
},
|
463 |
+
{
|
464 |
+
"cell_type": "code",
|
465 |
+
"execution_count": 12,
|
466 |
+
"metadata": {},
|
467 |
+
"outputs": [],
|
468 |
+
"source": [
|
469 |
+
"#Doing the same for X test as done to X train\n",
|
470 |
+
"\n",
|
471 |
+
"tickers = X_test[['ticker']]\n",
|
472 |
+
"\n",
|
473 |
+
"# Initializing OneHotEncoder\n",
|
474 |
+
"encoder = OneHotEncoder()\n",
|
475 |
+
"\n",
|
476 |
+
"# Fitting and transforming the 'ticker' column\n",
|
477 |
+
"ticker_encoded_test = encoder.fit_transform(tickers)\n",
|
478 |
+
"\n",
|
479 |
+
"# Converting the encoded column into a DataFrame\n",
|
480 |
+
"ticker_encoded_df_test = pd.DataFrame(ticker_encoded_test.toarray(), columns=encoder.get_feature_names_out(['ticker']))\n",
|
481 |
+
"\n",
|
482 |
+
"# Concatenating the encoded DataFrame with the original DataFrame\n",
|
483 |
+
"X_test = pd.concat([X_test, ticker_encoded_df_test], axis=1)\n",
|
484 |
+
"\n",
|
485 |
+
"# Dropping the original 'ticker' column\n",
|
486 |
+
"X_test.drop('ticker', axis=1, inplace=True)"
|
487 |
+
]
|
488 |
+
},
|
489 |
+
{
|
490 |
+
"cell_type": "code",
|
491 |
+
"execution_count": 13,
|
492 |
+
"metadata": {},
|
493 |
+
"outputs": [],
|
494 |
+
"source": [
|
495 |
+
"#Loading in MinMaxScaler to be used on the target variable 'open'\n",
|
496 |
+
"scaler = MinMaxScaler()\n",
|
497 |
+
"\n",
|
498 |
+
"# Fitting and transforming the 'open' column\n",
|
499 |
+
"y_train['open_scaled'] = scaler.fit_transform(y_train[['open']])\n",
|
500 |
+
"y_train.drop('open', axis=1, inplace=True)"
|
501 |
+
]
|
502 |
+
},
|
503 |
+
{
|
504 |
+
"cell_type": "code",
|
505 |
+
"execution_count": 14,
|
506 |
+
"metadata": {},
|
507 |
+
"outputs": [],
|
508 |
+
"source": [
|
509 |
+
"#Doing the same to y_test as done to y_train \n",
|
510 |
+
"y_test['open_scaled'] = scaler.fit_transform(y_test[['open']])\n",
|
511 |
+
"y_test.drop('open', axis=1, inplace=True)"
|
512 |
+
]
|
513 |
+
},
|
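Note that the cell above re-fits the scaler on y_test, so the train and test targets end up on different scales and the RMSE computed later mixes the two. A sketch of the conventional approach, fitting on the training target only and reusing that fit:

# Fit the scaler on the training target, then apply the same transform to the test target
scaler = MinMaxScaler()
y_train['open_scaled'] = scaler.fit_transform(y_train[['open']])
y_test['open_scaled'] = scaler.transform(y_test[['open']])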
514 |
+
{
|
515 |
+
"cell_type": "code",
|
516 |
+
"execution_count": 15,
|
517 |
+
"metadata": {},
|
518 |
+
"outputs": [],
|
519 |
+
"source": [
|
520 |
+
"#Defining the function for the LSTM model\n",
|
521 |
+
"def create_model(input_shape,\n",
|
522 |
+
" LSTM_filters=64,\n",
|
523 |
+
" dropout=0.1,\n",
|
524 |
+
" recurrent_dropout=0.1,\n",
|
525 |
+
" dense_dropout=0.5,\n",
|
526 |
+
" activation='relu',\n",
|
527 |
+
" depth=1):\n",
|
528 |
+
"\n",
|
529 |
+
" model = Sequential()\n",
|
530 |
+
"\n",
|
531 |
+
" # Input layer\n",
|
532 |
+
" model.add(Input(shape=input_shape))\n",
|
533 |
+
"\n",
|
534 |
+
" if depth > 1:\n",
|
535 |
+
" for i in range(1, depth):\n",
|
536 |
+
" # Recurrent layer\n",
|
537 |
+
" model.add(LSTM(LSTM_filters, return_sequences=True, dropout=dropout, recurrent_dropout=recurrent_dropout))\n",
|
538 |
+
"\n",
|
539 |
+
" # Recurrent layer\n",
|
540 |
+
" model.add(LSTM(LSTM_filters, return_sequences=False, dropout=dropout, recurrent_dropout=recurrent_dropout))\n",
|
541 |
+
"\n",
|
542 |
+
" # Fully connected layer\n",
|
543 |
+
" if activation == 'relu':\n",
|
544 |
+
" model.add(Dense(LSTM_filters, activation='relu'))\n",
|
545 |
+
" elif activation == 'leaky_relu':\n",
|
546 |
+
" model.add(Dense(LSTM_filters))\n",
|
547 |
+
" model.add(tf.keras.layers.LeakyReLU(alpha=0.1))\n",
|
548 |
+
"\n",
|
549 |
+
" # Dropout for regularization\n",
|
550 |
+
" model.add(Dropout(dense_dropout))\n",
|
551 |
+
"\n",
|
552 |
+
" # Output layer for predicting one day forward\n",
|
553 |
+
" model.add(Dense(1, activation='linear'))\n",
|
554 |
+
"\n",
|
555 |
+
" # Compile the model\n",
|
556 |
+
" model.compile(optimizer='adam', loss='mse')\n",
|
557 |
+
"\n",
|
558 |
+
" return model"
|
559 |
+
]
|
560 |
+
},
|
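One caveat in create_model: the leaky_relu branch calls tf.keras.layers.LeakyReLU, but no tf alias is imported in this notebook, so activation='leaky_relu' would raise a NameError. A minimal sketch of the fixed branch using a direct layer import, keeping the original alpha argument:

from tensorflow.keras.layers import Input, Dense, LeakyReLU
from tensorflow.keras.models import Sequential

# Dense followed by a standalone LeakyReLU layer, as the leaky_relu branch intends
m = Sequential([Input(shape=(5,)), Dense(64), LeakyReLU(alpha=0.1)])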
561 |
+
{
|
562 |
+
"cell_type": "code",
|
563 |
+
"execution_count": 16,
|
564 |
+
"metadata": {},
|
565 |
+
"outputs": [
|
566 |
+
{
|
567 |
+
"name": "stdout",
|
568 |
+
"output_type": "stream",
|
569 |
+
"text": [
|
570 |
+
"2024-05-07 10:28:33,332 WARNING: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n",
|
571 |
+
"See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n",
|
572 |
+
"\n"
|
573 |
+
]
|
574 |
+
}
|
575 |
+
],
|
576 |
+
"source": [
|
577 |
+
"# As X_train['date'] column exists and is in datetime format, we're converting it\n",
|
578 |
+
"X_train['year'] = X_train['date'].dt.year\n",
|
579 |
+
"X_train['month'] = X_train['date'].dt.month\n",
|
580 |
+
"X_train['day'] = X_train['date'].dt.day\n",
|
581 |
+
"\n",
|
582 |
+
"# Dropping the original date column\n",
|
583 |
+
"X_train.drop(columns=['date'], inplace=True)\n",
|
584 |
+
"\n",
|
585 |
+
"# Converting dataframe to numpy array\n",
|
586 |
+
"X_train_array = X_train.to_numpy()\n",
|
587 |
+
"\n",
|
588 |
+
"# Reshaping the array to have a shape suitable for LSTM\n",
|
589 |
+
"X_train_array = np.expand_dims(X_train_array, axis=1)"
|
590 |
+
]
|
591 |
+
},
|
592 |
+
{
|
593 |
+
"cell_type": "code",
|
594 |
+
"execution_count": 17,
|
595 |
+
"metadata": {},
|
596 |
+
"outputs": [],
|
597 |
+
"source": [
|
598 |
+
"# Convert DataFrame to numpy array\n",
|
599 |
+
"X_train_array = X_train.values\n",
|
600 |
+
"\n",
|
601 |
+
"# Reshaping X_train_array to add a time step dimension\n",
|
602 |
+
"X_train_reshaped = X_train_array.reshape(X_train_array.shape[0], 1, X_train_array.shape[1])\n",
|
603 |
+
"\n",
|
604 |
+
"# Assuming X_train_reshaped shape is now (374, 1, 5)\n",
|
605 |
+
"input_shape = X_train_reshaped.shape[1:]\n",
|
606 |
+
"\n",
|
607 |
+
"# Create the model\n",
|
608 |
+
"model = create_model(input_shape=input_shape)"
|
609 |
+
]
|
610 |
+
},
|
611 |
+
{
|
612 |
+
"cell_type": "code",
|
613 |
+
"execution_count": 18,
|
614 |
+
"metadata": {},
|
615 |
+
"outputs": [
|
616 |
+
{
|
617 |
+
"name": "stdout",
|
618 |
+
"output_type": "stream",
|
619 |
+
"text": [
|
620 |
+
"\u001b[1m12/12\u001b[0m \u001b[32mββββββββββββββββββββ\u001b[0m\u001b[37m\u001b[0m \u001b[1m4s\u001b[0m 5ms/step - loss: 0.5131\n"
|
621 |
+
]
|
622 |
+
},
|
623 |
+
{
|
624 |
+
"data": {
|
625 |
+
"text/plain": [
|
626 |
+
"<keras.src.callbacks.history.History at 0x1c3aa79ff50>"
|
627 |
+
]
|
628 |
+
},
|
629 |
+
"execution_count": 18,
|
630 |
+
"metadata": {},
|
631 |
+
"output_type": "execute_result"
|
632 |
+
}
|
633 |
+
],
|
634 |
+
"source": [
|
635 |
+
"#Fitting the model on the training dataset\n",
|
636 |
+
"model.fit(X_train_reshaped, y_train)"
|
637 |
+
]
|
638 |
+
},
|
639 |
+
{
|
640 |
+
"cell_type": "code",
|
641 |
+
"execution_count": 19,
|
642 |
+
"metadata": {},
|
643 |
+
"outputs": [
|
644 |
+
{
|
645 |
+
"name": "stdout",
|
646 |
+
"output_type": "stream",
|
647 |
+
"text": [
|
648 |
+
"2024-05-07 10:28:39,020 WARNING: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n",
|
649 |
+
"See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n",
|
650 |
+
"\n"
|
651 |
+
]
|
652 |
+
}
|
653 |
+
],
|
654 |
+
"source": [
|
655 |
+
"# As X_test['date'] column exists and is in datetime format, we're converting it\n",
|
656 |
+
"X_test['year'] = X_test['date'].dt.year\n",
|
657 |
+
"X_test['month'] = X_test['date'].dt.month\n",
|
658 |
+
"X_test['day'] = X_test['date'].dt.day\n",
|
659 |
+
"\n",
|
660 |
+
"# Dropping the original date column\n",
|
661 |
+
"X_test.drop(columns=['date'], inplace=True)\n",
|
662 |
+
"\n",
|
663 |
+
"# Converting dataframe to numpy array\n",
|
664 |
+
"X_test_array = X_test.to_numpy()\n",
|
665 |
+
"\n",
|
666 |
+
"# Reshape the array to have a shape suitable for LSTM\n",
|
667 |
+
"X_test_array = np.expand_dims(X_test_array, axis=1)"
|
668 |
+
]
|
669 |
+
},
|
670 |
+
{
|
671 |
+
"cell_type": "code",
|
672 |
+
"execution_count": 20,
|
673 |
+
"metadata": {},
|
674 |
+
"outputs": [
|
675 |
+
{
|
676 |
+
"name": "stdout",
|
677 |
+
"output_type": "stream",
|
678 |
+
"text": [
|
679 |
+
"\u001b[1m3/3\u001b[0m \u001b[32mββββββββββββββββββββ\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 307ms/step\n"
|
680 |
+
]
|
681 |
+
}
|
682 |
+
],
|
683 |
+
"source": [
|
684 |
+
"#Predicting y_pred with X_test\n",
|
685 |
+
"y_pred = model.predict(X_test_array)"
|
686 |
+
]
|
687 |
+
},
|
688 |
+
{
|
689 |
+
"cell_type": "code",
|
690 |
+
"execution_count": 21,
|
691 |
+
"metadata": {},
|
692 |
+
"outputs": [
|
693 |
+
{
|
694 |
+
"name": "stdout",
|
695 |
+
"output_type": "stream",
|
696 |
+
"text": [
|
697 |
+
"Connected. Call `.close()` to terminate connection gracefully.\n"
|
698 |
+
]
|
699 |
+
}
|
700 |
+
],
|
701 |
+
"source": [
|
702 |
+
"#Conneting to hopsworks model registry\n",
|
703 |
+
"mr = project.get_model_registry()"
|
704 |
+
]
|
705 |
+
},
|
706 |
+
{
|
707 |
+
"cell_type": "code",
|
708 |
+
"execution_count": 22,
|
709 |
+
"metadata": {},
|
710 |
+
"outputs": [
|
711 |
+
{
|
712 |
+
"data": {
|
713 |
+
"text/plain": [
|
714 |
+
"{'RMSE': 0.3981142064349763}"
|
715 |
+
]
|
716 |
+
},
|
717 |
+
"execution_count": 22,
|
718 |
+
"metadata": {},
|
719 |
+
"output_type": "execute_result"
|
720 |
+
}
|
721 |
+
],
|
722 |
+
"source": [
|
723 |
+
"# Compute RMSE metric for filling the model\n",
|
724 |
+
"rmse = np.sqrt(mean_squared_error(y_test, y_pred))\n",
|
725 |
+
"rmse_metrics = {\"RMSE\": rmse}\n",
|
726 |
+
"rmse_metrics"
|
727 |
+
]
|
728 |
+
},
|
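Since the targets were MinMax-scaled, this RMSE of ~0.398 is in scaled units rather than dollars. A sketch of reporting the error back on the price scale, assuming a scaler fitted on the training target as in the note above:

# Invert the MinMax scaling so the error is expressed in 'open' price units
y_test_price = scaler.inverse_transform(y_test[['open_scaled']])
y_pred_price = scaler.inverse_transform(y_pred)
rmse_usd = np.sqrt(mean_squared_error(y_test_price, y_pred_price))
print(rmse_usd)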
729 |
+
{
|
730 |
+
"cell_type": "code",
|
731 |
+
"execution_count": 23,
|
732 |
+
"metadata": {},
|
733 |
+
"outputs": [],
|
734 |
+
"source": [
|
735 |
+
"#Setting up the model schema\n",
|
736 |
+
"input_schema = Schema(X_train)\n",
|
737 |
+
"output_schema = Schema(y_train)\n",
|
738 |
+
"model_schema = ModelSchema(input_schema, output_schema)"
|
739 |
+
]
|
740 |
+
},
|
741 |
+
{
|
742 |
+
"cell_type": "code",
|
743 |
+
"execution_count": 24,
|
744 |
+
"metadata": {},
|
745 |
+
"outputs": [],
|
746 |
+
"source": [
|
747 |
+
"#Creating a file colled 'stock_model'\n",
|
748 |
+
"model_dir=\"stock_model\"\n",
|
749 |
+
"if os.path.isdir(model_dir) == False:\n",
|
750 |
+
" os.mkdir(model_dir)"
|
751 |
+
]
|
752 |
+
},
|
753 |
+
{
|
754 |
+
"cell_type": "code",
|
755 |
+
"execution_count": 25,
|
756 |
+
"metadata": {},
|
757 |
+
"outputs": [
|
758 |
+
{
|
759 |
+
"data": {
|
760 |
+
"application/vnd.jupyter.widget-view+json": {
|
761 |
+
"model_id": "a6169babeb154f54bdbb9b0b490333ab",
|
762 |
+
"version_major": 2,
|
763 |
+
"version_minor": 0
|
764 |
+
},
|
765 |
+
"text/plain": [
|
766 |
+
" 0%| | 0/6 [00:00<?, ?it/s]"
|
767 |
+
]
|
768 |
+
},
|
769 |
+
"metadata": {},
|
770 |
+
"output_type": "display_data"
|
771 |
+
},
|
772 |
+
{
|
773 |
+
"data": {
|
774 |
+
"application/vnd.jupyter.widget-view+json": {
|
775 |
+
"model_id": "f5749cebd1fe422dbeaba0ec2718a3f9",
|
776 |
+
"version_major": 2,
|
777 |
+
"version_minor": 0
|
778 |
+
},
|
779 |
+
"text/plain": [
|
780 |
+
"Uploading: 0.000%| | 0/561 elapsed<00:00 remaining<?"
|
781 |
+
]
|
782 |
+
},
|
783 |
+
"metadata": {},
|
784 |
+
"output_type": "display_data"
|
785 |
+
},
|
786 |
+
{
|
787 |
+
"name": "stdout",
|
788 |
+
"output_type": "stream",
|
789 |
+
"text": [
|
790 |
+
"Model created, explore it at https://c.app.hopsworks.ai:443/p/693399/models/stock_pred_model/6\n"
|
791 |
+
]
|
792 |
+
},
|
793 |
+
{
|
794 |
+
"data": {
|
795 |
+
"text/plain": [
|
796 |
+
"Model(name: 'stock_pred_model', version: 6)"
|
797 |
+
]
|
798 |
+
},
|
799 |
+
"execution_count": 25,
|
800 |
+
"metadata": {},
|
801 |
+
"output_type": "execute_result"
|
802 |
+
}
|
803 |
+
],
|
804 |
+
"source": [
|
805 |
+
"#Saving the model to hopsworks model registry\n",
|
806 |
+
"stock_pred_model = mr.tensorflow.create_model(\n",
|
807 |
+
" name=\"stock_pred_model\",\n",
|
808 |
+
" metrics= rmse_metrics,\n",
|
809 |
+
" model_schema=model_schema,\n",
|
810 |
+
" description=\"Stock Market TSLA Predictor from News Sentiment\",\n",
|
811 |
+
" )\n",
|
812 |
+
"\n",
|
813 |
+
"stock_pred_model.save(model_dir)"
|
814 |
+
]
|
815 |
+
}
|
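The trained network itself is never written into model_dir before stock_pred_model.save(model_dir) uploads the directory, which is what later makes the inference notebook fail to find a model file. A sketch of persisting the Keras model first; the file name is an assumption, not from the original:

# Write the trained model into the directory that gets uploaded to the registry
model.save(os.path.join(model_dir, "stock_model.keras"))
stock_pred_model.save(model_dir)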
816 |
+
],
|
817 |
+
"metadata": {
|
818 |
+
"kernelspec": {
|
819 |
+
"display_name": "base",
|
820 |
+
"language": "python",
|
821 |
+
"name": "python3"
|
822 |
+
},
|
823 |
+
"language_info": {
|
824 |
+
"codemirror_mode": {
|
825 |
+
"name": "ipython",
|
826 |
+
"version": 3
|
827 |
+
},
|
828 |
+
"file_extension": ".py",
|
829 |
+
"mimetype": "text/x-python",
|
830 |
+
"name": "python",
|
831 |
+
"nbconvert_exporter": "python",
|
832 |
+
"pygments_lexer": "ipython3",
|
833 |
+
"version": "3.11.9"
|
834 |
+
},
|
835 |
+
"orig_nbformat": 4
|
836 |
+
},
|
837 |
+
"nbformat": 4,
|
838 |
+
"nbformat_minor": 2
|
839 |
+
}
|
Stocks news prediction/Notebooks/8_inference_pipeline.ipynb
ADDED
@@ -0,0 +1,315 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [
|
8 |
+
{
|
9 |
+
"name": "stdout",
|
10 |
+
"output_type": "stream",
|
11 |
+
"text": [
|
12 |
+
"Connected. Call `.close()` to terminate connection gracefully.\n",
|
13 |
+
"\n",
|
14 |
+
"Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/549016\n",
|
15 |
+
"Connected. Call `.close()` to terminate connection gracefully.\n",
|
16 |
+
"Connected. Call `.close()` to terminate connection gracefully.\n"
|
17 |
+
]
|
18 |
+
},
|
19 |
+
{
|
20 |
+
"name": "stderr",
|
21 |
+
"output_type": "stream",
|
22 |
+
"text": [
|
23 |
+
"../src/arrow/status.cc:137: DoAction result was not fully consumed: Cancelled: Flight cancelled call, with message: CANCELLED. Detail: Cancelled\n"
|
24 |
+
]
|
25 |
+
},
|
26 |
+
{
|
27 |
+
"name": "stdout",
|
28 |
+
"output_type": "stream",
|
29 |
+
"text": [
|
30 |
+
"Training dataset job started successfully, you can follow the progress at \n",
|
31 |
+
"https://c.app.hopsworks.ai/p/549016/jobs/named/tesla_stocks_fv_1_create_fv_td_07052024090051/executions\n",
|
32 |
+
"2024-05-07 11:02:21,906 WARNING: VersionWarning: Incremented version to `1`.\n",
|
33 |
+
"\n",
|
34 |
+
"\u001b[1m12/12\u001b[0m \u001b[32mββββββββββββββββββββ\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 3ms/step - loss: 0.5555\n",
|
35 |
+
"\u001b[1m3/3\u001b[0m \u001b[32mββββββββββββββββββββ\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 61ms/step\n",
|
36 |
+
"Connected. Call `.close()` to terminate connection gracefully.\n"
|
37 |
+
]
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"data": {
|
41 |
+
"application/vnd.jupyter.widget-view+json": {
|
42 |
+
"model_id": "1dd33e12e80548c99f5a605b28f82196",
|
43 |
+
"version_major": 2,
|
44 |
+
"version_minor": 0
|
45 |
+
},
|
46 |
+
"text/plain": [
|
47 |
+
" 0%| | 0/6 [00:00<?, ?it/s]"
|
48 |
+
]
|
49 |
+
},
|
50 |
+
"metadata": {},
|
51 |
+
"output_type": "display_data"
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"data": {
|
55 |
+
"application/vnd.jupyter.widget-view+json": {
|
56 |
+
"model_id": "b636479e09e94fb2a0c5736c2368aec4",
|
57 |
+
"version_major": 2,
|
58 |
+
"version_minor": 0
|
59 |
+
},
|
60 |
+
"text/plain": [
|
61 |
+
"Uploading: 0.000%| | 0/528 elapsed<00:00 remaining<?"
|
62 |
+
]
|
63 |
+
},
|
64 |
+
"metadata": {},
|
65 |
+
"output_type": "display_data"
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"name": "stdout",
|
69 |
+
"output_type": "stream",
|
70 |
+
"text": [
|
71 |
+
"Model created, explore it at https://c.app.hopsworks.ai:443/p/549016/models/stock_pred_model/7\n"
|
72 |
+
]
|
73 |
+
}
|
74 |
+
],
|
75 |
+
"source": [
|
76 |
+
"import pandas as pd \n",
|
77 |
+
"import hopsworks \n",
|
78 |
+
"from datetime import datetime, timedelta\n",
|
79 |
+
"from SML import training_pipeline"
|
80 |
+
]
|
81 |
+
},
|
82 |
+
{
|
83 |
+
"cell_type": "code",
|
84 |
+
"execution_count": 2,
|
85 |
+
"metadata": {},
|
86 |
+
"outputs": [
|
87 |
+
{
|
88 |
+
"name": "stdout",
|
89 |
+
"output_type": "stream",
|
90 |
+
"text": [
|
91 |
+
"Connection closed.\n",
|
92 |
+
"Connected. Call `.close()` to terminate connection gracefully.\n",
|
93 |
+
"\n",
|
94 |
+
"Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/549016\n",
|
95 |
+
"Connected. Call `.close()` to terminate connection gracefully.\n",
|
96 |
+
"Connected. Call `.close()` to terminate connection gracefully.\n"
|
97 |
+
]
|
98 |
+
}
|
99 |
+
],
|
100 |
+
"source": [
|
101 |
+
"project = hopsworks.login()\n",
|
102 |
+
"fs= project.get_feature_store()\n",
|
103 |
+
"mr = project.get_model_registry() "
|
104 |
+
]
|
105 |
+
},
|
106 |
+
{
|
107 |
+
"cell_type": "code",
|
108 |
+
"execution_count": 3,
|
109 |
+
"metadata": {},
|
110 |
+
"outputs": [
|
111 |
+
{
|
112 |
+
"name": "stdout",
|
113 |
+
"output_type": "stream",
|
114 |
+
"text": [
|
115 |
+
"2024-05-06\n"
|
116 |
+
]
|
117 |
+
}
|
118 |
+
],
|
119 |
+
"source": [
|
120 |
+
"start_date = datetime.now() - timedelta(hours=24)\n",
|
121 |
+
"print(start_date.strftime(\"%Y-%m-%d\"))"
|
122 |
+
]
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"cell_type": "code",
|
126 |
+
"execution_count": 12,
|
127 |
+
"metadata": {},
|
128 |
+
"outputs": [
|
129 |
+
{
|
130 |
+
"name": "stdout",
|
131 |
+
"output_type": "stream",
|
132 |
+
"text": [
|
133 |
+
"2024-05-07\n"
|
134 |
+
]
|
135 |
+
}
|
136 |
+
],
|
137 |
+
"source": [
|
138 |
+
"end_date = datetime.now().strftime(\"%Y-%m-%d\")\n",
|
139 |
+
"print(end_date)"
|
140 |
+
]
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"cell_type": "code",
|
144 |
+
"execution_count": 13,
|
145 |
+
"metadata": {},
|
146 |
+
"outputs": [],
|
147 |
+
"source": [
|
148 |
+
"feature_view = fs.get_feature_view('tesla_stocks_fv', 1)\n",
|
149 |
+
"feature_view.init_batch_scoring(training_dataset_version=1)"
|
150 |
+
]
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"cell_type": "code",
|
154 |
+
"execution_count": 14,
|
155 |
+
"metadata": {},
|
156 |
+
"outputs": [
|
157 |
+
{
|
158 |
+
"name": "stdout",
|
159 |
+
"output_type": "stream",
|
160 |
+
"text": [
|
161 |
+
"WITH right_fg0 AS (SELECT *\n",
|
162 |
+
"FROM (SELECT `fg1`.`date` `date`, `fg1`.`ticker` `ticker`, `fg1`.`ticker` `join_pk_ticker`, `fg1`.`date` `join_evt_date`, `fg0`.`sentiment` `sentiment`, RANK() OVER (PARTITION BY `fg1`.`ticker`, `fg1`.`date` ORDER BY `fg0`.`date` DESC) pit_rank_hopsworks\n",
|
163 |
+
"FROM `mtzeve_featurestore`.`tesla_stock_1` `fg1`\n",
|
164 |
+
"INNER JOIN `mtzeve_featurestore`.`news_sentiment_updated_1` `fg0` ON `fg1`.`ticker` = `fg0`.`ticker` AND `fg1`.`date` >= `fg0`.`date`) NA\n",
|
165 |
+
"WHERE `pit_rank_hopsworks` = 1) (SELECT `right_fg0`.`date` `date`, `right_fg0`.`ticker` `ticker`, `right_fg0`.`sentiment` `sentiment`\n",
|
166 |
+
"FROM right_fg0)\n"
|
167 |
+
]
|
168 |
+
}
|
169 |
+
],
|
170 |
+
"source": [
|
171 |
+
"print(feature_view.get_batch_query())"
|
172 |
+
]
|
173 |
+
},
|
174 |
+
{
|
175 |
+
"cell_type": "code",
|
176 |
+
"execution_count": 16,
|
177 |
+
"metadata": {},
|
178 |
+
"outputs": [
|
179 |
+
{
|
180 |
+
"name": "stdout",
|
181 |
+
"output_type": "stream",
|
182 |
+
"text": [
|
183 |
+
"Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.11s) \n"
|
184 |
+
]
|
185 |
+
},
|
186 |
+
{
|
187 |
+
"data": {
|
188 |
+
"text/html": [
|
189 |
+
"<div>\n",
|
190 |
+
"<style scoped>\n",
|
191 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
192 |
+
" vertical-align: middle;\n",
|
193 |
+
" }\n",
|
194 |
+
"\n",
|
195 |
+
" .dataframe tbody tr th {\n",
|
196 |
+
" vertical-align: top;\n",
|
197 |
+
" }\n",
|
198 |
+
"\n",
|
199 |
+
" .dataframe thead th {\n",
|
200 |
+
" text-align: right;\n",
|
201 |
+
" }\n",
|
202 |
+
"</style>\n",
|
203 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
204 |
+
" <thead>\n",
|
205 |
+
" <tr style=\"text-align: right;\">\n",
|
206 |
+
" <th></th>\n",
|
207 |
+
" <th>date</th>\n",
|
208 |
+
" <th>ticker</th>\n",
|
209 |
+
" <th>sentiment</th>\n",
|
210 |
+
" </tr>\n",
|
211 |
+
" </thead>\n",
|
212 |
+
" <tbody>\n",
|
213 |
+
" </tbody>\n",
|
214 |
+
"</table>\n",
|
215 |
+
"</div>"
|
216 |
+
],
|
217 |
+
"text/plain": [
|
218 |
+
"Empty DataFrame\n",
|
219 |
+
"Columns: [date, ticker, sentiment]\n",
|
220 |
+
"Index: []"
|
221 |
+
]
|
222 |
+
},
|
223 |
+
"execution_count": 16,
|
224 |
+
"metadata": {},
|
225 |
+
"output_type": "execute_result"
|
226 |
+
}
|
227 |
+
],
|
228 |
+
"source": [
|
229 |
+
"# we had problems fetching the data from fv with get_batch_data function, tried everything and it just doesnt work \n",
|
230 |
+
"tsla_df_b = feature_view.get_batch_data(start_time = start_date, end_time = end_date)\n",
|
231 |
+
"tsla_df_b"
|
232 |
+
]
|
233 |
+
},
|
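The empty frame is consistent with the point-in-time join printed above: within a 24-hour window there may be no tesla_stock row that also has an earlier news_sentiment row. Two checks worth running, sketched under the assumption that the feature groups do contain recent rows:

# 1) Widen the window to rule out a too-narrow time range
wide_df = feature_view.get_batch_data(
    start_time=datetime.now() - timedelta(days=30),
    end_time=datetime.now(),
)
print(len(wide_df))

# 2) Read the underlying feature group directly to confirm data is landing
tesla_fg = fs.get_feature_group("tesla_stock", version=1)
print(tesla_fg.read().tail())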
234 |
+
{
|
235 |
+
"cell_type": "code",
|
236 |
+
"execution_count": 11,
|
237 |
+
"metadata": {},
|
238 |
+
"outputs": [
|
239 |
+
{
|
240 |
+
"name": "stdout",
|
241 |
+
"output_type": "stream",
|
242 |
+
"text": [
|
243 |
+
"Downloading model artifact (0 dirs, 1 files)... DONE\r"
|
244 |
+
]
|
245 |
+
},
|
246 |
+
{
|
247 |
+
"ename": "FileNotFoundError",
|
248 |
+
"evalue": "[Errno 2] No such file or directory: '/var/folders/ty/fy7wpfqs4c39hnsfl21_rzyc0000gn/T/d6edbe1d-de39-488f-b12c-c0cbfd5ded37/stock_pred_model/7stock_model'",
|
249 |
+
"output_type": "error",
|
250 |
+
"traceback": [
|
251 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
252 |
+
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
|
253 |
+
"\u001b[1;32m/Users/manos/Documents/BDS/Mlops_mod_final/MLops_mod/inference_pipeline.ipynb Cell 8\u001b[0m line \u001b[0;36m5\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/manos/Documents/BDS/Mlops_mod_final/MLops_mod/inference_pipeline.ipynb#X11sZmlsZQ%3D%3D?line=1'>2</a>\u001b[0m the_model \u001b[39m=\u001b[39m mr\u001b[39m.\u001b[39mget_model(\u001b[39m\"\u001b[39m\u001b[39mstock_pred_model\u001b[39m\u001b[39m\"\u001b[39m, version\u001b[39m=\u001b[39m\u001b[39m7\u001b[39m)\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/manos/Documents/BDS/Mlops_mod_final/MLops_mod/inference_pipeline.ipynb#X11sZmlsZQ%3D%3D?line=2'>3</a>\u001b[0m model_dir \u001b[39m=\u001b[39m the_model\u001b[39m.\u001b[39mdownload()\n\u001b[0;32m----> <a href='vscode-notebook-cell:/Users/manos/Documents/BDS/Mlops_mod_final/MLops_mod/inference_pipeline.ipynb#X11sZmlsZQ%3D%3D?line=4'>5</a>\u001b[0m model \u001b[39m=\u001b[39m joblib\u001b[39m.\u001b[39mload(model_dir \u001b[39m+\u001b[39m \u001b[39m'\u001b[39m\u001b[39mstock_model\u001b[39m\u001b[39m'\u001b[39m)\n",
|
254 |
+
"File \u001b[0;32m/Applications/anaconda3/lib/python3.11/site-packages/joblib/numpy_pickle.py:650\u001b[0m, in \u001b[0;36mload\u001b[0;34m(filename, mmap_mode)\u001b[0m\n\u001b[1;32m 648\u001b[0m obj \u001b[39m=\u001b[39m _unpickle(fobj)\n\u001b[1;32m 649\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 650\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39m(filename, \u001b[39m'\u001b[39m\u001b[39mrb\u001b[39m\u001b[39m'\u001b[39m) \u001b[39mas\u001b[39;00m f:\n\u001b[1;32m 651\u001b[0m \u001b[39mwith\u001b[39;00m _read_fileobject(f, filename, mmap_mode) \u001b[39mas\u001b[39;00m fobj:\n\u001b[1;32m 652\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(fobj, \u001b[39mstr\u001b[39m):\n\u001b[1;32m 653\u001b[0m \u001b[39m# if the returned file object is a string, this means we\u001b[39;00m\n\u001b[1;32m 654\u001b[0m \u001b[39m# try to load a pickle file generated with an version of\u001b[39;00m\n\u001b[1;32m 655\u001b[0m \u001b[39m# Joblib so we load it with joblib compatibility function.\u001b[39;00m\n",
|
255 |
+
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/var/folders/ty/fy7wpfqs4c39hnsfl21_rzyc0000gn/T/d6edbe1d-de39-488f-b12c-c0cbfd5ded37/stock_pred_model/7stock_model'"
|
256 |
+
]
|
257 |
+
}
|
258 |
+
],
|
259 |
+
"source": [
|
260 |
+
"import joblib\n",
|
261 |
+
"the_model = mr.get_model(\"stock_pred_model\", version=7)\n",
|
262 |
+
"model_dir = the_model.download()\n",
|
263 |
+
"\n",
|
264 |
+
"model = joblib.load(model_dir + 'stock_model')"
|
265 |
+
]
|
266 |
+
},
|
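The FileNotFoundError above is a path bug: model_dir + 'stock_model' concatenates without a separator, yielding .../7stock_model. And since the registered artifact is a Keras network, keras loading is the safer choice than joblib. A sketch of the corrected load; the .keras file name is an assumption tied to the training-side sketch:

import os
from tensorflow.keras.models import load_model

the_model = mr.get_model("stock_pred_model", version=7)
model_dir = the_model.download()

# Join path components instead of concatenating strings
model = load_model(os.path.join(model_dir, "stock_model.keras"))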
267 |
+
{
|
268 |
+
"cell_type": "code",
|
269 |
+
"execution_count": null,
|
270 |
+
"metadata": {},
|
271 |
+
"outputs": [],
|
272 |
+
"source": [
|
273 |
+
"predictions = model.predict(tsla_df_b)"
|
274 |
+
]
|
275 |
+
},
|
276 |
+
{
|
277 |
+
"cell_type": "code",
|
278 |
+
"execution_count": null,
|
279 |
+
"metadata": {},
|
280 |
+
"outputs": [],
|
281 |
+
"source": [
|
282 |
+
"predictions "
|
283 |
+
]
|
284 |
+
},
|
285 |
+
{
|
286 |
+
"cell_type": "code",
|
287 |
+
"execution_count": null,
|
288 |
+
"metadata": {},
|
289 |
+
"outputs": [],
|
290 |
+
"source": []
|
291 |
+
}
|
292 |
+
],
|
293 |
+
"metadata": {
|
294 |
+
"kernelspec": {
|
295 |
+
"display_name": "base",
|
296 |
+
"language": "python",
|
297 |
+
"name": "python3"
|
298 |
+
},
|
299 |
+
"language_info": {
|
300 |
+
"codemirror_mode": {
|
301 |
+
"name": "ipython",
|
302 |
+
"version": 3
|
303 |
+
},
|
304 |
+
"file_extension": ".py",
|
305 |
+
"mimetype": "text/x-python",
|
306 |
+
"name": "python",
|
307 |
+
"nbconvert_exporter": "python",
|
308 |
+
"pygments_lexer": "ipython3",
|
309 |
+
"version": "3.11.4"
|
310 |
+
},
|
311 |
+
"orig_nbformat": 4
|
312 |
+
},
|
313 |
+
"nbformat": 4,
|
314 |
+
"nbformat_minor": 2
|
315 |
+
}
|
Stocks news prediction/SML/__pycache__/feature_pipeline.cpython-311.pyc
ADDED
Binary file (2.74 kB)
|
|
Stocks news prediction/SML/__pycache__/news_preprocessing.cpython-311.pyc
ADDED
Binary file (2.54 kB)
|
|
feature_pipeline.py → Stocks news prediction/SML/feature_pipeline.py
RENAMED
@@ -1,10 +1,3 @@
-# %%
-from dotenv import load_dotenv
-import os
-
-# %%
-#!pip install great_expectations==0.18.12
-
 # %%
 # Import necessary libraries
 import pandas as pd # For data manipulation using DataFrames
@@ -13,119 +6,85 @@ import matplotlib.pyplot as plt # For data visualization
 import os # For operating system-related tasks
 import joblib # For saving and loading models
 import hopsworks # For getting access to hopsworks
-
-
+import re
 
 # Import specific modules from scikit-learn
 from sklearn.preprocessing import StandardScaler, OneHotEncoder # For data preprocessing
 from sklearn.metrics import accuracy_score # For evaluating model accuracy
 
-
-
-
-
-#load_dotenv()
-
-#api_key = os.environ.get('stocks_api') # Replace this with your actual API key
-#ts = TimeSeries(key=api_key, output_format='pandas')
-
-# Fetch daily adjusted stock prices; adjust the symbol as needed
-#data, meta_data = ts.get_daily(symbol='TSLA', outputsize='full')
-
-#print(data.head())
-
-# %%
-#data.info()
-
-# %%
-#meta_data
-
-# %%
-# Define your file path and name
-#file_path = 'TSLA_stock_price.csv' # Customize the path and filename
-
-# Save the DataFrame to CSV
-#stock_data.to_csv(file_path)
-
-#print(f"Data saved to {file_path}")
-
-
-# %%
-# Load and display the data from CSV to confirm
-tsla_df = pd.read_csv('TSLA_stock_price.csv')
-print(tsla_df.head())
-
+from dotenv import load_dotenv
+import os
+
+load_dotenv()
 
-#
+#Connecting to hopsworks
 api_key = os.environ.get('hopsworks_api')
 project = hopsworks.login(api_key_value=api_key)
 fs = project.get_feature_store()
 
 # %%
-
+# Load and display the data from CSV to confirm
+tsla_df = pd.read_csv('TSLA_stock_price.csv')
+print(tsla_df.head())
 
 # %%
+#Defining a function to clean the column names
 def clean_column_name(name):
     # Remove all non-letter characters
     cleaned_name = re.sub(r'[^a-zA-Z]', '', name)
     return cleaned_name
 
-
 # %%
 tsla_df
 
 # %%
-#
+# Cleaning up column names for 'tsla_df'
 tsla_df.columns = [clean_column_name(col) for col in tsla_df.columns]
-
-
-# %%
 print(tsla_df.columns)
 
-
 # %%
-
-
-# Assuming tsla_df is your pandas DataFrame
-# Convert the "date" column to timestamp
+# Converting the "date" column to timestamp
 tsla_df['date'] = pd.to_datetime(tsla_df['date'])
 
-
 # %%
-#
+# Defining the stocks feature group
 tesla_fg = fs.get_or_create_feature_group(
     name="tesla_stock",
     description="Tesla stock dataset from alpha vantage",
-    version=…,
+    version=1,
     primary_key=["ticker"],
     event_time=['date'],
    online_enabled=False,
 )
 
 # %%
+#Inserting the stock data into the stocks feature group
 tesla_fg.insert(tsla_df, write_options={"wait_for_job" : False})
 
 # %%
+#Collecting news df
 news_df = pd.read_csv('news_articles_ema.csv')
 
-
 # %%
+#Dropping exp mean 7 days
 news_df_updated = news_df.drop(columns=['exp_mean_7_days'])
 
 # %%
+#Updating date to datetime
 news_df_updated['date'] = pd.to_datetime(news_df_updated['date'])
 
 # %%
+#Defining the news feature group
 news_sentiment_fg = fs.get_or_create_feature_group(
     name='news_sentiment_updated',
     description='News sentiment from Polygon',
-    version=…,
+    version=1,
     primary_key=['ticker'],
     event_time=['date'],
     online_enabled=False,
 )
 
 # %%
+#Inserting the news data into the news feature group
 news_sentiment_fg.insert(news_df_updated)
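A quick way to sanity-check the two inserts above is to read the feature groups back from Hopsworks. A minimal sketch, assuming the same `fs` login as in the file (`read()` is the hsfs call that materializes a feature group into a pandas DataFrame):

# Minimal sketch: read the freshly inserted feature groups back for inspection.
tesla_check = fs.get_feature_group("tesla_stock", version=1).read()
news_check = fs.get_feature_group("news_sentiment_updated", version=1).read()
print(tesla_check.shape, news_check.shape)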
feature_view.py → Stocks news prediction/SML/feature_view.py
RENAMED
@@ -1,5 +1,5 @@
 # %%
-#
+# Importing necessary libraries
 import pandas as pd # For data manipulation using DataFrames
 import numpy as np # For numerical operations
 import matplotlib.pyplot as plt # For data visualization
@@ -7,39 +7,34 @@ import os # For operating system-related tasks
 import joblib # For saving and loading models
 import hopsworks # For getting access to hopsworks
 
+from feature_pipeline import tesla_fg #Loading in the tesla_fg
+from feature_pipeline import news_sentiment_fg #Loading in the news_fg
 
-
-# Import specific modules from scikit-learn
-from sklearn.preprocessing import StandardScaler, OneHotEncoder # For data preprocessing
-from sklearn.metrics import accuracy_score # For evaluating model accuracy
-
-# %%
-from feature_pipeline import tesla_fg
-from feature_pipeline import news_sentiment_fg
-
-# %%
+#Making the notebook able to fetch from the .env file
 from dotenv import load_dotenv
 import os
 
 load_dotenv()
 
-#
+#Getting connected to hopsworks
 api_key = os.environ.get('hopsworks_api')
 project = hopsworks.login(api_key_value=api_key)
 fs = project.get_feature_store()
 
 # %%
+#Defining the function to create feature view
+
 def create_stocks_feature_view(fs, version):
 
     # Loading in the feature groups
     tesla_fg = fs.get_feature_group('tesla_stock', version=1)
     news_sentiment_fg = fs.get_feature_group('news_sentiment_updated', version=1)
 
-    #
+    # Defining the query
     ds_query = tesla_fg.select(['date', 'open', 'ticker'])\
         .join(news_sentiment_fg.select(['sentiment']))
 
-    #
+    # Creating the feature view
     feature_view = fs.create_feature_view(
         name='tesla_stocks_fv',
         query=ds_query,
@@ -49,6 +44,7 @@ def create_stocks_feature_view(fs, version):
     return feature_view, tesla_fg
 
 # %%
+#Creating the feature view
 try:
     feature_view = fs.get_feature_view("tesla_stocks_fv", version=1)
     tesla_fg = fs.get_feature_group('tesla_stock', version=1)
@@ -56,6 +52,7 @@ except:
     feature_view, tesla_fg = create_stocks_feature_view(fs, 1)
 
 # %%
+#Defining a function to get fixed data from the feature view
 def fix_data_from_feature_view(df,start_date,end_date):
     df = df.sort_values("date")
     df = df.reset_index()
@@ -77,27 +74,4 @@ def fix_data_from_feature_view(df,start_date,end_date):
 
     return filtered_df
 
-# %%
-#def create_stocks_feature_view(fs, version):
-
-    #Loading in the feature groups
-    # tesla_fg = fs.get_feature_group('tesla_stock', version = 3)
-    # news_sentiment_fg = fs.get_feature_group('news_sentiment_updated', version = 2)
-
-    # ds_query = tesla_fg.select(['date','open', 'ticker'])\
-    #     .join(news_sentiment_fg.select_except(['ticker','time', 'amp_url', 'image_url']))
-
-    # return (fs.create_tesla_feature_view(
-    #     name = 'tsla_stocks_fv',
-    #     query = ds_query,
-    #     labels=['ticker']
-    # ), tesla_fg)
-
-# %%
-#try:
-    # feature_view = fs.get_feature_view("tsla_stocks_fv", version=1)
-    # tesla_fg = fs.get_feature_group('tesla_stock', version=3)
-#except:
-    # feature_view, tesla_fg = create_stocks_feature_view(fs, 1)
-
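Once `tesla_stocks_fv` exists, joined rows can be pulled straight from it. A minimal sketch, assuming the `feature_view` object created above (`get_batch_data()` is the hsfs call that returns the feature view's query result as a pandas DataFrame):

# Minimal sketch: inspect the joined date/open/ticker/sentiment rows.
batch_df = feature_view.get_batch_data()
print(batch_df.head())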
Stocks news prediction/SML/historical_news.py
ADDED
@@ -0,0 +1,120 @@
+# %%
+#Importing necessary libraries
+from dotenv import load_dotenv
+from datetime import datetime, timedelta
+import requests
+import os
+import time
+import pandas as pd
+from news_preprocessing import * #Importing everything from 'news_preprocessing'
+load_dotenv()
+
+# %%
+#Defining a function for fetching news
+
+def fetch_news(api_key, ticker, start_date, end_date):
+    base_url = os.environ.get("endpointnewsp")
+    headers = {"Authorization": f"Bearer {api_key}"}
+    all_news = []
+
+    current_date = start_date
+
+    while current_date <= end_date:
+        batch_end_date = current_date + timedelta(days=50)
+        if batch_end_date > end_date:
+            batch_end_date = end_date
+
+        params = {
+            "ticker": ticker,
+            "published_utc.gte": current_date.strftime('%Y-%m-%d'),
+            "published_utc.lte": batch_end_date.strftime('%Y-%m-%d'),
+            "limit": 50,
+            "sort": "published_utc"
+        }
+
+        try:
+            response = requests.get(base_url, headers=headers, params=params)
+            if response.status_code == 200:
+                data = response.json()
+                articles = data.get('results', [])
+
+                # Creating a DataFrame from articles
+                df = pd.DataFrame(articles)
+
+                # Adding primary_key column if ticker is found
+                df['ticker'] = df['tickers'].apply(lambda x: ticker if ticker in x else None)
+
+                all_news.append(df) # Append DataFrame to the list
+                print(f"Fetched {len(articles)} articles from {current_date.strftime('%Y-%m-%d')} to {batch_end_date.strftime('%Y-%m-%d')}")
+                current_date = batch_end_date + timedelta(days=1)
+            elif response.status_code == 429:
+                print("Rate limit reached. Waiting to retry...")
+                time.sleep(60) # Wait for 60 seconds or as recommended by the API
+                continue # Retry the current request
+            else:
+                print(f"Failed to fetch data: {response.status_code}, {response.text}")
+                break
+        except Exception as e:
+            print(f"An error occurred: {e}")
+            break
+
+    return pd.concat(all_news, ignore_index=True)
+
+#Usage
+api_key = os.environ.get('newsp_api')
+ticker = 'TSLA'
+end_date = datetime.now() - timedelta(days=1) # Yesterday's date
+start_date = end_date - timedelta(days=365 * 2)
+news_articles = fetch_news(api_key, ticker, start_date, end_date)
+print(f"Total articles fetched: {len(news_articles)}")
+
+# %%
+# Process the news articles
+df = process_news_articles(news_articles)
+
+# %%
+df.info()
+
+# %%
+df.head()
+
+# %%
+df = df.sort_index(ascending=False)
+
+# %%
+#Putting the news articles into a csv
+df.to_csv('news_articles.csv', index=False)
+
+# %%
+df_processed = exponential_moving_average(df, window=7)
+
+# %%
+df_processed.to_csv('news_articles_ema.csv', index=False)
+
+# %%
+df_processed.head()
+
+# %%
+df_processed.tail()
+
+# %%
+print(df_processed['date'].min())
+print(df_processed['date'].max())
+
+# %%
+print(df_processed['date'].max() - df_processed['date'].min())
+
+# %%
+df_processed.shape
+
+# %%
+duplicates = df_processed[df_processed.duplicated('date')]
+
+# %%
+duplicates.shape
+
+# %%
+df_processed.head()
+
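The 429 branch above retries after a fixed 60-second sleep, which can loop indefinitely against a strict free-tier quota. A variant with a capped exponential backoff, sketched under the same `requests`/`time` usage as the file (the `retries` counter and `max_retries` cap are illustrative names, not part of the commit):

retries = 0
max_retries = 5  # illustrative cap, not from the original file
while retries < max_retries:
    response = requests.get(base_url, headers=headers, params=params)
    if response.status_code != 429:
        break
    wait = 60 * (2 ** retries)  # back off: 60s, 120s, 240s, ...
    print(f"Rate limit reached. Waiting {wait}s to retry...")
    time.sleep(wait)
    retries += 1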
Stocks news prediction/SML/historical_stock.py
ADDED
@@ -0,0 +1,51 @@
+# %%
+#Importing necessary libraries
+from dotenv import load_dotenv
+import os
+from alpha_vantage.timeseries import TimeSeries
+import pandas as pd
+import hopsworks
+import re
+import modal
+#preprocessing
+import requests
+import pandas as pd
+import json
+#import pandas_market_calendars as mcal
+import datetime
+import numpy as np
+from datetime import timedelta
+load_dotenv() #Making the .env file work
+
+# %%
+#Setting up the API key to be able to fetch stocks from Alpha Vantage
+
+api_key = os.environ.get('stocks_api')
+ts = TimeSeries(key=api_key, output_format='pandas')
+
+#Defining a function to fetch stocks
+
+def fetch_stock_prices(symbol):
+    # Fetch daily adjusted stock prices; adjust the symbol as needed
+    data, meta_data = ts.get_daily(symbol=symbol, outputsize='full')
+
+    # Add a new column named 'ticker' and fill it with the ticker name
+    data['ticker'] = symbol
+
+    return data
+
+#Usage
+symbol = 'TSLA'
+stock_data = fetch_stock_prices(symbol)
+print(stock_data.head())
+
+# %%
+# Defining the file path and name
+file_path = 'TSLA_stock_price.csv'
+
+# Saving the DataFrame to CSV
+stock_data.to_csv(file_path)
+
+print(f"Data saved to {file_path}")
+
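One detail worth knowing about the Alpha Vantage pandas output: `get_daily` names its columns '1. open', '2. high', '3. low', '4. close', '5. volume', which is why the preprocessing below strips all non-letter characters from column names. A minimal illustration of that cleanup:

import re

def clean_column_name(name):
    # Remove all non-letter characters, e.g. '1. open' -> 'open'
    return re.sub(r'[^a-zA-Z]', '', name)

print(clean_column_name('1. open'))    # open
print(clean_column_name('5. volume'))  # volume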
news_preprocessing.py → Stocks news prediction/SML/news_preprocessing.py
RENAMED
@@ -1,4 +1,5 @@
 # %%
+#Importing necessary libraries
 from dotenv import load_dotenv
 from datetime import datetime, timedelta
 import requests
@@ -8,6 +9,7 @@ import pandas as pd
 from textblob import TextBlob
 
 # %%
+#Defining a function to process news articles
 def process_news_articles(news_articles):
     # Convert list of dictionaries to DataFrame
     df = pd.DataFrame(news_articles)
@@ -27,19 +29,21 @@ def process_news_articles(news_articles):
     df['date'] = df['published_utc'].dt.date
     df['time'] = df['published_utc'].dt.time
 
-    #
+    # Dropping unnecessary columns
     df.drop(['published_utc'], axis=1, inplace=True)
     # set date to index
     df = df.set_index("date")
     df.reset_index(inplace=True)
     df.index = pd.to_datetime(df.index)
-    df = df.groupby(['date', 'ticker'])['sentiment'].mean().reset_index(
+    df = df.groupby(['date', 'ticker'])['sentiment'].mean().reset_index()
 
     return df
 
 # %%
+#Defining a function for the exponential moving average
+
 def exponential_moving_average(df, window):
-
+    # Calculate EMA on the 'sentiment' column
     df[f'exp_mean_{window}_days'] = df['sentiment'].ewm(span=window, adjust=False).mean()
     return df
 
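The 'sentiment' column that `process_news_articles` aggregates is computed in a part of the file this diff leaves collapsed; given the TextBlob import, a plausible sketch of that step and of the EMA call it feeds (the example headlines are made up):

import pandas as pd
from textblob import TextBlob

titles = pd.Series(["Tesla beats delivery estimates", "Tesla recalls thousands of vehicles"])
sentiment = titles.apply(lambda t: TextBlob(t).sentiment.polarity)  # polarity in [-1, 1]
print(sentiment.ewm(span=7, adjust=False).mean())  # same call as exponential_moving_average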
stock_preprocessing.py → Stocks news prediction/SML/stock_preprocessing.py
RENAMED
@@ -1,4 +1,5 @@
 # %%
+#Importing necessary libraries
 from dotenv import load_dotenv
 import os
 from alpha_vantage.timeseries import TimeSeries
@@ -10,15 +11,14 @@ import modal
 import requests
 import pandas as pd
 import json
-
+import pandas_market_calendars as mcal
 import datetime
 import numpy as np
 from datetime import datetime, timedelta
-
-
-# %%
 load_dotenv()
 
+# %%
+#Connecting to Alpha vantage using API key
 api_key = os.environ.get('stocks_api') # Replace this with your actual API key
 ts = TimeSeries(key=api_key, output_format='pandas')
 
@@ -28,12 +28,11 @@ data, meta_data = ts.get_daily(symbol='TSLA', outputsize='full')
 print(data.head())
 
 # %%
-data
-
-# %%
+#Looking at data info
 data.info()
 
 # %%
+#Looking at the meta data
 meta_data
 
 # %%
@@ -50,6 +49,7 @@ def today_is_a_business_day(today):
     return False
 
 # %%
+#Defining a function to find the next business day
 def next_business_day(today):
 
     # Real tomorrow
@@ -71,6 +71,7 @@ def next_business_day(today):
     return isBusinessDay.to_numpy()[0]
 
 # %%
+#Defining a function to extract business day
 def extract_business_day(start_date,end_date):
     """
     Given a start_date and end_date.
@@ -82,27 +83,27 @@ def extract_business_day(start_date,end_date):
     e.g is_open = [1,0,...,1] means that start_date = open, day after start_date = closed, and end_date = open
     """
 
-    #
+    # Saving for later
     end_date_save = end_date
 
-    #
+    # Getting the NYSE calendar
     cal = mcal.get_calendar('NYSE')
 
-    #
+    # Getting the NYSE calendar's open and close times for the specified period
     schedule = cal.schedule(start_date=start_date, end_date=end_date)
 
     # Only need a list of dates when it's open (not open and close times)
     isBusinessDay = np.array(schedule.market_open.dt.strftime('%Y-%m-%d'))
 
-    #
+    # Going over all days:
     delta = datetime.timedelta(days=1)
     start_date = datetime.datetime.strptime(start_date,"%Y-%m-%d") #datetime.date(2015, 7, 16)
     end_date = datetime.datetime.strptime(end_date,"%Y-%m-%d") #datetime.date(2023, 1, 4)
 
-    #
+    # Extracting days from the timedelta object
     num_days = (end_date - start_date).days + 1
 
-    #
+    # Creating a boolean array for days being open (1) and closed (0)
     is_open = np.zeros(num_days)
 
     # iterate over range of dates
@@ -131,6 +132,7 @@ def extract_business_day(start_date,end_date):
     return isBusinessDay, is_open
 
 # %%
+#Defining a function to clean the column names
 def clean_column_name(name):
     # Remove all non-letter characters
     cleaned_name = re.sub(r'[^a-zA-Z]', '', name)
@@ -150,15 +152,12 @@ data.reset_index(inplace=True)
 data.head()
 
 # %%
-
-
-# %%
-# Define the date range you're interested in
+# Define the date range we're interested in
 yesterday = datetime.now() - timedelta(days=1)
 two_years_back = yesterday - timedelta(days=684)
 
 # %%
-#
+# Filtering the DataFrame to this range
 filtered_df = data[(data['date'] >= two_years_back) & (data['date'] <= yesterday)]
 
 # %%
@@ -171,7 +170,4 @@ print(filtered_df['date'].max())
 # %%
 filtered_df.shape
 
-# %%
-
-
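`extract_business_day` leans on pandas_market_calendars for the NYSE schedule; a minimal standalone sketch of the calls it uses:

import pandas_market_calendars as mcal

nyse = mcal.get_calendar('NYSE')
schedule = nyse.schedule(start_date='2024-01-02', end_date='2024-01-10')
# One row per trading day, with market_open/market_close timestamps
print(schedule.market_open.dt.strftime('%Y-%m-%d').tolist())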
Stocks news prediction/SML/training_pipeline.py
ADDED
@@ -0,0 +1,256 @@
+# %%
+#Importing necessary libraries
+import hopsworks
+import hsfs
+from dotenv import load_dotenv
+import os
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import OneHotEncoder
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.metrics import mean_squared_error
+from hsml.schema import Schema
+from hsml.model_schema import ModelSchema
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import Input, LSTM, Dense, Dropout
+from sklearn.preprocessing import StandardScaler # Import StandardScaler from scikit-learn
+import joblib
+
+load_dotenv()
+
+#Connecting to hopsworks
+api_key = os.environ.get('hopsworks_api')
+project = hopsworks.login(api_key_value=api_key)
+fs = project.get_feature_store()
+
+#Another connection to hopsworks
+api_key = os.getenv('hopsworks_api')
+connection = hsfs.connection()
+fs = connection.get_feature_store()
+
+# %%
+#Getting the feature view
+feature_view = fs.get_feature_view(
+    name='tesla_stocks_fv',
+    version=1
+)
+
+# %%
+#Setting up train & test split dates
+train_start = "2022-06-22"
+train_end = "2023-12-31"
+
+test_start = '2024-01-01'
+test_end = "2024-05-03"
+
+# %%
+#Creating the train/test split on the feature view with the split dates
+feature_view.create_train_test_split(
+    train_start=train_start,
+    train_end=train_end,
+    test_start=test_start,
+    test_end=test_end,
+    data_format='csv',
+    coalesce=True,
+    statistics_config={'histogram': True, 'correlations': True})
+
+# %%
+#Collecting the split from feature view
+X_train, X_test, y_train, y_test = feature_view.get_train_test_split(6)
+
+# %%
+#Inspecting X_train
+X_train
+
+# %%
+#Converting date into datetime
+X_train['date'] = pd.to_datetime(X_train['date']).dt.date
+X_test['date'] = pd.to_datetime(X_test['date']).dt.date
+X_train['date'] = pd.to_datetime(X_train['date'])
+X_test['date'] = pd.to_datetime(X_test['date'])
+
+# %%
+X_train.head()
+
+# %%
+# Extracting the 'ticker' column
+tickers = X_train[['ticker']]
+
+# Initializing OneHotEncoder
+encoder = OneHotEncoder()
+
+# Fitting and transforming the 'ticker' column
+ticker_encoded = encoder.fit_transform(tickers)
+
+# Converting the encoded column into a DataFrame
+ticker_encoded_df = pd.DataFrame(ticker_encoded.toarray(), columns=encoder.get_feature_names_out(['ticker']))
+
+# Concatenating the encoded DataFrame with the original DataFrame
+X_train = pd.concat([X_train, ticker_encoded_df], axis=1)
+
+# Dropping the original 'ticker' column
+X_train.drop('ticker', axis=1, inplace=True)
+
+# %%
+#Inspecting X_train after one-hot encoding 'ticker'
+X_train.head()
+
+# %%
+#Doing the same for X test as done to X train
+tickers = X_test[['ticker']]
+
+# Initializing OneHotEncoder
+encoder = OneHotEncoder()
+
+# Fitting and transforming the 'ticker' column
+ticker_encoded_test = encoder.fit_transform(tickers)
+
+# Converting the encoded column into a DataFrame
+ticker_encoded_df_test = pd.DataFrame(ticker_encoded_test.toarray(), columns=encoder.get_feature_names_out(['ticker']))
+
+# Concatenating the encoded DataFrame with the original DataFrame
+X_test = pd.concat([X_test, ticker_encoded_df_test], axis=1)
+
+# Dropping the original 'ticker' column
+X_test.drop('ticker', axis=1, inplace=True)
+
+# %%
+#Loading in MinMaxScaler to be used on the target variable 'open'
+scaler = MinMaxScaler()
+
+# Fitting and transforming the 'open' column
+y_train['open_scaled'] = scaler.fit_transform(y_train[['open']])
+y_train.drop('open', axis=1, inplace=True)
+
+# %%
+#Doing the same to y_test as done to y_train
+y_test['open_scaled'] = scaler.fit_transform(y_test[['open']])
+y_test.drop('open', axis=1, inplace=True)
+
+# %%
+#Defining the function for the LSTM model
+def create_model(input_shape,
+                 LSTM_filters=64,
+                 dropout=0.1,
+                 recurrent_dropout=0.1,
+                 dense_dropout=0.5,
+                 activation='relu',
+                 depth=1):
+
+    model = Sequential()
+
+    # Input layer
+    model.add(Input(shape=input_shape))
+
+    if depth > 1:
+        for i in range(1, depth):
+            # Recurrent layer
+            model.add(LSTM(LSTM_filters, return_sequences=True, dropout=dropout, recurrent_dropout=recurrent_dropout))
+
+    # Recurrent layer
+    model.add(LSTM(LSTM_filters, return_sequences=False, dropout=dropout, recurrent_dropout=recurrent_dropout))
+
+    # Fully connected layer
+    if activation == 'relu':
+        model.add(Dense(LSTM_filters, activation='relu'))
+    elif activation == 'leaky_relu':
+        model.add(Dense(LSTM_filters))
+        model.add(tf.keras.layers.LeakyReLU(alpha=0.1))
+
+    # Dropout for regularization
+    model.add(Dropout(dense_dropout))
+
+    # Output layer for predicting one day forward
+    model.add(Dense(1, activation='linear'))
+
+    # Compile the model
+    model.compile(optimizer='adam', loss='mse')
+
+    return model
+
+# %%
+# As X_train['date'] column exists and is in datetime format, we're converting it
+X_train['year'] = X_train['date'].dt.year
+X_train['month'] = X_train['date'].dt.month
+X_train['day'] = X_train['date'].dt.day
+
+# Dropping the original date column
+X_train.drop(columns=['date'], inplace=True)
+
+# Converting dataframe to numpy array
+X_train_array = X_train.to_numpy()
+
+# Reshaping the array to have a shape suitable for LSTM
+X_train_array = np.expand_dims(X_train_array, axis=1)
+
+# %%
+# Convert DataFrame to numpy array
+X_train_array = X_train.values
+
+# Reshaping X_train_array to add a time step dimension
+X_train_reshaped = X_train_array.reshape(X_train_array.shape[0], 1, X_train_array.shape[1])
+
+# Assuming X_train_reshaped shape is now (374, 1, 5)
+input_shape = X_train_reshaped.shape[1:]
+
+# Create the model
+model = create_model(input_shape=input_shape)
+
+# %%
+#Fitting the model on the training dataset
+model.fit(X_train_reshaped, y_train)
+
+# %%
+# As X_test['date'] column exists and is in datetime format, we're converting it
+X_test['year'] = X_test['date'].dt.year
+X_test['month'] = X_test['date'].dt.month
+X_test['day'] = X_test['date'].dt.day
+
+# Dropping the original date column
+X_test.drop(columns=['date'], inplace=True)
+
+# Converting dataframe to numpy array
+X_test_array = X_test.to_numpy()
+
+# Reshape the array to have a shape suitable for LSTM
+X_test_array = np.expand_dims(X_test_array, axis=1)
+
+# %%
+#Predicting y_pred with X_test
+y_pred = model.predict(X_test_array)
+
+# %%
+#Connecting to the hopsworks model registry
+mr = project.get_model_registry()
+
+# %%
+# Compute RMSE metric for the model
+rmse = np.sqrt(mean_squared_error(y_test, y_pred))
+rmse_metrics = {"RMSE": rmse}
+rmse_metrics
+
+# %%
+#Setting up the model schema
+input_schema = Schema(X_train)
+output_schema = Schema(y_train)
+model_schema = ModelSchema(input_schema, output_schema)
+
+# %%
+#Creating a folder called 'stock_model'
+model_dir = "stock_model"
+if os.path.isdir(model_dir) == False:
+    os.mkdir(model_dir)
+
+# %%
+#Saving the model to the hopsworks model registry
+stock_pred_model = mr.tensorflow.create_model(
+    name="stock_pred_model",
+    metrics=rmse_metrics,
+    model_schema=model_schema,
+    description="Stock Market TSLA Predictor from News Sentiment",
+)
+
+stock_pred_model.save(model_dir)
+
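Two follow-ups the inference side will need, sketched under stated assumptions: the Keras network has to be written into `stock_model/` before `stock_pred_model.save(model_dir)` uploads the directory (the `lstm_model.keras` filename is illustrative, not from the commit), and predictions come out in the scaled space, so the fitted MinMaxScaler is needed to map them back to prices:

import os
import tensorflow as tf

# Persist the trained network into the directory that gets registered
# (illustrative filename; the registry uploads whatever the folder holds).
model.save(os.path.join(model_dir, "lstm_model.keras"))

# Later, fetch it back from the model registry for inference (hsml API).
retrieved = mr.get_model("stock_pred_model", version=1)
download_dir = retrieved.download()
reloaded = tf.keras.models.load_model(os.path.join(download_dir, "lstm_model.keras"))

# Map scaled predictions back to dollar prices with the fitted scaler.
y_pred_prices = scaler.inverse_transform(y_pred)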
TSLA_stock_price.csv → Stocks news prediction/TSLA_stock_price.csv
RENAMED
File without changes

news_articles.csv → Stocks news prediction/news_articles.csv
RENAMED
File without changes

news_articles_ema.csv → Stocks news prediction/news_articles_ema.csv
RENAMED
File without changes
feature_engineering.ipynb
DELETED
@@ -1,73 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import requests\n",
-    "import pandas as pd\n",
-    "import json\n",
-    "import datetime\n",
-    "import numpy as np\n",
-    "from datetime import timedelta "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def getNews(api_key,endpoint,ticker,from_date,to_date,num=1000):\n",
-    "    # Set the parameters for the request\n",
-    "    params = {\n",
-    "        \"api_token\": api_key,\n",
-    "        \"s\": ticker,\n",
-    "        \"from\": from_date, \n",
-    "        \"to\": to_date,\n",
-    "        \"limit\": num,\n",
-    "    }\n",
-    "    \n",
-    "    # Make the request to the API\n",
-    "    response = requests.get(endpoint, params=params)\n",
-    "    \n",
-    "    # Print the response from the API\n",
-    "    #print(response.json())\n",
-    "\n",
-    "    #Return a Pandas dataframe from the response\n",
-    "    return pd.DataFrame(response.json())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "base",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.9"
-  },
-  "orig_nbformat": 4
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
feature_pipeline.ipynb
DELETED
@@ -1,775 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from dotenv import load_dotenv\n",
-    "import os "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Requirement already satisfied: great_expectations==0.18.12 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (0.18.12)\n",
-      [… pip "Requirement already satisfied" log for great_expectations==0.18.12 and its dependency tree (altair, Click, cryptography, Ipython, jsonschema, notebook, pydantic, requests, and the rest), elided …]
|
126 |
-
"Requirement already satisfied: nbclient>=0.5.0 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->notebook>=6.4.10->great_expectations==0.18.12) (0.10.0)\n",
|
127 |
-
"Requirement already satisfied: pandocfilters>=1.4.1 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->notebook>=6.4.10->great_expectations==0.18.12) (1.5.1)\n",
|
128 |
-
"Requirement already satisfied: tinycss2 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->notebook>=6.4.10->great_expectations==0.18.12) (1.3.0)\n",
|
129 |
-
"Requirement already satisfied: webencodings in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from bleach!=5.0.0->nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->notebook>=6.4.10->great_expectations==0.18.12) (0.5.1)\n",
|
130 |
-
"Requirement already satisfied: fqdn in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook>=6.4.10->great_expectations==0.18.12) (1.5.1)\n",
|
131 |
-
"Requirement already satisfied: isoduration in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook>=6.4.10->great_expectations==0.18.12) (20.11.0)\n",
|
132 |
-
"Requirement already satisfied: uri-template in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook>=6.4.10->great_expectations==0.18.12) (1.3.0)\n",
|
133 |
-
"Requirement already satisfied: webcolors>=1.11 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook>=6.4.10->great_expectations==0.18.12) (1.13)\n",
|
134 |
-
"Requirement already satisfied: soupsieve>1.2 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from beautifulsoup4->nbconvert>=6.4.4->jupyter-server<3,>=2.4.0->notebook>=6.4.10->great_expectations==0.18.12) (2.5)\n",
|
135 |
-
"Requirement already satisfied: arrow>=0.15.0 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from isoduration->jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook>=6.4.10->great_expectations==0.18.12) (1.3.0)\n",
|
136 |
-
"Requirement already satisfied: types-python-dateutil>=2.8.10 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from arrow>=0.15.0->isoduration->jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook>=6.4.10->great_expectations==0.18.12) (2.9.0.20240316)\n"
|
137 |
-
]
|
138 |
-
}
|
139 |
-
],
|
140 |
-
"source": [
|
141 |
-
"!pip install great_expectations==0.18.12"
|
142 |
-
]
|
143 |
-
},
|
144 |
-
{
|
145 |
-
"cell_type": "code",
|
146 |
-
"execution_count": 3,
|
147 |
-
"metadata": {},
|
148 |
-
"outputs": [],
|
149 |
-
"source": [
|
150 |
-
"# Import necessary libraries\n",
|
151 |
-
"import pandas as pd # For data manipulation using DataFrames\n",
|
152 |
-
"import numpy as np # For numerical operations\n",
|
153 |
-
"import matplotlib.pyplot as plt # For data visualization\n",
|
154 |
-
"import os # For operating system-related tasks\n",
|
155 |
-
"import joblib # For saving and loading models\n",
|
156 |
-
"import hopsworks # For getting access to hopsworks\n",
|
157 |
-
"\n",
|
158 |
-
"\n",
|
159 |
-
"\n",
|
160 |
-
"# Import specific modules from scikit-learn\n",
|
161 |
-
"from sklearn.preprocessing import StandardScaler, OneHotEncoder # For data preprocessing\n",
|
162 |
-
"from sklearn.metrics import accuracy_score # For evaluating model accuracy"
|
163 |
-
]
|
164 |
-
},
|
165 |
-
{
|
166 |
-
"cell_type": "code",
|
167 |
-
"execution_count": 4,
|
168 |
-
"metadata": {},
|
169 |
-
"outputs": [
|
170 |
-
{
|
171 |
-
"name": "stdout",
|
172 |
-
"output_type": "stream",
|
173 |
-
"text": [
|
174 |
-
"Requirement already satisfied: modal in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (0.62.141)\n",
|
175 |
-
"Requirement already satisfied: aiohttp in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from modal) (3.9.5)\n",
|
176 |
-
"Requirement already satisfied: aiostream~=0.5.2 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from modal) (0.5.2)\n",
|
177 |
-
"Requirement already satisfied: certifi in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from modal) (2024.2.2)\n",
|
178 |
-
"Requirement already satisfied: click>=8.1.0 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from modal) (8.1.7)\n",
|
179 |
-
"Requirement already satisfied: fastapi in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from modal) (0.111.0)\n",
|
180 |
-
"Requirement already satisfied: grpclib==0.4.7 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from modal) (0.4.7)\n",
|
181 |
-
"Requirement already satisfied: protobuf!=4.24.0,<5.0,>=3.19 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from modal) (4.25.3)\n",
|
182 |
-
"Requirement already satisfied: rich>=12.0.0 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from modal) (13.7.1)\n",
|
183 |
-
"Requirement already satisfied: synchronicity~=0.6.6 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from modal) (0.6.7)\n",
|
184 |
-
"Requirement already satisfied: toml in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from modal) (0.10.2)\n",
|
185 |
-
"Requirement already satisfied: typer>=0.9 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from modal) (0.12.3)\n",
|
186 |
-
"Requirement already satisfied: types-certifi in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from modal) (2021.10.8.3)\n",
|
187 |
-
"Requirement already satisfied: types-toml in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from modal) (0.10.8.20240310)\n",
|
188 |
-
"Requirement already satisfied: watchfiles in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from modal) (0.21.0)\n",
|
189 |
-
"Requirement already satisfied: typing-extensions~=4.6 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from modal) (4.11.0)\n",
|
190 |
-
"Requirement already satisfied: h2<5,>=3.1.0 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from grpclib==0.4.7->modal) (4.1.0)\n",
|
191 |
-
"Requirement already satisfied: multidict in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from grpclib==0.4.7->modal) (6.0.5)\n",
|
192 |
-
"Requirement already satisfied: colorama in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from click>=8.1.0->modal) (0.4.6)\n",
|
193 |
-
"Requirement already satisfied: markdown-it-py>=2.2.0 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from rich>=12.0.0->modal) (3.0.0)\n",
|
194 |
-
"Requirement already satisfied: pygments<3.0.0,>=2.13.0 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from rich>=12.0.0->modal) (2.17.2)\n",
|
195 |
-
"Requirement already satisfied: sigtools==4.0.1 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from synchronicity~=0.6.6->modal) (4.0.1)\n",
|
196 |
-
"Requirement already satisfied: attrs in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from sigtools==4.0.1->synchronicity~=0.6.6->modal) (23.2.0)\n",
|
197 |
-
"Requirement already satisfied: shellingham>=1.3.0 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from typer>=0.9->modal) (1.5.4)\n",
|
198 |
-
"Requirement already satisfied: aiosignal>=1.1.2 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from aiohttp->modal) (1.3.1)\n",
|
199 |
-
"Requirement already satisfied: frozenlist>=1.1.1 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from aiohttp->modal) (1.4.1)\n",
|
200 |
-
"Requirement already satisfied: yarl<2.0,>=1.0 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from aiohttp->modal) (1.9.4)\n",
|
201 |
-
"Requirement already satisfied: starlette<0.38.0,>=0.37.2 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from fastapi->modal) (0.37.2)\n",
|
202 |
-
"Requirement already satisfied: pydantic!=1.8,!=1.8.1,!=2.0.0,!=2.0.1,!=2.1.0,<3.0.0,>=1.7.4 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from fastapi->modal) (2.7.1)\n",
|
203 |
-
"Requirement already satisfied: fastapi-cli>=0.0.2 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from fastapi->modal) (0.0.2)\n",
|
204 |
-
"Requirement already satisfied: httpx>=0.23.0 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from fastapi->modal) (0.27.0)\n",
|
205 |
-
"Requirement already satisfied: jinja2>=2.11.2 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from fastapi->modal) (3.1.3)\n",
|
206 |
-
"Requirement already satisfied: python-multipart>=0.0.7 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from fastapi->modal) (0.0.9)\n",
|
207 |
-
"Requirement already satisfied: ujson!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0,>=4.0.1 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from fastapi->modal) (5.9.0)\n",
|
208 |
-
"Requirement already satisfied: orjson>=3.2.1 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from fastapi->modal) (3.10.3)\n",
|
209 |
-
"Requirement already satisfied: email_validator>=2.0.0 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from fastapi->modal) (2.1.1)\n",
|
210 |
-
"Requirement already satisfied: uvicorn>=0.12.0 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from uvicorn[standard]>=0.12.0->fastapi->modal) (0.29.0)\n",
|
211 |
-
"Requirement already satisfied: anyio>=3.0.0 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from watchfiles->modal) (4.3.0)\n",
|
212 |
-
"Requirement already satisfied: idna>=2.8 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from anyio>=3.0.0->watchfiles->modal) (3.7)\n",
|
213 |
-
"Requirement already satisfied: sniffio>=1.1 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from anyio>=3.0.0->watchfiles->modal) (1.3.1)\n",
|
214 |
-
"Requirement already satisfied: dnspython>=2.0.0 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from email_validator>=2.0.0->fastapi->modal) (2.6.1)\n",
|
215 |
-
"Requirement already satisfied: hyperframe<7,>=6.0 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from h2<5,>=3.1.0->grpclib==0.4.7->modal) (6.0.1)\n",
|
216 |
-
"Requirement already satisfied: hpack<5,>=4.0 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from h2<5,>=3.1.0->grpclib==0.4.7->modal) (4.0.0)\n",
|
217 |
-
"Requirement already satisfied: httpcore==1.* in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from httpx>=0.23.0->fastapi->modal) (1.0.5)\n",
|
218 |
-
"Requirement already satisfied: h11<0.15,>=0.13 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from httpcore==1.*->httpx>=0.23.0->fastapi->modal) (0.14.0)\n",
|
219 |
-
"Requirement already satisfied: MarkupSafe>=2.0 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from jinja2>=2.11.2->fastapi->modal) (2.1.5)\n",
|
220 |
-
"Requirement already satisfied: mdurl~=0.1 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from markdown-it-py>=2.2.0->rich>=12.0.0->modal) (0.1.2)\n",
|
221 |
-
"Requirement already satisfied: annotated-types>=0.4.0 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from pydantic!=1.8,!=1.8.1,!=2.0.0,!=2.0.1,!=2.1.0,<3.0.0,>=1.7.4->fastapi->modal) (0.6.0)\n",
|
222 |
-
"Requirement already satisfied: pydantic-core==2.18.2 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from pydantic!=1.8,!=1.8.1,!=2.0.0,!=2.0.1,!=2.1.0,<3.0.0,>=1.7.4->fastapi->modal) (2.18.2)\n",
|
223 |
-
"Requirement already satisfied: httptools>=0.5.0 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from uvicorn[standard]>=0.12.0->fastapi->modal) (0.6.1)\n",
|
224 |
-
"Requirement already satisfied: python-dotenv>=0.13 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from uvicorn[standard]>=0.12.0->fastapi->modal) (1.0.1)\n",
|
225 |
-
"Requirement already satisfied: pyyaml>=5.1 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from uvicorn[standard]>=0.12.0->fastapi->modal) (6.0.1)\n",
|
226 |
-
"Requirement already satisfied: websockets>=10.4 in c:\\users\\frede\\onedrive\\dokumenter\\master\\mlops\\mlops_mod-2\\.conda\\lib\\site-packages (from uvicorn[standard]>=0.12.0->fastapi->modal) (12.0)\n"
|
227 |
-
]
|
228 |
-
}
|
229 |
-
],
|
230 |
-
"source": [
|
231 |
-
"!pip install modal"
|
232 |
-
]
|
233 |
-
},
|
234 |
-
{
|
235 |
-
"cell_type": "code",
|
236 |
-
"execution_count": 5,
|
237 |
-
"metadata": {},
|
238 |
-
"outputs": [],
|
239 |
-
"source": [
|
240 |
-
"#from alpha_vantage.timeseries import TimeSeries\n",
|
241 |
-
"#import pandas as pd\n",
|
242 |
-
"\n",
|
243 |
-
"#load_dotenv()\n",
|
244 |
-
"\n",
|
245 |
-
"#api_key = os.environ.get('stocks_api') # Replace this with your actual API key\n",
|
246 |
-
"#ts = TimeSeries(key=api_key, output_format='pandas')\n",
|
247 |
-
"\n",
|
248 |
-
"# Fetch daily adjusted stock prices; adjust the symbol as needed\n",
|
249 |
-
"#data, meta_data = ts.get_daily(symbol='TSLA', outputsize='full')\n",
|
250 |
-
"\n",
|
251 |
-
"#print(data.head())"
|
252 |
-
]
|
253 |
-
},
|
254 |
-
{
|
255 |
-
"cell_type": "code",
|
256 |
-
"execution_count": 6,
|
257 |
-
"metadata": {},
|
258 |
-
"outputs": [],
|
259 |
-
"source": [
|
260 |
-
"#data.info()"
|
261 |
-
]
|
262 |
-
},
|
263 |
-
{
|
264 |
-
"cell_type": "code",
|
265 |
-
"execution_count": 7,
|
266 |
-
"metadata": {},
|
267 |
-
"outputs": [],
|
268 |
-
"source": [
|
269 |
-
"#meta_data"
|
270 |
-
]
|
271 |
-
},
|
272 |
-
{
|
273 |
-
"cell_type": "code",
|
274 |
-
"execution_count": 8,
|
275 |
-
"metadata": {},
|
276 |
-
"outputs": [],
|
277 |
-
"source": [
|
278 |
-
"# Define your file path and name\n",
|
279 |
-
"#file_path = 'TSLA_stock_price.csv' # Customize the path and filename\n",
|
280 |
-
"\n",
|
281 |
-
"# Save the DataFrame to CSV\n",
|
282 |
-
"#stock_data.to_csv(file_path)\n",
|
283 |
-
"\n",
|
284 |
-
"#print(f\"Data saved to {file_path}\")\n"
|
285 |
-
]
|
286 |
-
},
|
287 |
-
{
|
288 |
-
"cell_type": "code",
|
289 |
-
"execution_count": 9,
|
290 |
-
"metadata": {},
|
291 |
-
"outputs": [
|
292 |
-
{
|
293 |
-
"name": "stdout",
|
294 |
-
"output_type": "stream",
|
295 |
-
"text": [
|
296 |
-
" date 1. open 2. high 3. low 4. close 5. volume ticker\n",
|
297 |
-
"0 2024-05-03 182.10 184.78 178.4200 181.19 75491539.0 TSLA\n",
|
298 |
-
"1 2024-05-02 182.86 184.60 176.0200 180.01 89148041.0 TSLA\n",
|
299 |
-
"2 2024-05-01 182.00 185.86 179.0100 179.99 92829719.0 TSLA\n",
|
300 |
-
"3 2024-04-30 186.98 190.95 182.8401 183.28 127031787.0 TSLA\n",
|
301 |
-
"4 2024-04-29 188.42 198.87 184.5400 194.05 243869678.0 TSLA\n"
|
302 |
-
]
|
303 |
-
}
|
304 |
-
],
|
305 |
-
"source": [
|
306 |
-
"# Load and display the data from CSV to confirm\n",
|
307 |
-
"tsla_df = pd.read_csv('TSLA_stock_price.csv')\n",
|
308 |
-
"print(tsla_df.head())\n",
|
309 |
-
" "
|
310 |
-
]
|
311 |
-
},
|
312 |
-
{
|
313 |
-
"cell_type": "code",
|
314 |
-
"execution_count": 10,
|
315 |
-
"metadata": {},
|
316 |
-
"outputs": [
|
317 |
-
{
|
318 |
-
"name": "stdout",
|
319 |
-
"output_type": "stream",
|
320 |
-
"text": [
|
321 |
-
"Connected. Call `.close()` to terminate connection gracefully.\n",
|
322 |
-
"\n",
|
323 |
-
"Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/693399\n",
|
324 |
-
"Connected. Call `.close()` to terminate connection gracefully.\n"
|
325 |
-
]
|
326 |
-
}
|
327 |
-
],
|
328 |
-
"source": [
|
329 |
-
"api_key = os.environ.get('hopsworks_api')\n",
|
330 |
-
"project = hopsworks.login(api_key_value=api_key)\n",
|
331 |
-
"fs = project.get_feature_store()"
|
332 |
-
]
|
333 |
-
},
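The api_key in the cell above comes from the .env file loaded at the top of the notebook. A minimal sketch of that contract, assuming the same variable name this notebook uses (the .env contents shown are illustrative, not from the repo):

# .env (kept out of version control) is expected to contain a line like:
#   hopsworks_api=YOUR_HOPSWORKS_API_KEY
from dotenv import load_dotenv
import os, hopsworks

load_dotenv()  # populates os.environ from .env
project = hopsworks.login(api_key_value=os.environ.get('hopsworks_api'))
fs = project.get_feature_store()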
{
 "cell_type": "code",
 "execution_count": 11,
 "metadata": {},
 "outputs": [],
 "source": [
  "import re"
 ]
},
{
 "cell_type": "code",
 "execution_count": 12,
 "metadata": {},
 "outputs": [],
 "source": [
  "def clean_column_name(name):\n",
  "    # Remove all non-letter characters\n",
  "    cleaned_name = re.sub(r'[^a-zA-Z]', '', name)\n",
  "    return cleaned_name"
 ]
},
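For reference, a minimal standalone sketch of what this cleaning step does to the Alpha Vantage column names; the expected output is inferred from the dataframe shown later in this notebook:

# Standalone check of clean_column_name on the raw Alpha Vantage headers
import re

def clean_column_name(name):
    # Strips digits, dots, and spaces, keeping only letters
    return re.sub(r'[^a-zA-Z]', '', name)

cols = ['date', '1. open', '2. high', '3. low', '4. close', '5. volume', 'ticker']
print([clean_column_name(c) for c in cols])
# -> ['date', 'open', 'high', 'low', 'close', 'volume', 'ticker']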
{
 "cell_type": "code",
 "execution_count": 13,
 "metadata": {},
 "outputs": [
  {
   "data": {
    "text/plain": [
     "            date  1. open   2. high    3. low  4. close    5. volume ticker\n",
     "0     2024-05-03   182.10  184.7800  178.4200    181.19   75491539.0   TSLA\n",
     "1     2024-05-02   182.86  184.6000  176.0200    180.01   89148041.0   TSLA\n",
     "2     2024-05-01   182.00  185.8600  179.0100    179.99   92829719.0   TSLA\n",
     "3     2024-04-30   186.98  190.9500  182.8401    183.28  127031787.0   TSLA\n",
     "4     2024-04-29   188.42  198.8700  184.5400    194.05  243869678.0   TSLA\n",
     "...          ...      ...       ...       ...       ...          ...    ...\n",
     "3481  2010-07-06    20.00   20.0000   15.8300     16.11    6866900.0   TSLA\n",
     "3482  2010-07-02    23.00   23.1000   18.7100     19.20    5139800.0   TSLA\n",
     "3483  2010-07-01    25.00   25.9200   20.2700     21.96    8218800.0   TSLA\n",
     "3484  2010-06-30    25.79   30.4192   23.3000     23.83   17187100.0   TSLA\n",
     "3485  2010-06-29    19.00   25.0000   17.5400     23.89   18766300.0   TSLA\n",
     "\n",
     "[3486 rows x 7 columns]"
    ]
   },
   "execution_count": 13,
   "metadata": {},
   "output_type": "execute_result"
  }
 ],
 "source": [
  "tsla_df"
 ]
},
{
 "cell_type": "code",
 "execution_count": 14,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Assuming 'tsla_df' is your DataFrame\n",
  "tsla_df.columns = [clean_column_name(col) for col in tsla_df.columns]"
 ]
},
{
 "cell_type": "code",
 "execution_count": 15,
 "metadata": {},
 "outputs": [
  {
   "name": "stdout",
   "output_type": "stream",
   "text": [
    "Index(['date', 'open', 'high', 'low', 'close', 'volume', 'ticker'], dtype='object')\n"
   ]
  }
 ],
 "source": [
  "print(tsla_df.columns)"
 ]
},
{
 "cell_type": "code",
 "execution_count": 16,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Convert the \"date\" column to timestamp\n",
  "tsla_df['date'] = pd.to_datetime(tsla_df['date'])"
 ]
},
{
 "cell_type": "code",
 "execution_count": 17,
 "metadata": {},
 "outputs": [
  {
   "name": "stdout",
   "output_type": "stream",
   "text": [
    "2024-05-06 13:43:00,985 WARNING: DeprecationWarning: Providing event_time as a single-element list is deprecated and will be dropped in future versions. Provide the feature_name string instead.\n"
   ]
  }
 ],
 "source": [
  "# Define a feature group\n",
  "tesla_fg = fs.get_or_create_feature_group(\n",
  "    name=\"tesla_stock\",\n",
  "    description=\"Tesla stock dataset from alpha vantage\",\n",
  "    version=1,\n",
  "    primary_key=[\"ticker\"],\n",
  "    event_time=['date'],\n",
  "    online_enabled=False,\n",
  ")"
 ]
},
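The DeprecationWarning above comes from passing event_time as a single-element list. A minimal sketch of the same feature-group definition using the string form the warning itself recommends (everything else unchanged):

# Same feature group, but with event_time as a plain feature-name string
tesla_fg = fs.get_or_create_feature_group(
    name="tesla_stock",
    description="Tesla stock dataset from alpha vantage",
    version=1,
    primary_key=["ticker"],
    event_time="date",   # string instead of ['date'] silences the DeprecationWarning
    online_enabled=False,
)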
{
 "cell_type": "code",
 "execution_count": 18,
 "metadata": {},
 "outputs": [
  {
   "name": "stdout",
   "output_type": "stream",
   "text": [
    "Feature Group created successfully, explore it at \n",
    "https://c.app.hopsworks.ai:443/p/693399/fs/689222/fg/786781\n"
   ]
  },
  {
   "data": {
    "text/plain": [
     "Uploading Dataframe: 0.00% |          | Rows 0/3486 | Elapsed Time: 00:00 | Remaining Time: ?"
    ]
   },
   "metadata": {},
   "output_type": "display_data"
  },
  {
   "name": "stdout",
   "output_type": "stream",
   "text": [
    "Launching job: tesla_stock_1_offline_fg_materialization\n",
    "Job started successfully, you can follow the progress at \n",
    "https://c.app.hopsworks.ai/p/693399/jobs/named/tesla_stock_1_offline_fg_materialization/executions\n"
   ]
  },
  {
   "data": {
    "text/plain": [
     "(<hsfs.core.job.Job at 0x19cffe27490>, None)"
    ]
   },
   "execution_count": 18,
   "metadata": {},
   "output_type": "execute_result"
  }
 ],
 "source": [
  "tesla_fg.insert(tsla_df, write_options={\"wait_for_job\": False})"
 ]
},
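Because wait_for_job is False here, insert() returns as soon as the materialization job is launched; the offline data only becomes queryable once that job finishes. A hedged sketch of how one might block on the job and read the data back instead (using the feature group and dataframe from the cells above; the verification step is an addition, not part of this notebook):

# Block until the offline materialization job completes, then read back a sample
tesla_fg.insert(tsla_df, write_options={"wait_for_job": True})
df_check = tesla_fg.read()   # pulls the feature group from the offline store
print(df_check.shape)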
{
 "cell_type": "code",
 "execution_count": 19,
 "metadata": {},
 "outputs": [],
 "source": [
  "news_df = pd.read_csv('news_articles_ema.csv')"
 ]
},
{
 "cell_type": "code",
 "execution_count": 20,
 "metadata": {},
 "outputs": [],
 "source": [
  "news_df_updated = news_df.drop(columns=['exp_mean_7_days'])"
 ]
},
{
 "cell_type": "code",
 "execution_count": 21,
 "metadata": {},
 "outputs": [],
 "source": [
  "news_df_updated['date'] = pd.to_datetime(news_df_updated['date'])"
 ]
},
{
 "cell_type": "code",
 "execution_count": 22,
 "metadata": {},
 "outputs": [
  {
   "name": "stdout",
   "output_type": "stream",
   "text": [
    "2024-05-06 13:43:12,343 WARNING: DeprecationWarning: Providing event_time as a single-element list is deprecated and will be dropped in future versions. Provide the feature_name string instead.\n"
   ]
  }
 ],
 "source": [
  "news_sentiment_fg = fs.get_or_create_feature_group(\n",
  "    name='news_sentiment_updated',\n",
  "    description='News sentiment from Polygon',\n",
  "    version=1,\n",
  "    primary_key=['ticker'],\n",
  "    event_time=['date'],\n",
  "    online_enabled=False,\n",
  ")"
 ]
},
{
 "cell_type": "code",
 "execution_count": 23,
 "metadata": {},
 "outputs": [
  {
   "name": "stdout",
   "output_type": "stream",
   "text": [
    "Feature Group created successfully, explore it at \n",
    "https://c.app.hopsworks.ai:443/p/693399/fs/689222/fg/787796\n"
   ]
  },
  {
   "data": {
    "text/plain": [
     "Uploading Dataframe: 0.00% |          | Rows 0/66 | Elapsed Time: 00:00 | Remaining Time: ?"
    ]
   },
   "metadata": {},
   "output_type": "display_data"
  },
  {
   "name": "stdout",
   "output_type": "stream",
   "text": [
    "Launching job: news_sentiment_updated_1_offline_fg_materialization\n",
    "Job started successfully, you can follow the progress at \n",
    "https://c.app.hopsworks.ai/p/693399/jobs/named/news_sentiment_updated_1_offline_fg_materialization/executions\n"
   ]
  },
  {
   "data": {
    "text/plain": [
     "(<hsfs.core.job.Job at 0x19c811c2e90>, None)"
    ]
   },
   "execution_count": 23,
   "metadata": {},
   "output_type": "execute_result"
  }
 ],
 "source": [
  "news_sentiment_fg.insert(news_df_updated)"
 ]
}
],
"metadata": {
 "kernelspec": {
  "display_name": "base",
  "language": "python",
  "name": "python3"
 },
 "language_info": {
  "codemirror_mode": {
   "name": "ipython",
   "version": 3
  },
  "file_extension": ".py",
  "mimetype": "text/x-python",
  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
  "version": "3.11.9"
 },
 "orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
feature_view_freddie.py
DELETED
@@ -1,95 +0,0 @@
# %%
# Import necessary libraries
import pandas as pd  # For data manipulation using DataFrames
import numpy as np  # For numerical operations
import matplotlib.pyplot as plt  # For data visualization
import os  # For operating system-related tasks
import joblib  # For saving and loading models
import hopsworks  # For getting access to hopsworks

# Import specific modules from scikit-learn
from sklearn.preprocessing import StandardScaler, OneHotEncoder  # For data preprocessing
from sklearn.metrics import accuracy_score  # For evaluating model accuracy

# %%
from feature_pipeline import tesla_fg
from feature_pipeline import news_sentiment_fg

# %%
from dotenv import load_dotenv
import os

load_dotenv()

# %%
api_key = os.environ.get('hopsworks_api')
project = hopsworks.login(api_key_value=api_key)
fs = project.get_feature_store()

# %%
def create_stocks_feature_view(fs, version):

    # Loading in the feature groups
    tesla_fg = fs.get_feature_group('tesla_stock', version=1)
    news_sentiment_fg = fs.get_feature_group('news_sentiment_updated', version=1)

    # Define the query
    ds_query = tesla_fg.select(['date', 'open', 'ticker'])\
        .join(news_sentiment_fg.select(['date', 'sentiment']))

    # Create the feature view
    feature_view = fs.create_feature_view(
        name='tesla_stocks_fv',
        query=ds_query,
        labels=['ticker']
    )

    return feature_view, tesla_fg

# %%
try:
    feature_view = fs.get_feature_view("tesla_stocks_fv", version=1)
    tesla_fg = fs.get_feature_group('tesla_stock', version=1)
except:
    feature_view, tesla_fg = create_stocks_feature_view(fs, 1)

# %%
def fix_data_from_feature_view(df, start_date, end_date):
    df = df.sort_values("date")
    df = df.reset_index()
    df = df.drop(columns=["index"])

    # Create a boolean mask for rows that fall within the date range
    mask = (pd.to_datetime(df['date']) >= pd.to_datetime(start_date)) & (pd.to_datetime(df['date']) <= pd.to_datetime(end_date))
    len_df = np.shape(df)
    df = df[mask]  # Use the boolean mask to filter the DataFrame
    print('From shape {} to {} after cropping to given date range: {} to {}'.format(len_df, np.shape(df), start_date, end_date))

    return df

# %%
#def create_stocks_feature_view(fs, version):

    #Loading in the feature groups
#    tesla_fg = fs.get_feature_group('tesla_stock', version = 3)
#    news_sentiment_fg = fs.get_feature_group('news_sentiment_updated', version = 2)

#    ds_query = tesla_fg.select(['date','open', 'ticker'])\
#        .join(news_sentiment_fg.select_except(['ticker','time', 'amp_url', 'image_url']))

#    return (fs.create_tesla_feature_view(
#        name = 'tsla_stocks_fv',
#        query = ds_query,
#        labels=['ticker']
#    ), tesla_fg)

# %%
#try:
#    feature_view = fs.get_feature_view("tsla_stocks_fv", version=1)
#    tesla_fg = fs.get_feature_group('tesla_stock', version=3)
#except:
#    feature_view, tesla_fg = create_stocks_feature_view(fs, 1)
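A short sketch of how the feature view built in this deleted script could be consumed, pairing create_stocks_feature_view with the date-cropping helper above. get_batch_data is the standard hsfs call for pulling a feature view's joined rows, but the read path here is an assumption, since the script itself never reads the view back:

# Hypothetical usage of the helpers defined in this file
feature_view, tesla_fg = create_stocks_feature_view(fs, version=1)

df = feature_view.get_batch_data()  # joined tesla_stock + news sentiment rows
df = fix_data_from_feature_view(df, '2022-04-01', '2024-04-01')  # crop to a date window
print(df.head())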
historical_stock.ipynb
DELETED
@@ -1,257 +0,0 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'modal'",
     "output_type": "error",
     "traceback": [
      "---------------------------------------------------------------------------",
      "ModuleNotFoundError                       Traceback (most recent call last)",
      "Cell In[1], line 7",
      "      5 import hopsworks",
      "      6 import re",
      "----> 7 import modal",
      "      8 #preprocessing",
      "      9 import requests",
      "",
      "ModuleNotFoundError: No module named 'modal'"
     ]
    }
   ],
   "source": [
    "from dotenv import load_dotenv\n",
    "import os\n",
    "from alpha_vantage.timeseries import TimeSeries\n",
    "import pandas as pd\n",
    "import hopsworks\n",
    "import re\n",
    "import modal\n",
    "#preprocessing\n",
    "import requests\n",
    "import json\n",
    "#import pandas_market_calendars as mcal\n",
    "import datetime\n",
    "import numpy as np\n",
    "from datetime import timedelta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "            1. open  2. high    3. low  4. close    5. volume ticker\n",
      "date                                                                \n",
      "2024-05-03   182.10   184.78  178.4200    181.19   75491539.0   TSLA\n",
      "2024-05-02   182.86   184.60  176.0200    180.01   89148041.0   TSLA\n",
      "2024-05-01   182.00   185.86  179.0100    179.99   92829719.0   TSLA\n",
      "2024-04-30   186.98   190.95  182.8401    183.28  127031787.0   TSLA\n",
      "2024-04-29   188.42   198.87  184.5400    194.05  243869678.0   TSLA\n"
     ]
    }
   ],
   "source": [
    "load_dotenv()\n",
    "\n",
    "api_key = os.environ.get('stocks_api')  # Replace this with your actual API key\n",
    "ts = TimeSeries(key=api_key, output_format='pandas')\n",
    "\n",
    "def fetch_stock_prices(symbol):\n",
    "    # Fetch daily adjusted stock prices; adjust the symbol as needed\n",
    "    data, meta_data = ts.get_daily(symbol=symbol, outputsize='full')\n",
    "\n",
    "    # Add a new column named 'ticker' and fill it with the ticker name\n",
    "    data['ticker'] = symbol\n",
    "\n",
    "    return data\n",
    "\n",
    "# Example usage\n",
    "symbol = 'TSLA'\n",
    "stock_data = fetch_stock_prices(symbol)\n",
    "print(stock_data.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_tsla_history():\n",
    "\n",
    "    start_date = datetime.datetime.strptime('2015-07-16', \"%Y-%m-%d\")\n",
    "    end_date = datetime.datetime.strptime('2023-01-05', \"%Y-%m-%d\")\n",
    "\n",
    "    # Get the TSLA stock data from yfinance\n",
    "    tsla = Ticker(\"TSLA\")\n",
    "    # info = tsla.info\n",
    "\n",
    "    # get historical market data\n",
    "    data = tsla.history(start=start_date, end=end_date)\n",
    "\n",
    "    # drop some columns\n",
    "    tesla_df = data.drop(columns=['Dividends', 'Stock Splits'])\n",
    "    tesla_df.index = tesla_df.index.strftime('%Y-%m-%d')\n",
    "\n",
    "    print('Number of business days included in data set: ', np.shape(tesla_df))\n",
    "\n",
    "    # Create an array of all dates in the specified period\n",
    "    all_dates = np.array([start_date + datetime.timedelta(days=i) for i in range((end_date - start_date).days)])\n",
    "    all_dates = [d.strftime('%Y-%m-%d') for d in all_dates]\n",
    "\n",
    "    # Use setdiff1d() to find the non-business days\n",
    "    isBusinessDay, _ = extract_business_day(start_date='2015-07-16', end_date='2023-01-04')\n",
    "    non_business_days = np.setdiff1d(all_dates, isBusinessDay)\n",
    "\n",
    "    # Add nan-values to the non-business days\n",
    "    print('Add {} non business days with NaN-values'.format(len(non_business_days)))\n",
    "    for d in non_business_days:\n",
    "        tesla_df.loc[d, :] = [np.nan, np.nan, np.nan, np.nan, np.nan]\n",
    "\n",
    "    # sort index (dates)\n",
    "    tesla_df = tesla_df.sort_index()\n",
    "\n",
    "    # move \"date\"-index into its own column\n",
    "    tesla_df = tesla_df.reset_index()\n",
    "\n",
    "    # Rename column 'Date' to 'date'\n",
    "    tesla_df = tesla_df.rename(columns={'Date': 'date'})\n",
    "    print('Final size of dataframe', np.shape(tesla_df))\n",
    "\n",
    "    # Write the merged dataframe to a CSV file\n",
    "    start_date = '2022-04-01'\n",
    "    end_date = '2024-04-01'\n",
    "    save_path = \"data/stock/tesla_{}-{}.csv\".format(start_date, end_date)\n",
    "\n",
    "    print('Save at :', save_path)\n",
    "    tesla_df.to_csv(save_path, index=False)\n",
    "\n",
    "    return tesla_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_business_day(start_date, end_date):\n",
    "    \"\"\"\n",
    "    Given a start_date and end_date.\n",
    "\n",
    "    `Returns`:\n",
    "\n",
    "    isBusinessDay: list of str (with all dates being business days)\n",
    "    is_open: boolean list\n",
    "        e.g. is_open = [1,0,...,1] means that start_date = open, day after start_date = closed, and end_date = open\n",
    "    \"\"\"\n",
    "\n",
    "    # Save for later\n",
    "    end_date_save = end_date\n",
    "\n",
    "    # Get the NYSE calendar\n",
    "    cal = mcal.get_calendar('NYSE')\n",
    "\n",
    "    # Get the NYSE calendar's open and close times for the specified period\n",
    "    schedule = cal.schedule(start_date=start_date, end_date=end_date)\n",
    "\n",
    "    # Only need a list of dates when it's open (not open and close times)\n",
    "    isBusinessDay = np.array(schedule.market_open.dt.strftime('%Y-%m-%d'))\n",
    "\n",
    "    # Go over all days:\n",
    "    delta = datetime.timedelta(days=1)\n",
    "    start_date = datetime.datetime.strptime(start_date, \"%Y-%m-%d\")\n",
    "    end_date = datetime.datetime.strptime(end_date, \"%Y-%m-%d\")\n",
    "\n",
    "    # Extract days from the timedelta object\n",
    "    num_days = (end_date - start_date).days + 1\n",
    "\n",
    "    # Create boolean array for days being open (1) and closed (0)\n",
    "    is_open = np.zeros(num_days)\n",
    "\n",
    "    # iterate over range of dates\n",
    "    current_BusinessDay = isBusinessDay[0]\n",
    "    count_dates = 0\n",
    "    next_BusinessDay = 0\n",
    "\n",
    "    while (start_date <= end_date):\n",
    "\n",
    "        if start_date.strftime('%Y-%m-%d') == current_BusinessDay:\n",
    "            is_open[count_dates] = True\n",
    "\n",
    "            if current_BusinessDay == end_date_save or current_BusinessDay == isBusinessDay[-1]:\n",
    "                break\n",
    "            else:\n",
    "                next_BusinessDay += 1\n",
    "                current_BusinessDay = isBusinessDay[next_BusinessDay]\n",
    "        else:\n",
    "            is_open[count_dates] = False\n",
    "\n",
    "        count_dates += 1\n",
    "        start_date += delta\n",
    "\n",
    "    print(np.shape(is_open))\n",
    "\n",
    "    return isBusinessDay, is_open"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data saved to TSLA_stock_price.csv\n"
     ]
    }
   ],
   "source": [
    "# Define your file path and name\n",
    "file_path = 'TSLA_stock_price.csv'  # Customize the path and filename\n",
    "\n",
    "# Save the DataFrame to CSV\n",
    "stock_data.to_csv(file_path)\n",
    "\n",
    "print(f\"Data saved to {file_path}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
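The manual is_open loop in extract_business_day above is hard to follow; the same business-day mask can be sketched far more directly with pandas_market_calendars (now listed in requirements.txt) and a plain membership test. This is an illustrative simplification, not the notebook's original code:

import numpy as np
import pandas as pd
import pandas_market_calendars as mcal

start_date, end_date = '2015-07-16', '2023-01-04'

# Trading days according to the NYSE calendar
schedule = mcal.get_calendar('NYSE').schedule(start_date=start_date, end_date=end_date)
business_days = schedule.index.strftime('%Y-%m-%d')

# Boolean mask over every calendar day in the range: 1 = market open, 0 = closed
all_days = pd.date_range(start_date, end_date, freq='D').strftime('%Y-%m-%d')
is_open = np.isin(all_days, business_days).astype(int)

print(len(business_days), 'business days out of', len(all_days), 'calendar days')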
requirements.txt
CHANGED
@@ -19,3 +19,4 @@ textblob
 great_expectations==0.18.12
 prophet
 tensorflow
+pandas_market_calendars
training_pipeline.ipynb
DELETED
@@ -1,167 +0,0 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import hopsworks\n",
    "from dotenv import load_dotenv\n",
    "import os\n",
    "\n",
    "load_dotenv()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Connected. Call `.close()` to terminate connection gracefully.\n",
      "\n",
      "Sample data from the feature view:\n",
      "<class 'tuple'>\n",
      "(                        date     open  sentiment\n",
      "0    2023-06-26T00:00:00.000Z  250.065   0.119444\n",
      "1    2023-07-25T00:00:00.000Z  272.380   0.119444\n",
      "2    2023-01-10T00:00:00.000Z  121.070   0.102207\n",
      "3    2023-05-11T00:00:00.000Z  168.700   0.141296\n",
      "4    2023-08-01T00:00:00.000Z  266.260   0.011111\n",
      "..                        ...      ...        ...\n",
      "464  2022-12-22T00:00:00.000Z  136.000   0.102207\n",
      "465  2023-08-23T00:00:00.000Z  229.340   0.024046\n",
      "466  2022-09-08T00:00:00.000Z  281.300   0.087306\n",
      "467  2023-07-06T00:00:00.000Z  278.090   0.119444\n",
      "468  2023-10-27T00:00:00.000Z  210.600   0.164868\n",
      "\n",
      "[469 rows x 3 columns],     ticker\n",
      "0     TSLA\n",
      "1     TSLA\n",
      "2     TSLA\n",
      "3     TSLA\n",
      "4     TSLA\n",
      "..     ...\n",
      "464   TSLA\n",
      "465   TSLA\n",
      "466   TSLA\n",
      "467   TSLA\n",
      "468   TSLA\n",
      "\n",
      "[469 rows x 1 columns])\n"
     ]
    }
   ],
   "source": [
    "import hsfs\n",
    "\n",
    "# Connection setup\n",
    "# Connect to Hopsworks\n",
    "api_key = os.getenv('hopsworks_api')\n",
    "connection = hsfs.connection()\n",
    "fs = connection.get_feature_store()\n",
    "\n",
    "# Get feature view\n",
    "feature_view = fs.get_feature_view(\n",
    "    name='tesla_stocks_fv',\n",
    "    version=1\n",
    ")\n",
    "td_version, td_job = feature_view.create_train_test_split(\n",
    "    description='tesla and news sentiment training dataset',\n",
    "    data_format=\"csv\",\n",
    "    test_size=0.2,\n",
    "    coalesce=True,\n",
    "    statistics_config={\n",
    "        \"enabled\": True,\n",
    "        \"histograms\": False,\n",
    "        \"correlations\": False\n",
    "    }\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(                        date     open  sentiment\n",
       " 0    2023-06-26T00:00:00.000Z  250.065   0.119444\n",
       " 1    2023-07-25T00:00:00.000Z  272.380   0.119444\n",
       " 2    2023-01-10T00:00:00.000Z  121.070   0.102207\n",
       " 3    2023-05-11T00:00:00.000Z  168.700   0.141296\n",
       " 4    2023-08-01T00:00:00.000Z  266.260   0.011111\n",
       " ..                        ...      ...        ...\n",
       " 464  2022-12-22T00:00:00.000Z  136.000   0.102207\n",
       " 465  2023-08-23T00:00:00.000Z  229.340   0.024046\n",
       " 466  2022-09-08T00:00:00.000Z  281.300   0.087306\n",
       " 467  2023-07-06T00:00:00.000Z  278.090   0.119444\n",
       " 468  2023-10-27T00:00:00.000Z  210.600   0.164868\n",
       " \n",
       " [469 rows x 3 columns],\n",
       "     ticker\n",
       " 0     TSLA\n",
       " 1     TSLA\n",
       " 2     TSLA\n",
       " 3     TSLA\n",
       " 4     TSLA\n",
       " ..     ...\n",
       " 464   TSLA\n",
       " 465   TSLA\n",
       " 466   TSLA\n",
       " 467   TSLA\n",
       " 468   TSLA\n",
       " \n",
       " [469 rows x 1 columns])"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sample_data"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
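For completeness, a hedged sketch of how the training dataset materialized above would normally be read back in hsfs; get_train_test_split is the documented counterpart of create_train_test_split, though this deleted notebook never shows the call itself:

# Read back the split that create_train_test_split materialized
X_train, X_test, y_train, y_test = feature_view.get_train_test_split(
    training_dataset_version=td_version
)
print(X_train.shape, X_test.shape)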