# %% Imports and Hopsworks connection
# Importing necessary libraries
import hopsworks
import hsfs
from dotenv import load_dotenv
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from hsml.schema import Schema
from hsml.model_schema import ModelSchema
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout
from sklearn.preprocessing import StandardScaler  # Import StandardScaler from scikit-learn
import joblib

# Load variables from a local .env file into the process environment.
load_dotenv()

# Connecting to Hopsworks. The API key is read from the `hopsworks_api`
# environment variable; `.get` yields None when it is not set.
api_key = os.environ.get('hopsworks_api')
project = hopsworks.login(api_key_value=api_key)
fs = project.get_feature_store()

# Another connection to Hopsworks.
# NOTE(review): this rebinds `fs` to the feature store of a second connection,
# so the handle obtained from `project` above is discarded. It is kept because
# `connection` may be used further down the notebook - confirm and remove one.
connection = hsfs.connection()
fs = connection.get_feature_store()

# %% Getting the feature view
feature_view = fs.get_feature_view(
    name='tesla_stocks_fv',
    version=5
)

# %% Setting up train & test split dates
train_start = "2022-06-22"
train_end = "2023-12-31"

test_start = "2024-01-01"
test_end = "2024-05-08"

# %% Creating the train/test split on the feature view with the split dates
# The call returns the version of the training dataset it just created together
# with the materialisation job. Keep the version: it increments on every run.
td_version, td_job = feature_view.create_train_test_split(
    train_start=train_start,
    train_end=train_end,
    test_start=test_start,
    test_end=test_end,
    data_format='csv',
    coalesce=True,
    statistics_config={'histogram': True, 'correlations': True})
(td_version, td_job)

# %% Collecting the split from the feature view
# Read back the split that was created above rather than a hard-coded
# version, so a re-run never picks up a stale training dataset.
X_train, X_test, y_train, y_test = feature_view.get_train_test_split(td_version)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "

379 rows × 3 columns

\n", "
# %% Inspecting X_train
X_train

# %% Converting date into datetime
# Both feature frames get the same treatment: parse `date`, keep only the
# calendar day, then parse that day again so the column ends up as a pandas
# datetime column (needed for the `.dt` accessors used later on).
for features in (X_train, X_test):
    calendar_days = pd.to_datetime(features['date']).dt.date
    features['date'] = pd.to_datetime(calendar_days)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
# %% Inspecting X_train after the date conversion
X_train.head()

# %% One-hot encoding the 'ticker' column of the training features
# Extracting the 'ticker' column (double brackets keep it two-dimensional,
# which is what the encoder expects).
tickers = X_train[['ticker']]

# Initializing OneHotEncoder; this fitted instance defines the training
# column layout.
encoder = OneHotEncoder()

# Fitting and transforming the 'ticker' column
ticker_encoded = encoder.fit_transform(tickers)

# Converting the encoded column into a DataFrame. The index is copied from
# X_train so the column-wise concat below pairs every row with its own
# encoding even when X_train does not carry a default 0..n-1 index.
ticker_encoded_df = pd.DataFrame(
    ticker_encoded.toarray(),
    columns=encoder.get_feature_names_out(['ticker']),
    index=X_train.index,
)

# Replacing the original 'ticker' column with its encoded counterpart
X_train = pd.concat([X_train.drop(columns='ticker'), ticker_encoded_df], axis=1)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
# %% Inspecting X train after one-hot encoding 'ticker'
X_train.head()

# %% Doing the same for X test as done to X train
tickers = X_test[['ticker']]

# Reuse the encoder that was fitted on the training tickers. Fitting a fresh
# encoder on the test set could produce a different set or order of columns
# than the model was trained on.
ticker_encoded_test = encoder.transform(tickers)

# Converting the encoded column into a DataFrame, aligned on X_test's index
ticker_encoded_df_test = pd.DataFrame(
    ticker_encoded_test.toarray(),
    columns=encoder.get_feature_names_out(['ticker']),
    index=X_test.index,
)

# Replacing the original 'ticker' column with its encoded counterpart
X_test = pd.concat([X_test.drop(columns='ticker'), ticker_encoded_df_test], axis=1)

# %% Loading in MinMaxScaler to be used on the target variable 'open'
# NOTE(review): the scaler is instantiated but target scaling is currently
# disabled (the lines below are commented out). If it is re-enabled, fit on
# y_train only and call `transform` on y_test.
scaler = MinMaxScaler()

# Fitting and transforming the 'open' column
#y_train['open_scaled'] = scaler.fit_transform(y_train[['open']])
#y_train.drop('open', axis=1, inplace=True)

# %% Doing the same to y_test as done to y_train
#y_test['open_scaled'] = scaler.transform(y_test[['open']])
#y_test.drop('open', axis=1, inplace=True)

# %% Defining the function for the LSTM model
def create_model(input_shape,
                 LSTM_filters=64,
                 dropout=0.1,
                 recurrent_dropout=0.1,
                 dense_dropout=0.5,
                 activation='relu',
                 depth=1):
    """Build and compile a stacked-LSTM regressor with a single linear output.

    Args:
        input_shape: Shape of one sample, passed to the Keras ``Input`` layer.
        LSTM_filters: Number of units in every LSTM layer and in the hidden
            Dense layer.
        dropout: Input dropout rate of each LSTM layer.
        recurrent_dropout: Recurrent dropout rate of each LSTM layer.
        dense_dropout: Rate of the Dropout layer placed after the hidden
            Dense layer.
        activation: Activation of the hidden Dense layer. ``'leaky_relu'``
            adds a separate ``LeakyReLU(alpha=0.1)`` layer after a linear
            Dense layer; any other value is forwarded to
            ``Dense(activation=...)``.
        depth: Total number of LSTM layers. Values of 1 or less give a single
            LSTM layer.

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, MSE loss).
    """
    model = Sequential()

    # Input layer
    model.add(Input(shape=input_shape))

    # Every recurrent layer except the last one returns the full sequence so
    # that the next LSTM layer receives a sequence as input.
    for _ in range(depth - 1):
        model.add(LSTM(LSTM_filters, return_sequences=True, dropout=dropout, recurrent_dropout=recurrent_dropout))

    # Final recurrent layer collapses the sequence to its last output
    model.add(LSTM(LSTM_filters, return_sequences=False, dropout=dropout, recurrent_dropout=recurrent_dropout))

    # Fully connected layer
    if activation == 'leaky_relu':
        # Imported here because the notebook's import cell does not bring in
        # LeakyReLU (nor a `tf` alias).
        from tensorflow.keras.layers import LeakyReLU

        model.add(Dense(LSTM_filters))
        model.add(LeakyReLU(alpha=0.1))
    else:
        # Other activation names are handed to Keras as-is instead of
        # silently skipping the hidden layer.
        model.add(Dense(LSTM_filters, activation=activation))

    # Dropout for regularization
    model.add(Dropout(dense_dropout))

    # Output layer for predicting one day forward
    model.add(Dense(1, activation='linear'))

    # Compile the model
    model.compile(optimizer='adam', loss='mse')

    return model
# %% Deriving calendar features for the training set
# X_train['date'] is a datetime column at this point: split it into numeric
# year / month / day features and drop the original column.
X_train = (
    X_train
    .assign(
        year=X_train['date'].dt.year,
        month=X_train['date'].dt.month,
        day=X_train['date'].dt.day,
    )
    .drop(columns=['date'])
)

# %% Shaping the training data for the LSTM and building the model
# 2-D feature matrix: one row per sample, one column per feature.
X_train_array = X_train.to_numpy()

# Insert a time-step axis of length 1 so every sample is a one-step sequence:
# (samples, 1, features). For the 379-row split shown above this is (379, 1, 5).
X_train_reshaped = np.expand_dims(X_train_array, axis=1)

# Features and targets must describe the same rows.
assert X_train_reshaped.shape[0] == len(y_train), "X_train / y_train row count mismatch"

# Per-sample shape, i.e. everything except the leading sample axis.
input_shape = X_train_reshaped.shape[1:]

# Create the model
model = create_model(input_shape=input_shape)

# %% Fitting the model on the training dataset
# NOTE(review): fit runs with Keras defaults on unscaled inputs and targets;
# the recorded run finished with a loss of about 1.2e5 - consider scaling and
# an explicit number of epochs.
model.fit(X_train_reshaped, y_train)
# %% Deriving calendar features for the test set
# Same treatment as the training features: numeric year / month / day columns
# replace the datetime `date` column.
X_test = (
    X_test
    .assign(
        year=X_test['date'].dt.year,
        month=X_test['date'].dt.month,
        day=X_test['date'].dt.day,
    )
    .drop(columns=['date'])
)

# Feature matrix with a length-1 time-step axis inserted for the LSTM
X_test_array = X_test.to_numpy()[:, np.newaxis, :]

# %% Scoring the model on the test set
# Predictions for every test sample
y_pred = model.predict(X_test_array)

# Mean squared error first, then its square root
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("Root Mean Squared Error (RMSE):", rmse)
# %% Connecting to the Hopsworks model registry
mr = project.get_model_registry()

# %% Inspecting the training target
y_train.info()

# %% Inspecting the predictions
# Show only the first few rows instead of dumping the whole array.
y_pred[:5]

# %% Compute the RMSE metric to attach to the model
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse_metrics = {"RMSE": rmse}
rmse_metrics

# %% Setting up the model schema
# Input and output schemas are inferred from the training features and target.
input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(input_schema, output_schema)

# %% Creating a directory called 'stock_model'
model_dir = "stock_model"
# exist_ok makes the cell safe to re-run and avoids a separate existence check.
os.makedirs(model_dir, exist_ok=True)

# %% Saving the model to the Hopsworks model registry
# NOTE(review): registration is currently disabled, and nothing in the cells
# above writes the trained model into `model_dir` - confirm before enabling.
#stock_pred_model = mr.tensorflow.create_model(
#    name="stock_pred_model",
#    metrics= rmse_metrics,
#    model_schema=model_schema,
#    description="Stock Market TSLA Predictor from News Sentiment",
#    )

#stock_pred_model.save(model_dir)
Call `.close()` to terminate connection gracefully.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "fa33c5ea489f4cde9b6c4d3a4012d3a5", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/6 [00:00