{ "cells": [
 { "cell_type": "code", "execution_count": 1, "metadata": {},
  "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ],
  "source": [
   "import os\n",
   "\n",
   "import hopsworks\n",
   "import pandas as pd\n",
   "from dotenv import load_dotenv\n",
   "from hsml.model_schema import ModelSchema\n",
   "from hsml.schema import Schema\n",
   "from sklearn.preprocessing import MinMaxScaler, OneHotEncoder\n",
   "\n",
   "# Load variables from a local .env file (if any) into the process environment.\n",
   "load_dotenv()" ] },
 { "cell_type": "code", "execution_count": 2, "metadata": {},
  "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Connected. Call `.close()` to terminate connection gracefully.\n", "\n", "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/693399\n", "Connected. Call `.close()` to terminate connection gracefully.\n" ] } ],
  "source": [
   "# Log in with the key held in the `hopsworks_api` environment variable,\n",
   "# then fetch the project's feature store handle.\n",
   "api_key = os.getenv('hopsworks_api')\n",
   "project = hopsworks.login(api_key_value=api_key)\n",
   "fs = project.get_feature_store()" ] },
 { "cell_type": "code", "execution_count": 3, "metadata": {},
  "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Connected. 
Call `.close()` to terminate connection gracefully.\n" ] } ],
  "source": [
   "import hsfs\n",
   "\n",
   "# Connection setup\n",
   "# Connect to Hopsworks\n",
   "# NOTE(review): `fs` was already obtained from `project.get_feature_store()` in the\n",
   "# login cell; this second connection rebinds it - TODO confirm it is still needed.\n",
   "api_key = os.getenv('hopsworks_api')\n",
   "connection = hsfs.connection()\n",
   "fs = connection.get_feature_store()\n",
   "\n",
   "# Get feature view\n",
   "\n" ] },
 { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [],
  "source": [
   "feature_view = fs.get_feature_view(\n",
   "    name='tesla_stocks_fv',\n",
   "    version=1\n",
   ")" ] },
 { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [],
  "source": [
   "train_start = \"2022-06-22\"\n",
   "train_end = \"2023-12-31\"\n",
   "\n",
   "test_start = '2024-01-01'\n",
   "test_end = \"2024-05-03\"\n" ] },
 { "cell_type": "code", "execution_count": 6, "metadata": {},
  "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training dataset job started successfully, you can follow the progress at \n", "https://c.app.hopsworks.ai/p/693399/jobs/named/tesla_stocks_fv_1_create_fv_td_06052024212158/executions\n", "2024-05-06 23:23:21,130 WARNING: VersionWarning: Incremented version to `5`.\n", "\n" ] }, { "data": { "text/plain": [ "(5, )" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ],
  "source": [
   "# Keep the version number returned by the call so the next cell reads back exactly\n",
   "# the split created here: every run of this cell creates a new, incremented version,\n",
   "# so a hard-coded number would silently point at an older split.\n",
   "td_version, td_job = feature_view.create_train_test_split(\n",
   "    train_start=train_start,\n",
   "    train_end=train_end,\n",
   "    test_start=test_start,\n",
   "    test_end=test_end,\n",
   "    data_format='csv',\n",
   "    coalesce=True,\n",
   "    statistics_config={'histogram': True, 'correlations': True},\n",
   ")\n",
   "td_version, td_job" ] },
 { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [],
  "source": [
   "# Load the split that was just materialised (version captured in the previous cell).\n",
   "X_train, X_test, y_train, y_test = feature_view.get_train_test_split(td_version)" ] },
 { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datetickersentiment
02022-12-14T00:00:00.000ZTSLA0.102207
12023-02-21T00:00:00.000ZTSLA0.155833
22023-08-17T00:00:00.000ZTSLA0.024046
32022-09-16T00:00:00.000ZTSLA0.087306
42023-08-28T00:00:00.000ZTSLA0.024046
............
3782023-02-10T00:00:00.000ZTSLA0.155833
3792023-05-08T00:00:00.000ZTSLA0.141296
3802022-09-08T00:00:00.000ZTSLA0.087306
3812023-07-06T00:00:00.000ZTSLA0.119444
3822023-10-27T00:00:00.000ZTSLA0.164868
\n", "

383 rows × 3 columns

\n", "
" ], "text/plain": [ " date ticker sentiment\n", "0 2022-12-14T00:00:00.000Z TSLA 0.102207\n", "1 2023-02-21T00:00:00.000Z TSLA 0.155833\n", "2 2023-08-17T00:00:00.000Z TSLA 0.024046\n", "3 2022-09-16T00:00:00.000Z TSLA 0.087306\n", "4 2023-08-28T00:00:00.000Z TSLA 0.024046\n", ".. ... ... ...\n", "378 2023-02-10T00:00:00.000Z TSLA 0.155833\n", "379 2023-05-08T00:00:00.000Z TSLA 0.141296\n", "380 2022-09-08T00:00:00.000Z TSLA 0.087306\n", "381 2023-07-06T00:00:00.000Z TSLA 0.119444\n", "382 2023-10-27T00:00:00.000Z TSLA 0.164868\n", "\n", "[383 rows x 3 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datetickersentiment
02024-04-16T00:00:00.000ZTSLA0.018769
12024-02-22T00:00:00.000ZTSLA0.212963
22024-02-13T00:00:00.000ZTSLA0.099363
32024-01-17T00:00:00.000ZTSLA0.099363
42024-02-16T00:00:00.000ZTSLA0.099363
\n", "
" ], "text/plain": [ " date ticker sentiment\n", "0 2024-04-16T00:00:00.000Z TSLA 0.018769\n", "1 2024-02-22T00:00:00.000Z TSLA 0.212963\n", "2 2024-02-13T00:00:00.000Z TSLA 0.099363\n", "3 2024-01-17T00:00:00.000Z TSLA 0.099363\n", "4 2024-02-16T00:00:00.000Z TSLA 0.099363" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_test.head()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datetickersentiment
802024-05-02T00:00:00.000ZTSLA0.001443
812024-04-02T00:00:00.000ZTSLA0.080911
822024-03-22T00:00:00.000ZTSLA0.080911
832024-01-02T00:00:00.000ZTSLA-0.122579
842024-02-26T00:00:00.000ZTSLA0.152764
\n", "
" ], "text/plain": [ " date ticker sentiment\n", "80 2024-05-02T00:00:00.000Z TSLA 0.001443\n", "81 2024-04-02T00:00:00.000Z TSLA 0.080911\n", "82 2024-03-22T00:00:00.000Z TSLA 0.080911\n", "83 2024-01-02T00:00:00.000Z TSLA -0.122579\n", "84 2024-02-26T00:00:00.000Z TSLA 0.152764" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ],
  "source": [ "X_test.tail()" ] },
 { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [],
  "source": [
   "# Reduce the timestamps to plain calendar days: `.dt.date` keeps only the date part,\n",
   "# and the outer to_datetime turns the result back into a datetime column.\n",
   "for frame in (X_train, X_test):\n",
   "    frame['date'] = pd.to_datetime(pd.to_datetime(frame['date']).dt.date)" ] },
 { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datetickersentiment
02022-12-14TSLA0.102207
12023-02-21TSLA0.155833
22023-08-17TSLA0.024046
32022-09-16TSLA0.087306
42023-08-28TSLA0.024046
\n", "
" ], "text/plain": [ " date ticker sentiment\n", "0 2022-12-14 TSLA 0.102207\n", "1 2023-02-21 TSLA 0.155833\n", "2 2023-08-17 TSLA 0.024046\n", "3 2022-09-16 TSLA 0.087306\n", "4 2023-08-28 TSLA 0.024046" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ],
  "source": [ "X_train.head()" ] },
 { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [],
  "source": [
   "# One-hot encode the 'ticker' column; the fitted encoder stays available as `encoder`.\n",
   "tickers = X_train[['ticker']]\n",
   "encoder = OneHotEncoder()\n",
   "ticker_encoded = encoder.fit_transform(tickers)\n",
   "\n",
   "# Reuse X_train's index: pd.concat(axis=1) aligns rows by index label, so a fresh\n",
   "# 0..n-1 index would misalign (and NaN-pad) rows whenever X_train is indexed differently.\n",
   "ticker_encoded_df = pd.DataFrame(\n",
   "    ticker_encoded.toarray(),\n",
   "    columns=encoder.get_feature_names_out(['ticker']),\n",
   "    index=X_train.index,\n",
   ")\n",
   "\n",
   "# Swap the raw 'ticker' column for its encoded columns.\n",
   "X_train = pd.concat([X_train, ticker_encoded_df], axis=1).drop(columns='ticker')" ] },
 { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datesentimentticker_TSLA
02022-12-140.1022071.0
12023-02-210.1558331.0
22023-08-170.0240461.0
32022-09-160.0873061.0
42023-08-280.0240461.0
\n", "
" ], "text/plain": [ " date sentiment ticker_TSLA\n", "0 2022-12-14 0.102207 1.0\n", "1 2023-02-21 0.155833 1.0\n", "2 2023-08-17 0.024046 1.0\n", "3 2022-09-16 0.087306 1.0\n", "4 2023-08-28 0.024046 1.0" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ],
  "source": [ "X_train.head()" ] },
 { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [],
  "source": [
   "tickers = X_test[['ticker']]\n",
   "\n",
   "# Encode with the encoder already fitted on the training tickers instead of fitting a\n",
   "# new one on the test data, so train and test end up with the same encoded columns.\n",
   "ticker_encoded_test = encoder.transform(tickers)\n",
   "\n",
   "# Reuse X_test's index so pd.concat(axis=1) lines the rows up by label.\n",
   "ticker_encoded_df_test = pd.DataFrame(\n",
   "    ticker_encoded_test.toarray(),\n",
   "    columns=encoder.get_feature_names_out(['ticker']),\n",
   "    index=X_test.index,\n",
   ")\n",
   "\n",
   "# Swap the raw 'ticker' column for its encoded columns.\n",
   "X_test = pd.concat([X_test, ticker_encoded_df_test], axis=1).drop(columns='ticker')" ] },
 { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [],
  "source": [
   "scaler = MinMaxScaler()\n",
   "\n",
   "# Fit the scaler on the training target and replace 'open' with its scaled values\n",
   "y_train['open_scaled'] = scaler.fit_transform(y_train[['open']])\n",
   "y_train.drop('open', axis=1, inplace=True)" ] },
 { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [],
  "source": [
   "# Apply the scaling learned from y_train; refitting on y_test would put the test\n",
   "# target on a different scale from the one the model is trained against.\n",
   "y_test['open_scaled'] = scaler.transform(y_test[['open']])\n",
   "y_test.drop('open', axis=1, inplace=True)" ] },
 { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [],
  "source": [
   "from tensorflow.keras.models import Sequential\n",
   "from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, LeakyReLU\n",
   "from sklearn.preprocessing import StandardScaler # Import StandardScaler from scikit-learn\n",
   "\n",
   "\n",
   "def create_model(input_shape,\n",
   "                 LSTM_filters=64,\n",
   "                 dropout=0.1,\n",
   "                 recurrent_dropout=0.1,\n",
   "                 dense_dropout=0.5,\n",
   "                 activation='relu',\n",
   "                 depth=1):\n",
   "    \"\"\"Build and compile a stacked-LSTM regressor with a single linear output.\n",
   "\n",
   "    Args:\n",
   "        input_shape: Shape of one sample, passed to the Keras Input layer.\n",
   "        LSTM_filters: Units in every LSTM layer and in the dense hidden layer.\n",
   "        dropout: `dropout` argument of each LSTM layer.\n",
   "        recurrent_dropout: `recurrent_dropout` argument of each LSTM layer.\n",
   "        dense_dropout: Rate of the Dropout layer placed after the dense hidden layer.\n",
   "        activation: 'relu' or 'leaky_relu'; activation of the dense hidden layer.\n",
   "        depth: Number of LSTM layers; values <= 1 give a single layer.\n",
   "\n",
   "    Returns:\n",
   "        A Sequential model compiled with the 'adam' optimizer and 'mse' loss.\n",
   "\n",
   "    Raises:\n",
   "        ValueError: If `activation` is not one of the supported names.\n",
   "    \"\"\"\n",
   "    # Fail early: an unrecognised name used to skip the dense hidden layer silently.\n",
   "    if activation not in ('relu', 'leaky_relu'):\n",
   "        raise ValueError(\n",
   "            f\"Unsupported activation {activation!r}; expected 'relu' or 'leaky_relu'\"\n",
   "        )\n",
   "\n",
   "    model = Sequential()\n",
   "\n",
   "    # Input layer\n",
   "    model.add(Input(shape=input_shape))\n",
   "\n",
   "    # Every recurrent layer except the last returns full sequences for the next LSTM\n",
   "    if depth > 1:\n",
   "        for _ in range(1, depth):\n",
   "            model.add(LSTM(LSTM_filters, return_sequences=True, dropout=dropout, recurrent_dropout=recurrent_dropout))\n",
   "\n",
   "    # Last recurrent layer\n",
   "    model.add(LSTM(LSTM_filters, return_sequences=False, dropout=dropout, recurrent_dropout=recurrent_dropout))\n",
   "\n",
   "    # Fully connected layer\n",
   "    if activation == 'relu':\n",
   "        model.add(Dense(LSTM_filters, activation='relu'))\n",
   "    else:\n",
   "        # Use the LeakyReLU class imported above; no `tf` name is imported in this cell.\n",
   "        model.add(Dense(LSTM_filters))\n",
   "        model.add(LeakyReLU(alpha=0.1))\n",
   "\n",
   "    # Dropout for regularization\n",
   "    model.add(Dropout(dense_dropout))\n",
   "\n",
   "    # Output layer for predicting one day forward\n",
   "    model.add(Dense(1, activation='linear'))\n",
   "\n",
   "    # Compile the model\n",
   "    model.compile(optimizer='adam', loss='mse')\n",
   "\n",
   "    return model" ] },
 { "cell_type": "code", "execution_count": 19, "metadata": {},
  "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2024-05-06 23:23:33,215 WARNING: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. 
(Deprecated NumPy 1.25)\n", "\n" ] } ],
  "source": [
   "import numpy as np\n",
   "\n",
   "# Split the datetime 'date' column into numeric year/month/day features and drop it.\n",
   "# The guard keeps this cell re-runnable once 'date' has already been replaced.\n",
   "if 'date' in X_train.columns:\n",
   "    X_train = X_train.assign(\n",
   "        year=X_train['date'].dt.year,\n",
   "        month=X_train['date'].dt.month,\n",
   "        day=X_train['date'].dt.day,\n",
   "    ).drop(columns=['date'])\n",
   "\n",
   "# 2-D feature matrix [samples, features] ...\n",
   "X_train_array = X_train.to_numpy()\n",
   "\n",
   "# ... and the 3-D [samples, timesteps, features] form fed to the LSTM, where each\n",
   "# row becomes a sequence with a single timestep.\n",
   "X_train_reshaped = np.expand_dims(X_train_array, axis=1)\n" ] },
 { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [],
  "source": [
   "# Per-sample shape (timesteps, features) = X_train_reshaped without its sample axis\n",
   "input_shape = X_train_reshaped.shape[1:]\n",
   "\n",
   "# Create the model\n",
   "model = create_model(input_shape=input_shape)" ] },
 { "cell_type": "code", "execution_count": 21, "metadata": {},
  "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m4s\u001b[0m 3ms/step - loss: 0.5165\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ],
  "source": [ "model.fit(X_train_reshaped, y_train)" ] },
 { "cell_type": "code", "execution_count": 22, "metadata": {},
  "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2024-05-06 23:23:37,549 WARNING: DeprecationWarning: np.find_common_type is deprecated. 
Please use `np.result_type` or `np.promote_types`.\n", "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", "\n" ] } ],
  "source": [
   "# Turn the datetime 'date' column into numeric year/month/day features, then remove it\n",
   "X_test = X_test.assign(\n",
   "    year=X_test['date'].dt.year,\n",
   "    month=X_test['date'].dt.month,\n",
   "    day=X_test['date'].dt.day,\n",
   ").drop(columns=['date'])\n",
   "\n",
   "# Insert a length-1 timestep axis: [samples, features] -> [samples, timesteps, features]\n",
   "X_test_array = np.expand_dims(X_test.to_numpy(), axis=1)\n" ] },
 { "cell_type": "code", "execution_count": 30, "metadata": {},
  "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m3/3\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 237ms/step\n" ] } ],
  "source": [ "y_pred = model.predict(X_test_array)" ] },
 { "cell_type": "code", "execution_count": 31, "metadata": {},
  "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Connected. 
Call `.close()` to terminate connection gracefully.\n" ] } ],
  "source": [ "mr = project.get_model_registry()" ] },
 { "cell_type": "code", "execution_count": 37, "metadata": {},
  "outputs": [ { "data": { "text/plain": [ "['LSTM_model.keras']" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ],
  "source": [
   "import joblib\n",
   "\n",
   "# NOTE(review): joblib serialises the model object by pickling it; despite the '.keras'\n",
   "# suffix this is presumably not Keras' own save format - confirm what the loader expects.\n",
   "joblib.dump(model, 'LSTM_model.keras')" ] },
 { "cell_type": "code", "execution_count": 32, "metadata": {},
  "outputs": [ { "data": { "text/plain": [ "{'RMSE': 0.40675989895763576}" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ],
  "source": [
   "from sklearn.metrics import mean_squared_error\n",
   "import numpy as np\n",
   "\n",
   "# Root-mean-squared error between the test target and the model's predictions\n",
   "rmse = np.sqrt(\n",
   "    mean_squared_error(y_true=y_test, y_pred=y_pred)\n",
   ")\n",
   "rmse_metrics = {'RMSE': rmse}\n",
   "rmse_metrics\n" ] },
 { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [],
  "source": [
   "# Build input/output schemas from the training features and the training target\n",
   "input_schema = Schema(X_train)\n",
   "output_schema = Schema(y_train)\n",
   "model_schema = ModelSchema(\n",
   "    input_schema=input_schema,\n",
   "    output_schema=output_schema,\n",
   ")" ] },
 { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1dd08fa9a7c144638a9f5c4600df04fa", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/6 [00:00