{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TSLA batch inference\n",
    "\n",
    "Reads the most recent feature rows from the `tesla_stocks_fv` feature view, scores them with the registered `stock_pred_model`, and inserts the predictions into the `stock_prediction_results` feature group.\n",
    "\n",
    "The Hopsworks API key is read from the `hopsworks_api` environment variable, which may be defined in a local `.env` file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from datetime import datetime, timedelta\n",
    "\n",
    "import hopsworks\n",
    "import joblib\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from dotenv import load_dotenv\n",
    "from sklearn.preprocessing import OneHotEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Everything a reader might want to tune lives in this cell.\n",
    "FEATURE_VIEW_NAME = 'tesla_stocks_fv'\n",
    "FEATURE_VIEW_VERSION = 5\n",
    "TRAINING_DATASET_VERSION = 1\n",
    "\n",
    "MODEL_NAME = 'stock_pred_model'\n",
    "MODEL_VERSION = 28\n",
    "MODEL_FILE = 'stock_prediction_model.pkl'\n",
    "\n",
    "RESULTS_FG_NAME = 'stock_prediction_results'\n",
    "RESULTS_FG_VERSION = 4\n",
    "\n",
    "# Factor applied to the raw model output before it is stored.\n",
    "# NOTE(review): presumably undoes the target scaling used in training - confirm against the training pipeline.\n",
    "PREDICTION_SCALE = 100\n",
    "\n",
    "# Make the variables from the .env file visible through os.environ.\n",
    "load_dotenv()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Connect to Hopsworks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One login is enough: the same feature store handle is reused when the results are written.\n",
    "api_key = os.environ.get('hopsworks_api')\n",
    "project = hopsworks.login(api_key_value=api_key)\n",
    "fs = project.get_feature_store()\n",
    "mr = project.get_model_registry()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Scoring window"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Take a single clock reading so both bounds are derived from the same instant.\n",
    "# NOTE(review): datetime.now() is local time while the feature dates appear to be UTC - confirm the window is as intended.\n",
    "now = datetime.now()\n",
    "start_date = now - timedelta(hours=48)\n",
    "end_date = now - timedelta(hours=24)\n",
    "print(f'Scoring window: {start_date:%Y-%m-%d} -> {end_date:%Y-%m-%d}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the batch features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_view = fs.get_feature_view(FEATURE_VIEW_NAME, FEATURE_VIEW_VERSION)\n",
    "feature_view.init_batch_scoring(training_dataset_version=TRAINING_DATASET_VERSION)\n",
    "print(feature_view.get_batch_query())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A clean 0..n-1 index keeps every later column-wise concat row-aligned.\n",
    "batch_df = feature_view.get_batch_data(start_time=start_date, end_time=end_date).reset_index(drop=True)\n",
    "\n",
    "# Fail early with a clear message instead of a cryptic encoder/model error further down.\n",
    "if batch_df.empty:\n",
    "    raise ValueError(f'No feature rows between {start_date} and {end_date}')\n",
    "\n",
    "batch_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build the model input"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One-hot encode the ticker column.\n",
    "# NOTE(review): the encoder is fitted on the scoring batch itself, so its columns only match\n",
    "# training when the batch holds the same tickers - confirm against the training pipeline.\n",
    "encoder = OneHotEncoder()\n",
    "ticker_onehot = pd.DataFrame(\n",
    "    encoder.fit_transform(batch_df[['ticker']]).toarray(),\n",
    "    columns=encoder.get_feature_names_out(['ticker']),\n",
    "    index=batch_df.index,\n",
    ")\n",
    "\n",
    "# The model input is positional, so the column order is fixed:\n",
    "# remaining feature columns, ticker_* indicators, then year, month, day.\n",
    "feature_df = pd.concat(\n",
    "    [batch_df.drop(columns=['date', 'ticker']), ticker_onehot],\n",
    "    axis=1,\n",
    ").assign(\n",
    "    year=batch_df['date'].dt.year,\n",
    "    month=batch_df['date'].dt.month,\n",
    "    day=batch_df['date'].dt.day,\n",
    ")\n",
    "\n",
    "assert len(feature_df) == len(batch_df), 'feature rows must match batch rows'\n",
    "feature_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Insert a length-1 middle axis for the LSTM: (rows, features) -> (rows, 1, features).\n",
    "model_input = np.expand_dims(feature_df.to_numpy(), axis=1)\n",
    "model_input.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the model and predict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "registered_model = mr.get_model(MODEL_NAME, version=MODEL_VERSION)\n",
    "model_dir = registered_model.download()\n",
    "\n",
    "# joblib.load unpickles the file, which can execute arbitrary code:\n",
    "# only load artifacts that come from a registry you trust.\n",
    "model = joblib.load(os.path.join(model_dir, MODEL_FILE))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_predictions = np.asarray(model.predict(model_input), dtype=np.float32)\n",
    "assert raw_predictions.size == len(batch_df), 'expected exactly one prediction per feature row'\n",
    "\n",
    "# Keep one value per row (not just the first) and widen to float64 before scaling.\n",
    "predictions = raw_predictions.reshape(-1).astype(np.float64) * PREDICTION_SCALE\n",
    "predictions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Assemble and store the results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Resulting column order: remaining feature columns, predictions, date, ticker.\n",
    "# The date is rebuilt from year/month/day, which yields a timezone-naive calendar date.\n",
    "results_df = batch_df.drop(columns=['date', 'ticker']).assign(\n",
    "    predictions=predictions,\n",
    "    date=pd.to_datetime(feature_df[['year', 'month', 'day']]),\n",
    "    ticker=batch_df['ticker'],\n",
    ")\n",
    "results_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results_fg = fs.get_or_create_feature_group(\n",
    "    name=RESULTS_FG_NAME,\n",
    "    version=RESULTS_FG_VERSION,\n",
    "    description='Predction of TSLA open stock price',\n",
    "    primary_key=['ticker'],\n",
    "    # Pass the column name as a string; the single-element list form is deprecated.\n",
    "    event_time='date',\n",
    "    online_enabled=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start the upload without blocking until the ingestion job has finished.\n",
    "results_fg.insert(results_df, write_options={'wait_for_job': False})"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}