{
"cells": [
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd \n",
"import hopsworks \n",
"from datetime import datetime, timedelta\n",
"from SML.training_pipeline import model_dir\n",
"import numpy as np\n",
"\n",
"\n",
"\n",
"#Making the notebook able to fetch from the .env file\n",
"from dotenv import load_dotenv\n",
"import os\n",
"\n",
"load_dotenv()"
]
},
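{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Quick sanity check (our addition): load_dotenv() only helps if the .env file\n",
"# actually defines the variable the login cells below read via os.environ.get.\n",
"assert os.environ.get('hopsworks_api'), \\\n",
"    'Add a line like hopsworks_api=<your API key> to the .env file'"
]
},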
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Connection closed.\n",
"Connected. Call `.close()` to terminate connection gracefully.\n",
"\n",
"Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/693399\n",
"Connected. Call `.close()` to terminate connection gracefully.\n",
"Connected. Call `.close()` to terminate connection gracefully.\n"
]
}
],
"source": [
"api_key = os.environ.get('hopsworks_api')\n",
"project = hopsworks.login(api_key_value=api_key)\n",
"fs = project.get_feature_store()\n",
"mr = project.get_model_registry() "
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2024-05-12\n"
]
}
],
"source": [
"start_date = datetime.now() - timedelta(hours=48)\n",
"print(start_date.strftime(\"%Y-%m-%d\"))"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2024-05-13\n"
]
}
],
"source": [
"end_date = datetime.now() - timedelta(hours=24)\n",
"print(end_date.strftime(\"%Y-%m-%d\"))"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"feature_view = fs.get_feature_view('tesla_stocks_fv', 5)\n",
"feature_view.init_batch_scoring(training_dataset_version=1)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"WITH right_fg0 AS (SELECT *\n",
"FROM (SELECT `fg1`.`date` `date`, `fg1`.`ticker` `ticker`, `fg1`.`ticker` `join_pk_ticker`, `fg1`.`date` `join_evt_date`, `fg0`.`sentiment` `sentiment`, RANK() OVER (PARTITION BY `fg1`.`ticker`, `fg1`.`date` ORDER BY `fg0`.`date` DESC) pit_rank_hopsworks\n",
"FROM `klittefr_featurestore`.`tesla_stock_5` `fg1`\n",
"INNER JOIN `klittefr_featurestore`.`news_sentiment_updated_5` `fg0` ON `fg1`.`ticker` = `fg0`.`ticker` AND `fg1`.`date` >= `fg0`.`date`) NA\n",
"WHERE `pit_rank_hopsworks` = 1) (SELECT `right_fg0`.`date` `date`, `right_fg0`.`ticker` `ticker`, `right_fg0`.`sentiment` `sentiment`\n",
"FROM right_fg0)\n"
]
}
],
"source": [
"print(feature_view.get_batch_query())"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.20s) \n"
]
}
],
"source": [
"# we had problems fetching the data from fv with get_batch_data function, tried everything and it just doesnt work \n",
"tesla_df_b = feature_view.get_batch_data(start_time=start_date, end_time=end_date)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" date | \n",
" ticker | \n",
" sentiment | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2024-05-13 00:00:00+00:00 | \n",
" TSLA | \n",
" 0.115443 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" date ticker sentiment\n",
"0 2024-05-13 00:00:00+00:00 TSLA 0.115443"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tesla_df_b.head()"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import OneHotEncoder"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"#OneHotEncoding the tesla_df_b column 'ticker'\n",
"\n",
"tickers = tesla_df_b[['ticker']]\n",
"\n",
"# Initializing OneHotEncoder\n",
"encoder = OneHotEncoder()\n",
"\n",
"# Fitting and transforming the 'ticker' column\n",
"ticker_encoded_test = encoder.fit_transform(tickers)\n",
"\n",
"# Converting the encoded column into a DataFrame\n",
"ticker_encoded_df_test = pd.DataFrame(ticker_encoded_test.toarray(), columns=encoder.get_feature_names_out(['ticker']))\n",
"\n",
"# Concatenating the encoded DataFrame with the original DataFrame\n",
"tesla_df_b = pd.concat([tesla_df_b, ticker_encoded_df_test], axis=1)\n",
"\n",
"# Dropping the original 'ticker' column\n",
"tesla_df_b.drop('ticker', axis=1, inplace=True)"
]
},
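{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A minimal sketch of a more robust variant (not part of the original pipeline):\n",
"# fitting a fresh encoder at inference time only works here because 'ticker' has\n",
"# a single known value (TSLA). Ideally the training pipeline would persist its\n",
"# fitted encoder (the artifact name below is hypothetical) so the column order\n",
"# at inference is guaranteed to match training; handle_unknown='ignore' also\n",
"# keeps a never-seen ticker from raising at transform time.\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"\n",
"# encoder = joblib.load(model_dir + \"/ticker_encoder.pkl\")  # hypothetical artifact\n",
"safe_encoder = OneHotEncoder(handle_unknown='ignore')\n",
"safe_encoder.fit(tickers)\n",
"print(safe_encoder.get_feature_names_out(['ticker']))"
]
},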
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2024-05-14 12:30:49,197 WARNING: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n",
"See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n",
"\n"
]
}
],
"source": [
"# As X_train['date'] column exists and is in datetime format, we're converting it\n",
"tesla_df_b['year'] = tesla_df_b['date'].dt.year\n",
"tesla_df_b['month'] = tesla_df_b['date'].dt.month\n",
"tesla_df_b['day'] = tesla_df_b['date'].dt.day\n",
"\n",
"# Dropping the original date column\n",
"tesla_df_b.drop(columns=['date'], inplace=True)\n",
"\n",
"# Converting dataframe to numpy array\n",
"tesla_df_b_array = tesla_df_b.to_numpy()\n",
"\n",
"# Reshaping the array to have a shape suitable for LSTM\n",
"tesla_df_b_array = np.expand_dims(tesla_df_b_array, axis=1)"
]
},
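{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check (our addition): Keras LSTMs expect input shaped\n",
"# (samples, timesteps, features). With one batch row and the five remaining\n",
"# columns (sentiment, ticker_TSLA, year, month, day) this should print\n",
"# something like (1, 1, 5).\n",
"print(tesla_df_b_array.shape)\n",
"assert tesla_df_b_array.ndim == 3"
]
},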
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading model artifact (0 dirs, 3 files)... DONE\r"
]
}
],
"source": [
"import joblib\n",
"\n",
"the_model = mr.get_model(\"stock_pred_model\", version=28)\n",
"model_dir = the_model.download()\n",
"\n",
"model = joblib.load(model_dir + \"/stock_prediction_model.pkl\")"
]
},
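{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check (our addition): list the downloaded artifact directory\n",
"# to confirm stock_prediction_model.pkl is among the files reported above.\n",
"print(os.listdir(model_dir))"
]
},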
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 1s/step\n"
]
}
],
"source": [
"predictions = model.predict(tesla_df_b_array)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0.8625609]], dtype=float32)"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predictions "
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"86.25609278678894\n"
]
}
],
"source": [
"import numpy as np\n",
"\n",
"# Our predictions array\n",
"predictions = np.array(predictions, dtype=np.float32)\n",
"\n",
"# Changing the format of the predicted value to correspond with format of \"open\"\n",
"predictions = predictions[0][0]*100\n",
"print(predictions)\n"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"tesla_df_b['predictions'] = predictions.tolist()"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"# Assuming you have 'year', 'month', and 'day' columns in your DataFrame\n",
"tesla_df_b['date'] = pd.to_datetime(tesla_df_b[['year', 'month', 'day']])\n",
"\n",
"# Now you can drop the 'year', 'month', and 'day' columns if you want\n",
"tesla_df_b.drop(columns=['year', 'month', 'day'], inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"tesla_df_b['date'] = pd.to_datetime(tesla_df_b['date'])"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sentiment | \n",
" ticker_TSLA | \n",
" predictions | \n",
" date | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0.115443 | \n",
" 1.0 | \n",
" 86.256093 | \n",
" 2024-05-13 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sentiment ticker_TSLA predictions date\n",
"0 0.115443 1.0 86.256093 2024-05-13"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tesla_df_b"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"# Convert the encoded DataFrame back to numpy array\n",
"ticker_encoded_array = ticker_encoded_df_test.to_numpy()\n",
"\n",
"# Inverse transform the encoded array to retrieve the original values\n",
"original_tickers = encoder.inverse_transform(ticker_encoded_array)\n",
"\n",
"# Convert the original_tickers array to a DataFrame\n",
"original_tickers_df = pd.DataFrame(original_tickers, columns=['ticker'])\n",
"\n",
"# Concatenate the original ticker column with the remaining columns from tesla_df_b\n",
"tesla_df_b = pd.concat([tesla_df_b.drop(columns=['ticker_TSLA']), original_tickers_df], axis=1)\n"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sentiment | \n",
" predictions | \n",
" date | \n",
" ticker | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0.115443 | \n",
" 86.256093 | \n",
" 2024-05-13 | \n",
" TSLA | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sentiment predictions date ticker\n",
"0 0.115443 86.256093 2024-05-13 TSLA"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tesla_df_b.head()"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
"#from sklearn.preprocessing import MinMaxScaler\n",
"\n",
"# Flatten the list of lists into a single list\n",
"#flat_predictions_scaled = [item for sublist in predictions_scaled for item in sublist]\n",
"\n",
"# Initialize the MinMaxScaler\n",
"#scaler = MinMaxScaler()\n",
"\n",
"# Fit the scaler to the scaled predictions\n",
"#scaler.fit(flat_predictions_scaled)\n",
"\n",
"# Inverse transform the scaled predictions to get the original values\n",
"#predictions_unscaled = scaler.inverse_transform(flat_predictions_scaled)\n",
"\n",
"# Update the 'predictions' column with the unscaled values\n",
"#tesla_df_b['predictions'] = predictions_unscaled"
]
},
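{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A sketch of how the un-scaling above could work correctly (our addition):\n",
"# a MinMaxScaler fit on the predictions themselves cannot recover the original\n",
"# scale; the scaler has to be the one fit on the target during training. The\n",
"# artifact name below is hypothetical, so the sketch stays commented out.\n",
"#\n",
"# scaler = joblib.load(model_dir + \"/target_scaler.pkl\")  # hypothetical artifact\n",
"# raw = model.predict(tesla_df_b_array)                    # shape (n, 1)\n",
"# predictions_unscaled = scaler.inverse_transform(raw)\n",
"# tesla_df_b['predictions'] = predictions_unscaled.ravel()"
]
},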
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Connection closed.\n",
"Connected. Call `.close()` to terminate connection gracefully.\n",
"\n",
"Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/693399\n",
"Connected. Call `.close()` to terminate connection gracefully.\n"
]
}
],
"source": [
"api_key = os.environ.get('hopsworks_api')\n",
"project = hopsworks.login(api_key_value=api_key)\n",
"fs = project.get_feature_store()"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2024-05-14 12:39:44,585 WARNING: DeprecationWarning: Providing event_time as a single-element list is deprecated and will be dropped in future versions. Provide the feature_name string instead.\n",
"\n"
]
}
],
"source": [
"results_fg = fs.get_or_create_feature_group(\n",
" name= 'stock_prediction_results',\n",
" version = 4,\n",
" description = 'Predction of TSLA open stock price',\n",
" primary_key = ['ticker'],\n",
" event_time = ['date'],\n",
" online_enabled = False,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Feature Group created successfully, explore it at \n",
"https://c.app.hopsworks.ai:443/p/693399/fs/689222/fg/814414\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "33665584853d402aaa2c6c8dc2386ed5",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Uploading Dataframe: 0.00% | | Rows 0/1 | Elapsed Time: 00:00 | Remaining Time: ?"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Launching job: stock_prediction_results_4_offline_fg_materialization\n",
"Job started successfully, you can follow the progress at \n",
"https://c.app.hopsworks.ai/p/693399/jobs/named/stock_prediction_results_4_offline_fg_materialization/executions\n"
]
},
{
"data": {
"text/plain": [
"(, None)"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Inserting the stock data into the stocks feature group\n",
"results_fg.insert(tesla_df_b, write_options={\"wait_for_job\" : False})"
]
}
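,
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional verification (our addition): read the feature group back to confirm\n",
"# the insert landed. The offline materialization job launched above may take a\n",
"# minute, so freshly inserted rows can lag behind.\n",
"results_fg.read().tail()"
]
}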
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}