{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "authorship_tag": "ABX9TyO1k4BL8RYqJqf+FTqQWuh+", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "gpuClass": "standard", "accelerator": "GPU" }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "code", "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')\n", "project_path = '/content/drive/MyDrive/projects/Stock_Predicter/v1'\n", "%cd $project_path" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Xr3Qozgfktoc", "outputId": "7d93f2be-afa1-402c-ec77-24e5d1cf5f8e" }, "execution_count": 1, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Mounted at /content/drive\n", "/content/drive/MyDrive/projects/Stock_Predicter/v1\n" ] } ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "id": "e8SQqogMQYLh" }, "outputs": [], "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import pandas as pd\n", "import pandas_datareader as web\n", "import datetime as dt\n", "import yfinance as yfin\n", "import tensorflow as tf\n", "import os\n", "import re\n", "\n", "from sklearn.preprocessing import MinMaxScaler\n", "from tensorflow.keras.models import Sequential\n", "from tensorflow.keras.layers import Dense, Dropout, LSTM\n" ] }, { "cell_type": "markdown", "source": [ "# Get Data" ], "metadata": { "id": "5vO8pty3VwkG" } }, { "cell_type": "code", "source": [ "# Select a company for now\n", "ticker = 'AAPL'\n", "\n", "start = dt.datetime(2013,1,1)\n", "end = dt.datetime(2023,4,5)" ], "metadata": { "id": "O6dtJpJwS5Eg" }, "execution_count": 3, "outputs": [] }, { "cell_type": "code", "source": [ "yfin.pdr_override()\n", "data = web.data.get_data_yahoo(ticker, start, end)\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "LwPyk8Uh-Zz_", "outputId": "5a636265-2f20-46ad-bc0d-30adbf2b630b" }, "execution_count": 4, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\r[*********************100%***********************] 1 of 1 completed\n" ] } ] }, { "cell_type": "markdown", "source": [ "# Preprocess_data" ], "metadata": { "id": "SSuS9OONV5-a" } }, { "cell_type": "code", "source": [ "def create_remove_columns(data):\n", " # create jump column\n", " data = pd.DataFrame.copy(data)\n", " data['Jump'] = data['Open'] - data['Close'].shift(1)\n", " data['Jump'].fillna(0, inplace=True)\n", " # data = data.reindex(columns=['Open', 'High', 'Low', 'Close', 'Adj Close', 'Jump'])\n", " data.insert(0,'Jump', data.pop('Jump'))\n", " return data" ], "metadata": { "id": "Bpym8x-Kxf0p" }, "execution_count": 5, "outputs": [] }, { "cell_type": "code", "source": [ "def normalize_data(data, scaler=None):\n", " the_data = pd.DataFrame.copy(data)\n", " # substract the open value to all columns but the first one and the last one which are \"Jump\" and \"Volume\"\n", " the_data.iloc[:, 1:-1] = the_data.iloc[:,1:-1] - the_data['Open'].values[:, np.newaxis]\n", " # print('the_data')\n", " # print(the_data)\n", "\n", " the_data.pop('Open')\n", " # todo save an csv with the values for the scaler\n", " if scaler is None:\n", " # Create the scaler\n", " values = np.abs(the_data.values)\n", " max_value = np.max(values[:,:-1])\n", " max_volume = np.max(values[:,-1])\n", " def scaler(d):\n", " data = pd.DataFrame.copy(d)\n", " print('max_value: ', 
max_value)\n", " print('max_volume: ', max_volume)\n", " data.iloc[:, :-1] = data.iloc[:,:-1].apply(lambda x: x/max_value)\n", " data.iloc[:, -1] = data.iloc[:,-1].apply(lambda x: x/max_volume)\n", " return data\n", " def decoder(values):\n", " decoded_values = values * max_value\n", " return decoded_values\n", " else:\n", " decoder = None\n", " \n", " normalized_data = scaler(the_data)\n", "\n", " return normalized_data, scaler, decoder\n", "\n", "\n" ], "metadata": { "id": "v9RoqzBvtrOb" }, "execution_count": 6, "outputs": [] }, { "cell_type": "code", "source": [ "def create_training_data(norm_data):\n", " prediction_days = 500\n", " \n", " x_train_list = []\n", " y_train_list = []\n", " print('shape norm_data')\n", " print(norm_data.shape)\n", " \n", " for i in range(prediction_days, len(norm_data)):\n", " x_train_list.append(norm_data.iloc[i-prediction_days:i])\n", " y_train_list.append(norm_data.iloc[i-prediction_days+1:i+1,0:4])\n", " \n", " x_train = np.array(x_train_list)\n", " y_train = np.array(y_train_list)\n", " return x_train, y_train" ], "metadata": { "id": "jMXkRAYFomHM" }, "execution_count": 7, "outputs": [] }, { "cell_type": "code", "source": [ "x = np.random.randint(5, size=(5,6))\n", "print(x)\n", "print(x[2:4, 0:4])" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "TX8fsigbOOzE", "outputId": "a5e4e0f3-8814-4e28-d5e2-33f4e80954b5" }, "execution_count": 8, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "[[0 3 4 2 4 1]\n", " [4 4 1 0 3 4]\n", " [0 3 3 3 2 0]\n", " [3 1 3 3 0 4]\n", " [1 0 0 1 0 3]]\n", "[[0 3 3 3]\n", " [3 1 3 3]]\n" ] } ] }, { "cell_type": "code", "source": [ "#Make all the preprocesing\n", "def preprocessing(data, scaler=None):\n", " # print(data.head(3))\n", " data_0 = create_remove_columns(data)\n", " # print(data_0.head(3))\n", " #todo: save the_scaler somehow to use in new runtimes\n", " norm_data, scaler, decoder = normalize_data(data_0, scaler=scaler)\n", " # print(norm_data.head(3))\n", " x_train, y_train = create_training_data(norm_data)\n", " # print(x_train.shape, y_train.shape)\n", " return x_train, y_train, scaler, decoder" ], "metadata": { "id": "YZWMfusT-I7Z" }, "execution_count": 9, "outputs": [] }, { "cell_type": "code", "source": [ "x_train, y_train, scaler, decoder = preprocessing(data)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "PeJjDC0VBG_6", "outputId": "ec7bf037-3d03-47b5-b97b-43e19b9ac9ea" }, "execution_count": 10, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "max_value: 10.589996337890625\n", "max_volume: 1460852400.0\n", "shape norm_data\n", "(2582, 6)\n" ] } ] }, { "cell_type": "code", "source": [ "print(x_train.shape)\n", "print(y_train.shape)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "YkI8vSguuS8A", "outputId": "51e71e32-7e5b-4686-8e01-872ea97f976f" }, "execution_count": 11, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "(2082, 500, 6)\n", "(2082, 500, 4)\n" ] } ] }, { "cell_type": "markdown", "source": [ "# Model" ], "metadata": { "id": "Z3N2WMYNV-qX" } }, { "cell_type": "markdown", "source": [ "## Create Model" ], "metadata": { "id": "emDyvzVUp5KJ" } }, { "cell_type": "code", "source": [ "def create_model():\n", " model = Sequential()\n", " model.add(LSTM(units=1024, return_sequences=True, input_shape=(None,x_train.shape[-1],)))\n", " # model.add(Dropout(0.2))\n", " model.add(LSTM(units=1024, return_sequences=True))\n", " # model.add(Dropout(0.2))\n", " 
{ "cell_type": "code", "source": [ "# model.compile(optimizer='adam', loss='mean_squared_error')\n", "model.compile(optimizer='adam', loss=tf.keras.losses.MeanSquaredError())" ], "metadata": { "id": "ZhoWj_XeXQws" }, "execution_count": 13, "outputs": [] },
{ "cell_type": "code", "source": [ "# Flip to True to resume from a previously saved checkpoint\n", "if False:\n", "    model.load_weights('./training_checkpoints_20230408062729/ckpt_epoch24_loss0.00024580140598118305')" ], "metadata": { "id": "jYCXXkRZraaF" }, "execution_count": 14, "outputs": [] },
{ "cell_type": "markdown", "source": [ "## Model Train" ], "metadata": { "id": "65QbfffusPoJ" } },
{ "cell_type": "code", "source": [ "print(x_train.shape)\n", "print(y_train.shape)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "HDT9XPXHvqyN", "outputId": "041baee8-f87c-47aa-af07-8fb6dc56186f" }, "execution_count": 15, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "(2082, 500, 6)\n", "(2082, 500, 4)\n" ] } ] },
{ "cell_type": "code", "source": [ "# Change to False to avoid training the model\n", "to_train = True\n", "if to_train:\n", "    # Directory where the checkpoints will be saved\n", "    checkpoint_dir = './training_checkpoints_' + dt.datetime.now().strftime(\"%Y%m%d%H%M%S\")\n", "    # Name of the checkpoint files\n", "    # checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt_epoch{epoch}_loss{loss}\")\n", "    checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")\n", "\n", "    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(\n", "        filepath=checkpoint_prefix,\n", "        save_weights_only=True,\n", "        monitor=\"loss\", mode=\"min\",\n", "        save_best_only=True)\n", "\n", "    model.fit(x_train, y_train, epochs=25, batch_size=32, callbacks=[checkpoint_callback])\n" ], "metadata": { "id": "9Ccc_Ej2TmYO" }, "execution_count": 16, "outputs": [] },
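{ "cell_type": "markdown", "source": [ "Note: with the plain `ckpt` prefix, `save_best_only=True` keeps a single best checkpoint that `tf.train.latest_checkpoint` can find. The epoch/loss-tagged filenames that `load_weights(epoch=...)` in the testing section looks for come from the commented-out `checkpoint_prefix` above. A minimal sketch of that variant, assuming the training cell has run so `checkpoint_dir` exists:" ], "metadata": {} },
{ "cell_type": "code", "source": [ "# Sketch only: Keras substitutes {epoch} and any logged metric such as {loss}\n", "# into the checkpoint path, producing names like ckpt_epoch24_loss0.000245...\n", "tagged_prefix = os.path.join(checkpoint_dir, \"ckpt_epoch{epoch}_loss{loss}\")\n", "tagged_callback = tf.keras.callbacks.ModelCheckpoint(\n", "    filepath=tagged_prefix,\n", "    save_weights_only=True,\n", "    monitor=\"loss\", mode=\"min\",\n", "    save_best_only=True)" ], "metadata": {}, "execution_count": null, "outputs": [] },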
"outputs": [] }, { "cell_type": "code", "source": [ "# if checkpoint_dir does not exists, select the one stated in the except block\n", "try:\n", " checkpoint_dir\n", "except NameError: \n", " checkpoint_dir = 'training_checkpoints_20230408073359'\n", "\n", "def load_weights(epoch=None):\n", " if epoch is None:\n", " weights_file = tf.train.latest_checkpoint(checkpoint_dir)\n", " else:\n", " with os.scandir(checkpoint_dir) as entries:\n", " for entry in entries:\n", " if re.search(f'^ckpt_epoch{epoch}_.*\\.index', entry.name):\n", " weights_file = checkpoint_dir + '/'+ entry.name[:-6]\n", "\n", " print('weights_file')\n", " print(weights_file)\n", " model.load_weights(weights_file)\n", " return model\n", "\n", "model = load_weights(epoch=None)\n", "model_filepath = 'saved_model'\n", "model.save(model_filepath)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "wQ0JTXsp4VKF", "outputId": "57440f25-6468-404c-f0a6-e99f004f9e67" }, "execution_count": 28, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "weights_file\n", "training_checkpoints_20230408073359/ckpt\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "WARNING:absl:Found untraced functions such as lstm_cell_3_layer_call_fn, lstm_cell_3_layer_call_and_return_conditional_losses, lstm_cell_4_layer_call_fn, lstm_cell_4_layer_call_and_return_conditional_losses, lstm_cell_5_layer_call_fn while saving (showing 5 of 6). These functions will not be directly callable after loading.\n" ] } ] }, { "cell_type": "code", "source": [ "test_start = dt.datetime(2016,1,1)\n", "test_end = dt.datetime(2023,4,5)\n", "ticker = 'AAPL'\n", "\n", "yfin.pdr_override()\n", "test_data = web.data.get_data_yahoo(ticker, test_start, test_end)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Mf4q97pfaSCA", "outputId": "5682a5dd-6968-4098-c203-31bc6c14e47c" }, "execution_count": 24, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\r[*********************100%***********************] 1 of 1 completed\n" ] } ] }, { "cell_type": "code", "source": [ "load_whole_model = False\n", "if load_whole_model:\n", " model = tf.keras.models.load_model(model_filepath)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "uloiDJXRbrJs", "outputId": "a3aeb021-5555-413f-eb40-cce7ebe60ed5" }, "execution_count": 29, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "WARNING:tensorflow:No training configuration found in save file, so the model was *not* compiled. 
Compile it manually.\n" ] } ] }, { "cell_type": "code", "source": [ "# def close_tester(model, test_data, scaler=None):\n", "scaler = scaler\n", "test_x_train, test_y_train, _, _ = preprocessing(test_data, scaler=scaler)\n", "print(test_x_train.shape)\n", "print(test_y_train.shape)\n", "results = model.predict(test_x_train)\n", "# the results are tensors of 4 numbers, Jump, High, Low, and Close respectively\n", "\n", "# close_tester(test_model, test_data, scaler=the_scaler)\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "MqCeMf3UoxZm", "outputId": "1a448123-6253-4585-c7af-5ff7491c1628" }, "execution_count": 30, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "max_value: 10.589996337890625\n", "max_volume: 1460852400.0\n", "shape norm_data\n", "(1826, 6)\n", "(1326, 500, 6)\n", "(1326, 500, 4)\n", "42/42 [==============================] - 12s 270ms/step\n" ] } ] }, { "cell_type": "code", "source": [ "right_counter = 0\n", "wrong_counter = 0\n", "no_action_counter = 0\n", "# for result, expected in zip(results[:2], test_y_train[:2]):\n", "for result, expected in zip(results[:], test_y_train[:]):\n", " # print(result)\n", " # print(expected)\n", " comparer = result[-1][3] * expected[-1][3]\n", " if comparer > 0:\n", " right_counter += 1\n", " elif comparer == 0:\n", " no_action_counter\n", " elif comparer < 0:\n", " wrong_counter += 1\n", "\n", " # print('expected: ', decoder(expected))\n", " # print('result: ', decoder(result))\n", "\n", "print('right_counter :', right_counter)\n", "print('no_action_counter :',no_action_counter)\n", "print('wrong_counter :', wrong_counter)\n", "print('success rate: {}%'.format(right_counter*100/len(results)))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "AVYFQZnqEqhx", "outputId": "8110f58a-7344-42b0-ed9a-ba77a4999e30" }, "execution_count": 31, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "right_counter : 1300\n", "no_action_counter : 0\n", "wrong_counter : 22\n", "success rate: 98.03921568627452%\n" ] } ] }, { "cell_type": "code", "source": [ "test_data.iloc[500,:]" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "gyhzy_l6sAvi", "outputId": "6bd3c88a-b61b-44e8-d16d-0880a17ebab2" }, "execution_count": 32, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Open 4.252500e+01\n", "High 4.269500e+01\n", "Low 4.242750e+01\n", "Close 4.265000e+01\n", "Adj Close 4.049405e+01\n", "Volume 8.599280e+07\n", "Name: 2017-12-27 00:00:00, dtype: float64" ] }, "metadata": {}, "execution_count": 32 } ] } ] }