{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "authorship_tag": "ABX9TyO1k4BL8RYqJqf+FTqQWuh+",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "gpuClass": "standard",
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/jsebdev/Stock_Predictor/blob/main/stock_predictor.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from google.colab import drive\n",
        "drive.mount('/content/drive')\n",
        "project_path = '/content/drive/MyDrive/projects/Stock_Predicter/v1'\n",
        "%cd $project_path"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Xr3Qozgfktoc",
        "outputId": "7d93f2be-afa1-402c-ec77-24e5d1cf5f8e"
      },
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Mounted at /content/drive\n",
            "/content/drive/MyDrive/projects/Stock_Predicter/v1\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {
        "id": "e8SQqogMQYLh"
      },
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "import matplotlib.pyplot as plt\n",
        "import pandas as pd\n",
        "import pandas_datareader as web\n",
        "import datetime as dt\n",
        "import yfinance as yfin\n",
        "import tensorflow as tf\n",
        "import os\n",
        "import re\n",
        "\n",
        "from sklearn.preprocessing import MinMaxScaler\n",
        "from tensorflow.keras.models import Sequential\n",
        "from tensorflow.keras.layers import Dense, Dropout, LSTM\n"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Get Data"
      ],
      "metadata": {
        "id": "5vO8pty3VwkG"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Select a company for now\n",
        "ticker = 'AAPL'\n",
        "\n",
        "start = dt.datetime(2013,1,1)\n",
        "end = dt.datetime(2023,4,5)"
      ],
      "metadata": {
        "id": "O6dtJpJwS5Eg"
      },
      "execution_count": 3,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "yfin.pdr_override()\n",
        "data = web.data.get_data_yahoo(ticker, start, end)\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "LwPyk8Uh-Zz_",
        "outputId": "5a636265-2f20-46ad-bc0d-30adbf2b630b"
      },
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\r[*********************100%***********************]  1 of 1 completed\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Preprocess_data"
      ],
      "metadata": {
        "id": "SSuS9OONV5-a"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def create_remove_columns(data):\n",
        "  # create jump column\n",
        "  data = pd.DataFrame.copy(data)\n",
        "  data['Jump'] = data['Open'] - data['Close'].shift(1)\n",
        "  data['Jump'].fillna(0, inplace=True)\n",
        "  # data = data.reindex(columns=['Open', 'High', 'Low', 'Close', 'Adj Close', 'Jump'])\n",
        "  data.insert(0,'Jump', data.pop('Jump'))\n",
        "  return data"
      ],
      "metadata": {
        "id": "Bpym8x-Kxf0p"
      },
      "execution_count": 5,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def normalize_data(data, scaler=None):\n",
        "  the_data = pd.DataFrame.copy(data)\n",
        "  # substract the open value to all columns but the first one and the last one which are \"Jump\" and \"Volume\"\n",
        "  the_data.iloc[:, 1:-1] = the_data.iloc[:,1:-1] - the_data['Open'].values[:, np.newaxis]\n",
        "  # print('the_data')\n",
        "  # print(the_data)\n",
        "\n",
        "  the_data.pop('Open')\n",
        "  # todo save an csv with the values for the scaler\n",
        "  if scaler is None:\n",
        "    # Create the scaler\n",
        "    values = np.abs(the_data.values)\n",
        "    max_value = np.max(values[:,:-1])\n",
        "    max_volume = np.max(values[:,-1])\n",
        "    def scaler(d):\n",
        "      data = pd.DataFrame.copy(d)\n",
        "      print('max_value: ', max_value)\n",
        "      print('max_volume: ', max_volume)\n",
        "      data.iloc[:, :-1] = data.iloc[:,:-1].apply(lambda x: x/max_value)\n",
        "      data.iloc[:, -1] = data.iloc[:,-1].apply(lambda x: x/max_volume)\n",
        "      return data\n",
        "    def decoder(values):\n",
        "      decoded_values = values * max_value\n",
        "      return decoded_values\n",
        "  else:\n",
        "    decoder = None\n",
        "  \n",
        "  normalized_data = scaler(the_data)\n",
        "\n",
        "  return normalized_data, scaler, decoder\n",
        "\n",
        "\n"
      ],
      "metadata": {
        "id": "v9RoqzBvtrOb"
      },
      "execution_count": 6,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def create_training_data(norm_data):\n",
        "  prediction_days = 500\n",
        "  \n",
        "  x_train_list = []\n",
        "  y_train_list = []\n",
        "  print('shape norm_data')\n",
        "  print(norm_data.shape)\n",
        "  \n",
        "  for i in range(prediction_days, len(norm_data)):\n",
        "    x_train_list.append(norm_data.iloc[i-prediction_days:i])\n",
        "    y_train_list.append(norm_data.iloc[i-prediction_days+1:i+1,0:4])\n",
        "  \n",
        "  x_train = np.array(x_train_list)\n",
        "  y_train = np.array(y_train_list)\n",
        "  return x_train, y_train"
      ],
      "metadata": {
        "id": "jMXkRAYFomHM"
      },
      "execution_count": 7,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "x = np.random.randint(5, size=(5,6))\n",
        "print(x)\n",
        "print(x[2:4, 0:4])"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "TX8fsigbOOzE",
        "outputId": "a5e4e0f3-8814-4e28-d5e2-33f4e80954b5"
      },
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[[0 3 4 2 4 1]\n",
            " [4 4 1 0 3 4]\n",
            " [0 3 3 3 2 0]\n",
            " [3 1 3 3 0 4]\n",
            " [1 0 0 1 0 3]]\n",
            "[[0 3 3 3]\n",
            " [3 1 3 3]]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "#Make all the preprocesing\n",
        "def preprocessing(data, scaler=None):\n",
        "  # print(data.head(3))\n",
        "  data_0 = create_remove_columns(data)\n",
        "  # print(data_0.head(3))\n",
        "  #todo: save the_scaler somehow to use in new runtimes\n",
        "  norm_data, scaler, decoder = normalize_data(data_0, scaler=scaler)\n",
        "  # print(norm_data.head(3))\n",
        "  x_train, y_train = create_training_data(norm_data)\n",
        "  # print(x_train.shape, y_train.shape)\n",
        "  return x_train, y_train, scaler, decoder"
      ],
      "metadata": {
        "id": "YZWMfusT-I7Z"
      },
      "execution_count": 9,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "x_train, y_train, scaler, decoder = preprocessing(data)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "PeJjDC0VBG_6",
        "outputId": "ec7bf037-3d03-47b5-b97b-43e19b9ac9ea"
      },
      "execution_count": 10,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "max_value:  10.589996337890625\n",
            "max_volume:  1460852400.0\n",
            "shape norm_data\n",
            "(2582, 6)\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "print(x_train.shape)\n",
        "print(y_train.shape)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "YkI8vSguuS8A",
        "outputId": "51e71e32-7e5b-4686-8e01-872ea97f976f"
      },
      "execution_count": 11,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "(2082, 500, 6)\n",
            "(2082, 500, 4)\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Model"
      ],
      "metadata": {
        "id": "Z3N2WMYNV-qX"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Create Model"
      ],
      "metadata": {
        "id": "emDyvzVUp5KJ"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def create_model():\n",
        "  model = Sequential()\n",
        "  model.add(LSTM(units=1024, return_sequences=True, input_shape=(None,x_train.shape[-1],)))\n",
        "  # model.add(Dropout(0.2))\n",
        "  model.add(LSTM(units=1024, return_sequences=True))\n",
        "  # model.add(Dropout(0.2))\n",
        "  model.add(LSTM(units=1024, return_sequences=True))\n",
        "  model.add(Dense(4))\n",
        "  return model\n",
        "\n",
        "model = create_model()\n",
        "print(model.summary())"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "GXhYAKzXVfku",
        "outputId": "0a471170-8a4b-416e-874c-ec5061a21744"
      },
      "execution_count": 12,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Model: \"sequential\"\n",
            "_________________________________________________________________\n",
            " Layer (type)                Output Shape              Param #   \n",
            "=================================================================\n",
            " lstm (LSTM)                 (None, None, 1024)        4222976   \n",
            "                                                                 \n",
            " lstm_1 (LSTM)               (None, None, 1024)        8392704   \n",
            "                                                                 \n",
            " lstm_2 (LSTM)               (None, None, 1024)        8392704   \n",
            "                                                                 \n",
            " dense (Dense)               (None, None, 4)           4100      \n",
            "                                                                 \n",
            "=================================================================\n",
            "Total params: 21,012,484\n",
            "Trainable params: 21,012,484\n",
            "Non-trainable params: 0\n",
            "_________________________________________________________________\n",
            "None\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# model.compile(optimizer='adam', loss='mean_squared_error')\n",
        "model.compile(optimizer='adam', loss=tf.keras.losses.MeanSquaredError())"
      ],
      "metadata": {
        "id": "ZhoWj_XeXQws"
      },
      "execution_count": 13,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "if False:\n",
        "  model.load_weights('./training_checkpoints_20230408062729/ckpt_epoch24_loss0.00024580140598118305')"
      ],
      "metadata": {
        "id": "jYCXXkRZraaF"
      },
      "execution_count": 14,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Model Train"
      ],
      "metadata": {
        "id": "65QbfffusPoJ"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "print(x_train.shape)\n",
        "print(y_train.shape)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "HDT9XPXHvqyN",
        "outputId": "041baee8-f87c-47aa-af07-8fb6dc56186f"
      },
      "execution_count": 15,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "(2082, 500, 6)\n",
            "(2082, 500, 4)\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Change to False to avoid trainging the model\n",
        "to_train = True\n",
        "if to_train:\n",
        "# if True:\n",
        "  # Directory where the checkpoints will be saved\n",
        "  checkpoint_dir = './training_checkpoints_'+dt.datetime.now().strftime(\"%Y%m%d%H%M%S\")\n",
        "  # Name of the checkpoint files\n",
        "  # checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt_epoch{epoch}_loss{loss}\")\n",
        "  checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")\n",
        "  \n",
        "  checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(\n",
        "      filepath=checkpoint_prefix,\n",
        "      save_weights_only=True,\n",
        "      monitor=\"loss\", mode=\"min\",\n",
        "      save_best_only=True)\n",
        "\n",
        "  model.fit(x_train, y_train, epochs=25, batch_size=32, callbacks=[checkpoint_callback])\n"
      ],
      "metadata": {
        "id": "9Ccc_Ej2TmYO"
      },
      "execution_count": 16,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Testing a model"
      ],
      "metadata": {
        "id": "dbSKl47vZvpe"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "#print trainings directories to pick one\n",
        "!ls -ld training_checkpoints_*/"
      ],
      "metadata": {
        "id": "59CDDB0i4yTx",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "fa0788e1-b75e-45dc-a95d-ec3e24d00d70"
      },
      "execution_count": 17,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "drwx------ 2 root root 4096 Apr  8 07:34 training_checkpoints_20230408073359/\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "model = create_model()"
      ],
      "metadata": {
        "id": "tpmru7nG9kbW"
      },
      "execution_count": 18,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# if checkpoint_dir does not exists, select the one stated in the except block\n",
        "try:\n",
        "  checkpoint_dir\n",
        "except NameError: \n",
        "  checkpoint_dir = 'training_checkpoints_20230408073359'\n",
        "\n",
        "def load_weights(epoch=None):\n",
        "  if epoch is None:\n",
        "    weights_file = tf.train.latest_checkpoint(checkpoint_dir)\n",
        "  else:\n",
        "    with os.scandir(checkpoint_dir) as entries:\n",
        "      for entry in entries:\n",
        "        if re.search(f'^ckpt_epoch{epoch}_.*\\.index', entry.name):\n",
        "          weights_file = checkpoint_dir + '/'+ entry.name[:-6]\n",
        "\n",
        "  print('weights_file')\n",
        "  print(weights_file)\n",
        "  model.load_weights(weights_file)\n",
        "  return model\n",
        "\n",
        "model = load_weights(epoch=None)\n",
        "model_filepath = 'saved_model'\n",
        "model.save(model_filepath)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "wQ0JTXsp4VKF",
        "outputId": "57440f25-6468-404c-f0a6-e99f004f9e67"
      },
      "execution_count": 28,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "weights_file\n",
            "training_checkpoints_20230408073359/ckpt\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "WARNING:absl:Found untraced functions such as lstm_cell_3_layer_call_fn, lstm_cell_3_layer_call_and_return_conditional_losses, lstm_cell_4_layer_call_fn, lstm_cell_4_layer_call_and_return_conditional_losses, lstm_cell_5_layer_call_fn while saving (showing 5 of 6). These functions will not be directly callable after loading.\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "test_start = dt.datetime(2016,1,1)\n",
        "test_end = dt.datetime(2023,4,5)\n",
        "ticker = 'AAPL'\n",
        "\n",
        "yfin.pdr_override()\n",
        "test_data = web.data.get_data_yahoo(ticker, test_start, test_end)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Mf4q97pfaSCA",
        "outputId": "5682a5dd-6968-4098-c203-31bc6c14e47c"
      },
      "execution_count": 24,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\r[*********************100%***********************]  1 of 1 completed\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "load_whole_model = False\n",
        "if load_whole_model:\n",
        "  model = tf.keras.models.load_model(model_filepath)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "uloiDJXRbrJs",
        "outputId": "a3aeb021-5555-413f-eb40-cce7ebe60ed5"
      },
      "execution_count": 29,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "WARNING:tensorflow:No training configuration found in save file, so the model was *not* compiled. Compile it manually.\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# def close_tester(model, test_data, scaler=None):\n",
        "scaler = scaler\n",
        "test_x_train, test_y_train, _, _ = preprocessing(test_data, scaler=scaler)\n",
        "print(test_x_train.shape)\n",
        "print(test_y_train.shape)\n",
        "results = model.predict(test_x_train)\n",
        "# the results are tensors of 4 numbers, Jump, High, Low, and Close respectively\n",
        "\n",
        "# close_tester(test_model, test_data, scaler=the_scaler)\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "MqCeMf3UoxZm",
        "outputId": "1a448123-6253-4585-c7af-5ff7491c1628"
      },
      "execution_count": 30,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "max_value:  10.589996337890625\n",
            "max_volume:  1460852400.0\n",
            "shape norm_data\n",
            "(1826, 6)\n",
            "(1326, 500, 6)\n",
            "(1326, 500, 4)\n",
            "42/42 [==============================] - 12s 270ms/step\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "right_counter = 0\n",
        "wrong_counter = 0\n",
        "no_action_counter = 0\n",
        "# for result, expected in zip(results[:2], test_y_train[:2]):\n",
        "for result, expected in zip(results[:], test_y_train[:]):\n",
        "  # print(result)\n",
        "  # print(expected)\n",
        "  comparer = result[-1][3] * expected[-1][3]\n",
        "  if comparer > 0:\n",
        "    right_counter += 1\n",
        "  elif comparer == 0:\n",
        "    no_action_counter\n",
        "  elif comparer < 0:\n",
        "    wrong_counter += 1\n",
        "\n",
        "  # print('expected: ', decoder(expected))\n",
        "  # print('result: ', decoder(result))\n",
        "\n",
        "print('right_counter :', right_counter)\n",
        "print('no_action_counter :',no_action_counter)\n",
        "print('wrong_counter :', wrong_counter)\n",
        "print('success rate: {}%'.format(right_counter*100/len(results)))"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "AVYFQZnqEqhx",
        "outputId": "8110f58a-7344-42b0-ed9a-ba77a4999e30"
      },
      "execution_count": 31,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "right_counter : 1300\n",
            "no_action_counter : 0\n",
            "wrong_counter : 22\n",
            "success rate: 98.03921568627452%\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "test_data.iloc[500,:]"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "gyhzy_l6sAvi",
        "outputId": "6bd3c88a-b61b-44e8-d16d-0880a17ebab2"
      },
      "execution_count": 32,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "Open         4.252500e+01\n",
              "High         4.269500e+01\n",
              "Low          4.242750e+01\n",
              "Close        4.265000e+01\n",
              "Adj Close    4.049405e+01\n",
              "Volume       8.599280e+07\n",
              "Name: 2017-12-27 00:00:00, dtype: float64"
            ]
          },
          "metadata": {},
          "execution_count": 32
        }
      ]
    }
  ]
}