{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Vx7KFfeieD7z"
      },
      "source": [
        "# RWKV-v4-RNN-Pile Fine-Tuning\n",
        "\n",
        "[RWKV](https://github.com/BlinkDL/RWKV-LM) is an RNN with transformer-level performance\n",
        "\n",
        "\n",
        "This notebook aims to streamline fine-tuning RWKV-v4 models"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "7JFIiAsrfvJy"
      },
      "source": [
        "\n",
        "## Setup"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "g_qFjgYmtSfK"
      },
      "outputs": [],
      "source": [
        "#@title Google Drive Options { display-mode: \"form\" }\n",
        "save_models_to_drive = True #@param {type:\"boolean\"}\n",
        "drive_mount = '/content/drive' #@param {type:\"string\"}\n",
        "output_dir = 'rwkv-v4-rnn-pile-tuning' #@param {type:\"string\"}\n",
        "tuned_model_name = 'tuned' #@param {type:\"string\"}\n",
        "\n",
        "import os\n",
        "from google.colab import drive\n",
        "if save_models_to_drive:\n",
        "    from google.colab import drive\n",
        "    drive.mount(drive_mount)\n",
        "    \n",
        "output_path = f\"{drive_mount}/MyDrive/{output_dir}\" if save_models_to_drive else f\"/content/{output_dir}\"\n",
        "os.makedirs(f\"{output_path}/{tuned_model_name}\", exist_ok=True)\n",
        "os.makedirs(f\"{output_path}/base_models/\", exist_ok=True)\n",
        "\n",
        "print(f\"Saving models to {output_path}\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "eivKJ6FP1_9z",
        "outputId": "a687e3ad-8158-492a-da86-4f4ed8804699",
        "colab": {
          "base_uri": "https://localhost:8080/"
        }
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Fri Sep  2 16:11:37 2022       \n",
            "+-----------------------------------------------------------------------------+\n",
            "| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |\n",
            "|-------------------------------+----------------------+----------------------+\n",
            "| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |\n",
            "| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |\n",
            "|                               |                      |               MIG M. |\n",
            "|===============================+======================+======================|\n",
            "|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |\n",
            "| N/A   35C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |\n",
            "|                               |                      |                  N/A |\n",
            "+-------------------------------+----------------------+----------------------+\n",
            "                                                                               \n",
            "+-----------------------------------------------------------------------------+\n",
            "| Processes:                                                                  |\n",
            "|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |\n",
            "|        ID   ID                                                   Usage      |\n",
            "|=============================================================================|\n",
            "|  No running processes found                                                 |\n",
            "+-----------------------------------------------------------------------------+\n"
          ]
        }
      ],
      "source": [
        "!nvidia-smi"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "R4lt0FTegJw9"
      },
      "outputs": [],
      "source": [
        "!git clone https://github.com/blinkdl/RWKV-LM\n",
        "repo_dir = \"/content/RWKV-LM/RWKV-v4\"\n",
        "%cd $repo_dir"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "RDavUrBsgKIV"
      },
      "outputs": [],
      "source": [
        "!pip install transformers pytorch-lightning==1.9 deepspeed wandb ninja"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Wt7y7vR6e6U3"
      },
      "source": [
        "## Load Base Model\n",
        "\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "KIgagN-Se3wi"
      },
      "outputs": [],
      "source": [
        "#@title Base Model Options\n",
        "#@markdown Using any of the listed options will download the checkpoint from huggingface\n",
        "\n",
        "base_model_name = \"RWKV-4-Pile-169M\" #@param [\"RWKV-4-Pile-1B5\", \"RWKV-4-Pile-430M\", \"RWKV-4-Pile-169M\"]\n",
        "base_model_url = f\"https://huggingface.co/BlinkDL/{base_model_name.lower()}\"\n",
        "\n",
        "# This may take a while\n",
        "!git lfs clone $base_model_url\n",
        "\n",
        "from glob import glob\n",
        "base_model_path = glob(f\"{base_model_name.lower()}/{base_model_name}*.pth\")[0]\n",
        "\n",
        "print(f\"Using {base_model_path} as base\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "hCOPnLelfJgP"
      },
      "source": [
        "## Generate Training Data"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "wW5OmlXmvaIU",
        "cellView": "form"
      },
      "outputs": [],
      "source": [
        "#@title Training Data Options\n",
        "#@markdown `input_file` should be the path to a single file that contains the text you want to fine-tune with.\n",
        "#@markdown Either upload a file to this notebook instance or reference a file in your Google drive.\n",
        "\n",
        "import numpy as np\n",
        "from transformers import PreTrainedTokenizerFast\n",
        "\n",
        "tokenizer = PreTrainedTokenizerFast(tokenizer_file=f'{repo_dir}/20B_tokenizer.json')\n",
        "\n",
        "input_file = \"/content/drive/MyDrive/training.txt\" #@param {type:\"string\"}\n",
        "output_file = 'train.npy'\n",
        "\n",
        "print(f'Tokenizing {input_file} (VERY slow. please wait)')\n",
        "\n",
        "data_raw = open(input_file, encoding=\"utf-8\").read()\n",
        "print(f'Raw length = {len(data_raw)}')\n",
        "\n",
        "data_code = tokenizer.encode(data_raw)\n",
        "print(f'Tokenized length = {len(data_code)}')\n",
        "\n",
        "out = np.array(data_code, dtype='uint16')\n",
        "np.save(output_file, out, allow_pickle=False)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "I4lz-3maeIwY"
      },
      "source": [
        "## Training"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "fuCw5_ASwMud"
      },
      "outputs": [],
      "source": [
        "#@title Training Options { display-mode: \"form\" }\n",
        "from shutil import copy\n",
        "import os\n",
        "\n",
        "def training_options():\n",
        "    EXPRESS_PILE_MODE = True\n",
        "    EXPRESS_PILE_MODEL_NAME = base_model_path.split(\".\")[0]\n",
        "    EXPRESS_PILE_MODEL_TYPE = base_model_name\n",
        "    n_epoch = 100 #@param {type:\"integer\"}\n",
        "    epoch_save_frequency = 25 #@param {type:\"integer\"}\n",
        "    batch_size =  11#@param {type:\"integer\"} \n",
        "    ctx_len = 384 #@param {type:\"integer\"}\n",
        "    epoch_save_path = f\"{output_path}/{tuned_model_name}\"\n",
        "    return locals()\n",
        "\n",
        "def model_options():\n",
        "    T_MAX = 384 #@param {type:\"integer\"}\n",
        "    return locals()\n",
        "\n",
        "def env_vars():\n",
        "    RWKV_FLOAT_MODE = 'fp16' #@param ['fp16', 'bf16', 'bf32'] {type:\"string\"}\n",
        "    RWKV_DEEPSPEED = '0' #@param ['0', '1'] {type:\"string\"}\n",
        "    return {f\"os.environ['{key}']\": value for key, value in locals().items()}\n",
        "\n",
        "def replace_lines(file_name, to_replace):\n",
        "    with open(file_name, 'r') as f:\n",
        "        lines = f.readlines()\n",
        "    with open(f'{file_name}.tmp', 'w') as f:\n",
        "        for line in lines:\n",
        "            key = line.split(\" =\")[0]\n",
        "            if key.strip() in to_replace:\n",
        "                value = to_replace[key.strip()]\n",
        "                if isinstance(value, str):\n",
        "                    f.write(f'{key} = \"{value}\"\\n')\n",
        "                else:\n",
        "                    f.write(f'{key} = {value}\\n')\n",
        "            else:\n",
        "                f.write(line)\n",
        "    copy(f'{file_name}.tmp', file_name)\n",
        "    os.remove(f'{file_name}.tmp')\n",
        "\n",
        "values = training_options()\n",
        "values.update(env_vars())\n",
        "replace_lines('train.py', values)\n",
        "replace_lines('src/model.py', model_options())"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!python train.py "
      ],
      "metadata": {
        "id": "0ZSF8U-nzylI"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "pcDci4O7xJiZ"
      },
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "name": "RWKV-v4-RNN-Pile Fine-Tuning",
      "provenance": [],
      "toc_visible": true
    },
    "gpuClass": "standard",
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}