diff --git "a/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/part2.ipynb" "b/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/part2.ipynb" --- "a/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/part2.ipynb" +++ "b/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/part2.ipynb" @@ -1,3 +1,141911 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:12f10826a59669d9dcf90268a0377596a642007f5642c452e87f76612a62939d -size 52757309 +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "718f1c9a", + "metadata": { + "papermill": { + "duration": 0.005477, + "end_time": "2023-09-06T17:24:46.367615", + "exception": false, + "start_time": "2023-09-06T17:24:46.362138", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# RWKV v5\n", + "\n", + "Simple memory training for a small model\n", + "\n", + "**Note:** This project assumes you have the rwkv-infctx conda env setup" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "3237126c", + "metadata": { + "papermill": { + "duration": 0.004583, + "end_time": "2023-09-06T17:24:46.377203", + "exception": false, + "start_time": "2023-09-06T17:24:46.372620", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Basic Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "3b172c13", + "metadata": { + "execution": { + "iopub.execute_input": "2023-09-06T17:24:46.388594Z", + "iopub.status.busy": "2023-09-06T17:24:46.388114Z", + "iopub.status.idle": "2023-09-06T17:24:47.401851Z", + "shell.execute_reply": "2023-09-06T17:24:47.400352Z" + }, + "papermill": { + "duration": 1.022435, + "end_time": "2023-09-06T17:24:47.404551", + "exception": false, + "start_time": "2023-09-06T17:24:46.382116", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CITATION.cff RWKV-v4wavenet\t RWKV-v5headsize2x checkpoint\tnotebook\r\n", + "LICENSE RWKV-v5\t\t RWKV-v5headsize32 datapath\toutput\r\n", + "README.md RWKV-v5-beta2\t RWKV-v5rstack\t docker\r\n", + "RWKV-v4neo RWKV-v5altwavenet RWKV-v5wavenet model\r\n" + ] + } + ], + "source": [ + "# First lets setup the various directories, and init the model\n", + "!ls ../../../../../\n", + "!mkdir -p ../../../../../model/\n", + "!mkdir -p ../../../../../datapath/\n", + "!mkdir -p ../../../../../checkpoint/" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6a5b2ca0", + "metadata": { + "execution": { + "iopub.execute_input": "2023-09-06T17:24:47.418082Z", + "iopub.status.busy": "2023-09-06T17:24:47.416819Z", + "iopub.status.idle": "2023-09-06T17:24:50.719845Z", + "shell.execute_reply": "2023-09-06T17:24:50.718258Z" + }, + "papermill": { + "duration": 3.312823, + "end_time": "2023-09-06T17:24:50.722482", + "exception": false, + "start_time": "2023-09-06T17:24:47.409659", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\r\n", + "\u001b[0m" + ] + } + ], + "source": [ + "# Additional dependencies for eval stuff\n", + "!pip install -q aiocsv aiofiles" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c7c5f2ce", + "metadata": { + "execution": { + "iopub.execute_input": "2023-09-06T17:24:50.736379Z", + "iopub.status.busy": "2023-09-06T17:24:50.735150Z", + "iopub.status.idle": "2023-09-06T17:24:50.748734Z", + "shell.execute_reply": "2023-09-06T17:24:50.747177Z" + }, + "papermill": { + "duration": 0.023207, + "end_time": "2023-09-06T17:24:50.750984", + "exception": false, + "start_time": "2023-09-06T17:24:50.727777", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DEEPSPEED_STRAT: deepspeed_stage_1\n", + "ENABLE_WANDB: True\n", + "GPU_DEVICES: auto\n", + "DIR_NAME: L6-D2048-E1e-1-ctx4k\n", + "NOTEBOOK_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k\n", + "INFERENCE_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5\n", + "TRAINER_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5\n", + "PROJECT_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer\n" + ] + } + ], + "source": [ + "DEEPSPEED_STRAT=\"deepspeed_stage_1\"\n", + "GPU_DEVICES=\"auto\"\n", + "ENABLE_WANDB=True\n", + "\n", + "# Layer count and embed dim to start with\n", + "LAYER_COUNT=6\n", + "EMBED_DIM=2048\n", + "\n", + "EMBED_SCALE=0.1\n", + "EMBED_SCALE_LABEL=str(EMBED_SCALE).replace(\".\", \"_\")\n", + "\n", + "WANDB_PREFIX=f\"v5r3-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE}\"\n", + "FILENAME_PREFIX=f\"v5r3-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE_LABEL}\"\n", + "\n", + "print(\"DEEPSPEED_STRAT:\", DEEPSPEED_STRAT)\n", + "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n", + "print(\"GPU_DEVICES:\", GPU_DEVICES)\n", + "\n", + "if ENABLE_WANDB:\n", + " WANDB_MODE=\"online\"\n", + "else:\n", + " WANDB_MODE=\"disabled\"\n", + "\n", + "# Computing the notebook, and various paths\n", + "import os\n", + "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n", + "CONFIG_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"../\"))\n", + "PROJECT_DIR=os.path.abspath(os.path.join(CONFIG_DIR, \"../../../../\"))\n", + "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", + "INFERENCE_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", + "\n", + "# Get the notebook dir name\n", + "DIR_NAME=os.path.basename(NOTEBOOK_DIR)\n", + "\n", + "# Log names and dir\n", + "print(\"DIR_NAME:\", DIR_NAME)\n", + "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n", + "print(\"INFERENCE_DIR:\", INFERENCE_DIR)\n", + "print(\"TRAINER_DIR:\", TRAINER_DIR)\n", + "print(\"PROJECT_DIR:\", PROJECT_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "23272857", + "metadata": { + "execution": { + "iopub.execute_input": "2023-09-06T17:24:50.764946Z", + "iopub.status.busy": "2023-09-06T17:24:50.764040Z", + "iopub.status.idle": "2023-09-06T17:25:12.804705Z", + "shell.execute_reply": "2023-09-06T17:25:12.803903Z" + }, + "papermill": { + "duration": 22.05082, + "end_time": "2023-09-06T17:25:12.807245", + "exception": false, + "start_time": "2023-09-06T17:24:50.756425", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2023-09-06 17:24:50-- https://huggingface.co/rwkv-x-dev/rwkv-x-playground/resolve/main/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/v5r3-L6-D2048-E0_1-enwiki-4k.pth\r\n", + "Resolving huggingface.co (huggingface.co)... 18.154.227.67, 18.154.227.69, 18.154.227.87, ...\r\n", + "Connecting to huggingface.co (huggingface.co)|18.154.227.67|:443... connected.\r\n", + "HTTP request sent, awaiting response... 302 Found\r\n", + "Location: https://cdn-lfs.huggingface.co/repos/2e/f7/2ef78555202aa92abdbdf476ce3d0fd5a8b15f7245edf0b80d4d30572355f30d/7eb7abfda2e4cfb2a961ba4d52564f9b330830ba1a836966556e28753468ea1e?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27v5r3-L6-D2048-E0_1-enwiki-4k.pth%3B+filename%3D%22v5r3-L6-D2048-E0_1-enwiki-4k.pth%22%3B&Expires=1694280290&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTY5NDI4MDI5MH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy8yZS9mNy8yZWY3ODU1NTIwMmFhOTJhYmRiZGY0NzZjZTNkMGZkNWE4YjE1ZjcyNDVlZGYwYjgwZDRkMzA1NzIzNTVmMzBkLzdlYjdhYmZkYTJlNGNmYjJhOTYxYmE0ZDUyNTY0ZjliMzMwODMwYmExYTgzNjk2NjU1NmUyODc1MzQ2OGVhMWU%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=l0Mwep-i6GNV-J%7EepezD7A17T72n6mA%7ENVRke24jJ9%7E2CDFf-7C7BUXFmpr2PCyka%7EO123V-aSM9kVMGZj6QIErLtWvw%7ER6iQmC9OFwIRUHp3HyFg-ZkMVj-b97ycZB2mCm3DPehloQrbgQkQcZqzyKTY5kK34eUVuSFcD%7EyM8V7vCuFr5fzKzGw87ji5hdxrxJJ5JbLMqcbtq-dlHHgzDtDI5bFsES5DOVLV0Lk02gg2fU-KxeCXDMPU3MTSuaUky2kQQgy4r2%7ENv20mFp5lSIuedQ2-kCzA8A%7EY50E9EP5qpkWGRBOE7Q52xZVZfwZ6GgXmiz0hw1a1XW0W27C5A__&Key-Pair-Id=KVTP0A1DKRTAX [following]\r\n", + "--2023-09-06 17:24:50-- https://cdn-lfs.huggingface.co/repos/2e/f7/2ef78555202aa92abdbdf476ce3d0fd5a8b15f7245edf0b80d4d30572355f30d/7eb7abfda2e4cfb2a961ba4d52564f9b330830ba1a836966556e28753468ea1e?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27v5r3-L6-D2048-E0_1-enwiki-4k.pth%3B+filename%3D%22v5r3-L6-D2048-E0_1-enwiki-4k.pth%22%3B&Expires=1694280290&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTY5NDI4MDI5MH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy8yZS9mNy8yZWY3ODU1NTIwMmFhOTJhYmRiZGY0NzZjZTNkMGZkNWE4YjE1ZjcyNDVlZGYwYjgwZDRkMzA1NzIzNTVmMzBkLzdlYjdhYmZkYTJlNGNmYjJhOTYxYmE0ZDUyNTY0ZjliMzMwODMwYmExYTgzNjk2NjU1NmUyODc1MzQ2OGVhMWU%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=l0Mwep-i6GNV-J%7EepezD7A17T72n6mA%7ENVRke24jJ9%7E2CDFf-7C7BUXFmpr2PCyka%7EO123V-aSM9kVMGZj6QIErLtWvw%7ER6iQmC9OFwIRUHp3HyFg-ZkMVj-b97ycZB2mCm3DPehloQrbgQkQcZqzyKTY5kK34eUVuSFcD%7EyM8V7vCuFr5fzKzGw87ji5hdxrxJJ5JbLMqcbtq-dlHHgzDtDI5bFsES5DOVLV0Lk02gg2fU-KxeCXDMPU3MTSuaUky2kQQgy4r2%7ENv20mFp5lSIuedQ2-kCzA8A%7EY50E9EP5qpkWGRBOE7Q52xZVZfwZ6GgXmiz0hw1a1XW0W27C5A__&Key-Pair-Id=KVTP0A1DKRTAX\r\n", + "Resolving cdn-lfs.huggingface.co (cdn-lfs.huggingface.co)... 108.138.64.121, 108.138.64.49, 108.138.64.111, ...\r\n", + "Connecting to cdn-lfs.huggingface.co (cdn-lfs.huggingface.co)|108.138.64.121|:443... connected.\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "HTTP request sent, awaiting response... " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "200 OK\r\n", + "Length: 1066536937 (1017M) [binary/octet-stream]\r\n", + "Saving to: ‘v5r3-L6-D2048-E0_1-enwiki-4k.pth’\r\n", + "\r\n", + "\r", + " v5r3-L6-D 0%[ ] 0 --.-KB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D2 1%[ ] 14.74M 60.4MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D20 1%[ ] 15.39M 34.6MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D204 3%[ ] 30.52M 37.4MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D2048 4%[ ] 45.78M 38.8MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D2048- 6%[> ] 61.03M 41.4MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D2048-E 7%[> ] 76.29M 45.5MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D2048-E0 8%[> ] 91.20M 48.6MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D2048-E0_ 10%[=> ] 106.29M 49.2MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D2048-E0_1 11%[=> ] 113.73M 48.2MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "v5r3-L6-D2048-E0_1- 12%[=> ] 122.07M 46.1MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "5r3-L6-D2048-E0_1-e 13%[=> ] 139.89M 49.1MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "r3-L6-D2048-E0_1-en 15%[==> ] 152.59M 48.2MB/s eta 18s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "3-L6-D2048-E0_1-enw 16%[==> ] 167.33M 48.8MB/s eta 18s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "-L6-D2048-E0_1-enwi 16%[==> ] 172.31M 47.5MB/s eta 18s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "L6-D2048-E0_1-enwik 18%[==> ] 183.10M 47.5MB/s eta 18s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "6-D2048-E0_1-enwiki 19%[==> ] 198.36M 47.4MB/s eta 17s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "-D2048-E0_1-enwiki- 20%[===> ] 210.46M 48.9MB/s eta 17s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "D2048-E0_1-enwiki-4 21%[===> ] 215.68M 46.7MB/s eta 17s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "2048-E0_1-enwiki-4k 23%[===> ] 243.63M 51.0MB/s eta 17s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "048-E0_1-enwiki-4k. 24%[===> ] 244.26M 49.1MB/s eta 17s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "48-E0_1-enwiki-4k.p 25%[====> ] 259.40M 48.4MB/s eta 17s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "8-E0_1-enwiki-4k.pt 27%[====> ] 274.66M 50.2MB/s eta 17s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "-E0_1-enwiki-4k.pth 28%[====> ] 289.92M 50.1MB/s eta 17s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "E0_1-enwiki-4k.pth 30%[=====> ] 305.18M 49.8MB/s eta 17s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "0_1-enwiki-4k.pth 31%[=====> ] 322.50M 52.4MB/s eta 14s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "_1-enwiki-4k.pth 32%[=====> ] 335.18M 52.3MB/s eta 14s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "1-enwiki-4k.pth 34%[=====> ] 350.44M 47.2MB/s eta 14s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "-enwiki-4k.pth 35%[======> ] 364.38M 47.1MB/s eta 14s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "enwiki-4k.pth 37%[======> ] 380.15M 50.5MB/s eta 14s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "nwiki-4k.pth 38%[======> ] 392.91M 50.5MB/s eta 14s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "wiki-4k.pth 39%[======> ] 396.73M 47.0MB/s eta 14s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "iki-4k.pth 40%[=======> ] 411.47M 47.9MB/s eta 13s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "ki-4k.pth 41%[=======> ] 426.73M 50.0MB/s eta 13s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "i-4k.pth 43%[=======> ] 440.68M 46.6MB/s eta 13s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "-4k.pth 44%[=======> ] 455.93M 51.3MB/s eta 13s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "4k.pth 45%[========> ] 457.76M 47.6MB/s eta 13s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "k.pth 46%[========> ] 472.50M 46.5MB/s eta 11s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + ".pth 46%[========> ] 475.08M 45.9MB/s eta 11s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "pth 47%[========> ] 487.77M 44.0MB/s eta 11s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "th 49%[========> ] 503.54M 43.1MB/s eta 11s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "h 51%[=========> ] 518.80M 44.5MB/s eta 10s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " 52%[=========> ] 534.05M 47.1MB/s eta 10s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v 54%[=========> ] 549.31M 49.2MB/s eta 10s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5 55%[==========> ] 564.58M 51.1MB/s eta 10s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r 57%[==========> ] 579.83M 49.1MB/s eta 9s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3 58%[==========> ] 595.09M 50.9MB/s eta 9s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3- 59%[==========> ] 608.93M 52.9MB/s eta 9s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L 61%[===========> ] 623.04M 52.4MB/s eta 9s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6 61%[===========> ] 625.61M 51.2MB/s eta 9s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6- 62%[===========> ] 640.36M 49.9MB/s eta 8s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D 63%[===========> ] 640.87M 50.0MB/s eta 8s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D2 64%[===========> ] 655.62M 50.5MB/s eta 8s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D20 65%[============> ] 661.31M 51.0MB/s eta 8s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D204 66%[============> ] 671.38M 51.0MB/s eta 8s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D2048 67%[============> ] 686.64M 52.9MB/s eta 7s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D2048- 68%[============> ] 694.35M 51.9MB/s eta 7s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D2048-E 68%[============> ] 699.39M 49.5MB/s eta 7s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D2048-E0 69%[============> ] 703.65M 46.0MB/s eta 7s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D2048-E0_ 70%[=============> ] 716.64M 44.8MB/s eta 7s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D2048-E0_1 71%[=============> ] 730.72M 47.1MB/s eta 6s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "v5r3-L6-D2048-E0_1- 72%[=============> ] 732.42M 43.0MB/s eta 6s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "5r3-L6-D2048-E0_1-e 72%[=============> ] 736.61M 41.3MB/s eta 6s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "r3-L6-D2048-E0_1-en 73%[=============> ] 747.17M 40.4MB/s eta 6s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "3-L6-D2048-E0_1-enw 75%[==============> ] 762.94M 41.6MB/s eta 6s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "-L6-D2048-E0_1-enwi 76%[==============> ] 774.56M 40.6MB/s eta 5s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "L6-D2048-E0_1-enwik 77%[==============> ] 789.24M 44.0MB/s eta 5s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "6-D2048-E0_1-enwiki 78%[==============> ] 793.46M 41.0MB/s eta 5s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "-D2048-E0_1-enwiki- 79%[==============> ] 808.71M 44.1MB/s eta 5s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "D2048-E0_1-enwiki-4 80%[===============> ] 823.46M 43.7MB/s eta 5s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "2048-E0_1-enwiki-4k 82%[===============> ] 838.71M 46.8MB/s eta 4s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "048-E0_1-enwiki-4k. 83%[===============> ] 849.35M 45.0MB/s eta 4s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "48-E0_1-enwiki-4k.p 84%[===============> ] 854.49M 42.2MB/s eta 4s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "8-E0_1-enwiki-4k.pt 85%[================> ] 869.75M 43.5MB/s eta 4s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "-E0_1-enwiki-4k.pth 86%[================> ] 884.49M 45.5MB/s eta 4s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "E0_1-enwiki-4k.pth 88%[================> ] 896.90M 47.7MB/s eta 3s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "0_1-enwiki-4k.pth 88%[================> ] 900.27M 46.5MB/s eta 3s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "_1-enwiki-4k.pth 89%[================> ] 915.01M 48.9MB/s eta 3s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "1-enwiki-4k.pth 90%[=================> ] 918.15M 44.7MB/s eta 3s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "-enwiki-4k.pth 91%[=================> ] 930.27M 47.6MB/s eta 2s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "enwiki-4k.pth 92%[=================> ] 945.53M 50.1MB/s eta 2s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "nwiki-4k.pth 94%[=================> ] 960.79M 48.5MB/s eta 2s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "wiki-4k.pth 96%[==================> ] 976.55M 49.3MB/s eta 2s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "iki-4k.pth 97%[==================> ] 991.82M 50.0MB/s eta 1s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "ki-4k.pth 99%[==================> ] 1007M 49.7MB/s eta 1s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "i-4k.pth 99%[==================> ] 1016M 48.8MB/s eta 1s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "v5r3-L6-D2048-E0_1- 100%[===================>] 1017M 48.7MB/s in 21s \r\n", + "\r\n", + "2023-09-06 17:25:12 (47.8 MB/s) - ‘v5r3-L6-D2048-E0_1-enwiki-4k.pth’ saved [1066536937/1066536937]\r\n", + "\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 1018M\r\n", + "drwxr-xr-x 2 root root 4.0K Sep 6 17:24 .\r\n", + "drwxr-xr-x 20 root root 4.0K Sep 6 17:24 ..\r\n", + "-rw-r--r-- 1 root root 1018M Sep 6 17:07 v5r3-L6-D2048-E0_1-enwiki-4k.pth\r\n" + ] + } + ], + "source": [ + "# Download the model directly (stop gap till HF sync issues is resolved)\n", + "!cd \"{TRAINER_DIR}\" && cd \"../model/\" && \\\n", + " wget -nc \"https://huggingface.co/rwkv-x-dev/rwkv-x-playground/resolve/main/experiment/rwkv-x-exp/v5-r3-memory/{DIR_NAME}/{FILENAME_PREFIX}-enwiki-4k.pth\"\n", + "\n", + "!cd \"{TRAINER_DIR}\" && cd \"../model/\" && \\\n", + " ls -alh ." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "a7640f2b", + "metadata": { + "papermill": { + "duration": 0.012199, + "end_time": "2023-09-06T17:25:12.831966", + "exception": false, + "start_time": "2023-09-06T17:25:12.819767", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Enwiki Stage 2 : Basic Instruct Tuning" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "dbfa1a63", + "metadata": { + "execution": { + "iopub.execute_input": "2023-09-06T17:25:12.852580Z", + "iopub.status.busy": "2023-09-06T17:25:12.851982Z", + "iopub.status.idle": "2023-09-06T17:25:20.359135Z", + "shell.execute_reply": "2023-09-06T17:25:20.357791Z" + }, + "papermill": { + "duration": 7.52148, + "end_time": "2023-09-06T17:25:20.362711", + "exception": false, + "start_time": "2023-09-06T17:25:12.841231", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "Saving the dataset (0/1 shards): 0%| | 0/14932 [00:00=12.1), as this is known to have freeze issues\r\n", + "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\r\n", + "# - When resuming from checkpoint, the estimated time is inaccurate\r\n", + "#\r\n", + "\r\n", + "[RWKV.model] Configuring optimizer with\r\n", + " - lr_init: 4.000e-04 (0.0004)\r\n", + " - lr_final: 3.000e-04 (0.0003)\r\n", + "\r\n", + "Using /root/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Detected CUDA files, patching ldflags\r\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu118/fused_adam/build.ninja...\r\n", + "Building extension module fused_adam...\r\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ninja: no work to do.\r\n", + "Loading extension module fused_adam...\r\n", + "Time to load fused_adam op: 0.08642840385437012 seconds\r\n", + "Loading `train_dataloader` to estimate number of stepping batches.\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rank: 0 partition count [1, 1] and sizes[(533245952, False), (384, False)] \r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r\n", + " | Name | Type | Params\r\n", + "--------------------------------------\r\n", + "0 | emb | Embedding | 102 M \r\n", + "1 | blocks | ModuleList | 327 M \r\n", + "2 | ln_out | LayerNorm | 4.1 K \r\n", + "3 | head | Linear | 102 M \r\n", + "--------------------------------------\r\n", + "533 M Trainable params\r\n", + "0 Non-trainable params\r\n", + "533 M Total params\r\n", + "2,132.985 Total estimated model params size (MB)\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "Training: 0it [00:00, ?it/s]\r", + "Training: 0%| | 0/14932 [00:00\r\n", + " cli_main()\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 233, in cli_main\r\n", + " LightningCLI(\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 353, in __init__\r\n", + " self._run_subcommand(self.subcommand)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 642, in _run_subcommand\r\n", + " fn(**fn_kwargs)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py\", line 529, in fit\r\n", + " call._call_and_handle_interrupt(\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/call.py\", line 41, in _call_and_handle_interrupt\r\n", + " return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/strategies/launchers/subprocess_script.py\", line 91, in launch\r\n", + " return function(*args, **kwargs)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py\", line 568, in _fit_impl\r\n", + " self._run(model, ckpt_path=ckpt_path)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py\", line 925, in _run\r\n", + " self._data_connector.prepare_data()\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/connectors/data_connector.py\", line 94, in prepare_data\r\n", + " call._call_lightning_datamodule_hook(trainer, \"prepare_data\")\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/call.py\", line 164, in _call_lightning_datamodule_hook\r\n", + " return fn(*args, **kwargs)\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/data.py\", line 542, in prepare_data\r\n", + " prepare_data_static(**self._init_locals)\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/data.py\", line 101, in prepare_data_static\r\n", + " src_dataset = load_dataset(**load_dataset_params)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/datasets/load.py\", line 2112, in load_dataset\r\n", + " builder_instance = load_dataset_builder(\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/datasets/load.py\", line 1798, in load_dataset_builder\r\n", + " dataset_module = dataset_module_factory(\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/datasets/load.py\", line 1413, in dataset_module_factory\r\n", + " ).get_module()\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/datasets/load.py\", line 948, in get_module\r\n", + " patterns = sanitize_patterns(self.data_files) if self.data_files is not None else get_data_patterns(base_path)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/datasets/data_files.py\", line 459, in get_data_patterns\r\n", + " raise EmptyDatasetError(f\"The directory at {base_path} doesn't contain any data files\") from None\r\n", + "datasets.data_files.EmptyDatasetError: The directory at /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/dataset doesn't contain any data files\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Waiting for W&B process to finish... \u001b[31m(failed 1).\u001b[0m Press Control-C to abort syncing.\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: - 0.005 MB of 0.005 MB uploaded (0.000 MB deduped)\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: \\ 0.005 MB of 0.005 MB uploaded (0.000 MB deduped)\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: | 0.005 MB of 0.005 MB uploaded (0.000 MB deduped)\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: / 0.005 MB of 0.005 MB uploaded (0.000 MB deduped)\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run \u001b[33mv5r3-L6-D2048-E0.1 - Mem-Instruct (train-ctx=512, deepspeed_stage_1)\u001b[0m at: \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-X-Experiments/runs/xuck99wm\u001b[0m\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: ️⚡ View job at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-X-Experiments/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjkzMjg5ODA3/version_details/v27\u001b[0m\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Synced 5 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Find logs at: \u001b[35m\u001b[1m./wandb/run-20230906_175222-xuck99wm/logs\u001b[0m\r\n" + ] + } + ], + "source": [ + "# Start the finetune model training\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", + " python3 lightning_trainer.py fit \\\n", + " -c \"{CONFIG_DIR}/config-mem-instruct.yaml\" \\\n", + " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Mem-Instruct (train-ctx=512, {DEEPSPEED_STRAT})\" \\\n", + " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", + " --trainer.devices=\"{GPU_DEVICES}\" \\\n", + " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-mem-instruct/\" \\\n", + " --model.load_model=\"../model/{FILENAME_PREFIX}-enwiki-instruct.pth\" \\\n", + " --model.ctx_len=512 \\\n", + " --model.bptt_learning_range=1" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "c93850c1", + "metadata": { + "execution": { + "iopub.execute_input": "2023-09-06T17:52:39.516649Z", + "iopub.status.busy": "2023-09-06T17:52:39.515772Z", + "iopub.status.idle": "2023-09-06T17:52:43.445323Z", + "shell.execute_reply": "2023-09-06T17:52:43.444433Z" + }, + "papermill": { + "duration": 4.911758, + "end_time": "2023-09-06T17:52:43.448281", + "exception": false, + "start_time": "2023-09-06T17:52:38.536523", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-09-06 17:52:41,942] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Traceback (most recent call last):\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 651, in \r\n", + " convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, output_file, save_dtype=args.dtype)\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 542, in convert_zero_checkpoint_to_fp32_state_dict\r\n", + " state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 516, in get_fp32_state_dict_from_zero_checkpoint\r\n", + " raise ValueError(f\"Unable to find 'latest' file at {latest_path}\")\r\n", + "ValueError: Unable to find 'latest' file at ../checkpoint/v5r3-L6-D2048-E0_1-mem-instruct/last.ckpt/latest\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ls: cannot access '../model/v5r3-L6-D2048-E0_1-mem-instruct.pth': No such file or directory\r\n" + ] + } + ], + "source": [ + "# Lets export the model from the checkpoint\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python3 export_checkpoint.py \\\n", + " \"../checkpoint/{FILENAME_PREFIX}-mem-instruct/last.ckpt\" \\\n", + " \"../model/{FILENAME_PREFIX}-mem-instruct.pth\" \"bf16\"\n", + "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-mem-instruct.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "9cbbdc1c", + "metadata": { + "execution": { + "iopub.execute_input": "2023-09-06T17:52:45.335301Z", + "iopub.status.busy": "2023-09-06T17:52:45.334570Z", + "iopub.status.idle": "2023-09-06T17:52:45.599928Z", + "shell.execute_reply": "2023-09-06T17:52:45.598870Z" + }, + "papermill": { + "duration": 1.153922, + "end_time": "2023-09-06T17:52:45.602059", + "exception": false, + "start_time": "2023-09-06T17:52:44.448137", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/../memory_script/eval_v5_memory_guided.py': [Errno 2] No such file or directory\r\n" + ] + } + ], + "source": [ + "# Lets do a quick memory test\n", + "!python3 ../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-mem-instruct.pth\"" + ] + }, + { + "cell_type": "markdown", + "id": "9759931b", + "metadata": { + "papermill": { + "duration": 0.980039, + "end_time": "2023-09-06T17:52:47.551588", + "exception": false, + "start_time": "2023-09-06T17:52:46.571549", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Tune 2 : Low ctx size (512), memory training\n", + "\n", + "- Tune 2: Low ctx size (512), Training with instruction & input masked. This forces the actual memory training on the output tokens." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "e4b68f37", + "metadata": { + "execution": { + "iopub.execute_input": "2023-09-06T17:52:49.396209Z", + "iopub.status.busy": "2023-09-06T17:52:49.395658Z", + "iopub.status.idle": "2023-09-06T17:52:49.450143Z", + "shell.execute_reply": "2023-09-06T17:52:49.449447Z" + }, + "papermill": { + "duration": 1.025106, + "end_time": "2023-09-06T17:52:49.452088", + "exception": false, + "start_time": "2023-09-06T17:52:48.426982", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "## Generating word reptition dataset ##\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/../memory_script/shuffle_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/../memory_script/shuffle_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/../memory_script/shuffle_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/../memory_script/shuffle_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/../memory_script/shuffle_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/../memory_script/shuffle_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/../memory_script/shuffle_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "## Done ##\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 8.0K\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "drwxr-xr-x 2 root root 4.0K Sep 6 17:52 .\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "drwxr-xr-x 6 root root 4.0K Sep 6 17:52 ..\n" + ] + } + ], + "source": [ + "%%script bash\n", + "\n", + "########################################\n", + "# Generate the required jsonl dataset\n", + "########################################\n", + "\n", + "# Reset the dataset dir\n", + "mkdir -p ../dataset\n", + "rm -rf ../dataset/*.jsonl\n", + "\n", + "# Generate the various datasets\n", + "echo \"## Generating word reptition dataset ##\"\n", + "\n", + "#\n", + "# We switch over to fully masked instruct+input, to properly learn the memorization task\n", + "#\n", + "python3 ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/word-2-count.jsonl 2 5000 &\n", + "for i in {5..95..5} \n", + "do\n", + " python3 ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/gen-word-$i-count.jsonl $i 5000 & \n", + "done\n", + "python3 ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/word-100-count.jsonl 100 5000 &\n", + "python3 ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/word-200-count.jsonl 200 5000 &\n", + "\n", + "#\n", + "# We mixin the shuffled word list, so that we ensure all words / tokens are learned\n", + "# however this might intrduce an exclusion bias (if seen this word, never repeat it), \n", + "# so we limit the mixture of this data samples\n", + "#\n", + "python3 ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-10-count.jsonl 10 20 &\n", + "python3 ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-15-count.jsonl 15 20 &\n", + "python3 ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-25-count.jsonl 25 30 &\n", + "python3 ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-50-count.jsonl 50 50 &\n", + "python3 ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-75-count.jsonl 75 50 &\n", + "python3 ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-100-count.jsonl 100 50 &\n", + "python3 ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-200-count.jsonl 200 50 &\n", + "\n", + "wait\n", + "echo \"## Done ##\"\n", + "\n", + "ls -alh ../dataset/" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "dfe7c26c", + "metadata": { + "execution": { + "iopub.execute_input": "2023-09-06T17:52:51.295342Z", + "iopub.status.busy": "2023-09-06T17:52:51.294782Z", + "iopub.status.idle": "2023-09-06T17:53:11.646941Z", + "shell.execute_reply": "2023-09-06T17:53:11.645847Z" + }, + "papermill": { + "duration": 21.229481, + "end_time": "2023-09-06T17:53:11.649754", + "exception": false, + "start_time": "2023-09-06T17:52:50.420273", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-09-06 17:52:56,145] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py:484: UserWarning: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/config-mem-template.yaml', '--trainer.logger.init_args.name=v5r3-L6-D2048-E0.1 - Mem-Tune ctx-512 (train-ctx=512, deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.devices=auto', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5r3-L6-D2048-E0_1-mem-ctx-512/', '--model.lr_init=5e-4', '--model.lr_final=4e-4', '--data.max_token_size=512', '--model.ctx_len=512', '--model.bptt_learning_range=1', '--model.load_model=../model/v5r3-L6-D2048-E0_1-mem-instruct.pth'], args=['fit', '-c', '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/config-mem-template.yaml', '--trainer.logger.init_args.name=v5r3-L6-D2048-E0.1 - Mem-Tune ctx-512 (train-ctx=512, deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.devices=auto', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5r3-L6-D2048-E0_1-mem-ctx-512/', '--model.lr_init=5e-4', '--model.lr_final=4e-4', '--data.max_token_size=512', '--model.ctx_len=512', '--model.bptt_learning_range=1', '--model.load_model=../model/v5r3-L6-D2048-E0_1-mem-instruct.pth'].\r\n", + " rank_zero_warn(\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/lightning/fabric/utilities/seed.py:39: UserWarning: No seed found, seed set to 1381932438\r\n", + " rank_zero_warn(f\"No seed found, seed set to {seed}\")\r\n", + "Global seed set to 1381932438\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mpicocreator\u001b[0m (\u001b[33mrwkv-x-dev\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: - Waiting for wandb.init()...\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: \\ Waiting for wandb.init()...\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.15.9\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m./wandb/run-20230906_175300-6yfdmqhq\u001b[0m\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33mv5r3-L6-D2048-E0.1 - Mem-Tune ctx-512 (train-ctx=512, deepspeed_stage_1)\u001b[0m\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: ⭐️ View project at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-X-Experiments\u001b[0m\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-X-Experiments/runs/6yfdmqhq\u001b[0m\r\n", + "Traceback (most recent call last):\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 254, in \r\n", + " cli_main()\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 233, in cli_main\r\n", + " LightningCLI(\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 350, in __init__\r\n", + " self.instantiate_classes()\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 499, in instantiate_classes\r\n", + " self.config_init = self.parser.instantiate_classes(self.config)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_deprecated.py\", line 139, in patched_instantiate_classes\r\n", + " cfg = self._unpatched_instantiate_classes(cfg, **kwargs)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_core.py\", line 1130, in instantiate_classes\r\n", + " cfg[subcommand] = subparser.instantiate_classes(cfg[subcommand], instantiate_groups=instantiate_groups)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_deprecated.py\", line 139, in patched_instantiate_classes\r\n", + " cfg = self._unpatched_instantiate_classes(cfg, **kwargs)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_core.py\", line 1124, in instantiate_classes\r\n", + " component.instantiate_class(component, cfg)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_signatures.py\", line 561, in group_instantiate_class\r\n", + " parent[key] = group.group_class(**value)\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 559, in __init__\r\n", + " raise ValueError(f\"load_model file '{load_model}' does not exist\")\r\n", + "ValueError: load_model file '../model/v5r3-L6-D2048-E0_1-mem-instruct.pth' does not exist\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Waiting for W&B process to finish... \u001b[31m(failed 1).\u001b[0m Press Control-C to abort syncing.\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run \u001b[33mv5r3-L6-D2048-E0.1 - Mem-Tune ctx-512 (train-ctx=512, deepspeed_stage_1)\u001b[0m at: \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-X-Experiments/runs/6yfdmqhq\u001b[0m\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: ️⚡ View job at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-X-Experiments/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjkzMjg5ODA3/version_details/v28\u001b[0m\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Synced 5 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Find logs at: \u001b[35m\u001b[1m./wandb/run-20230906_175300-6yfdmqhq/logs\u001b[0m\r\n" + ] + } + ], + "source": [ + "# Start the finetune model training\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", + " python3 lightning_trainer.py fit \\\n", + " -c \"{CONFIG_DIR}/config-mem-template.yaml\" \\\n", + " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Mem-Tune ctx-512 (train-ctx=512, {DEEPSPEED_STRAT})\" \\\n", + " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", + " --trainer.devices=\"{GPU_DEVICES}\" \\\n", + " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-mem-ctx-512/\" \\\n", + " --model.lr_init=5e-4 \\\n", + " --model.lr_final=4e-4 \\\n", + " --data.max_token_size=512 \\\n", + " --model.ctx_len=512 \\\n", + " --model.bptt_learning_range=1 \\\n", + " --model.load_model=\"../model/{FILENAME_PREFIX}-mem-instruct.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "a64f121e", + "metadata": { + "execution": { + "iopub.execute_input": "2023-09-06T17:53:13.678266Z", + "iopub.status.busy": "2023-09-06T17:53:13.677894Z", + "iopub.status.idle": "2023-09-06T17:53:17.614786Z", + "shell.execute_reply": "2023-09-06T17:53:17.613856Z" + }, + "papermill": { + "duration": 4.956429, + "end_time": "2023-09-06T17:53:17.617091", + "exception": false, + "start_time": "2023-09-06T17:53:12.660662", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-09-06 17:53:16,116] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Traceback (most recent call last):\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 651, in \r\n", + " convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, output_file, save_dtype=args.dtype)\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 542, in convert_zero_checkpoint_to_fp32_state_dict\r\n", + " state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 516, in get_fp32_state_dict_from_zero_checkpoint\r\n", + " raise ValueError(f\"Unable to find 'latest' file at {latest_path}\")\r\n", + "ValueError: Unable to find 'latest' file at ../checkpoint/v5r3-L6-D2048-E0_1-mem-ctx-512/last.ckpt/latest\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ls: cannot access '../model/v5r3-L6-D2048-E0_1-mem-ctx-512.pth': No such file or directory\r\n" + ] + } + ], + "source": [ + "# Lets export the model from the checkpoint\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python3 export_checkpoint.py \\\n", + " \"../checkpoint/{FILENAME_PREFIX}-mem-ctx-512/last.ckpt\" \\\n", + " \"../model/{FILENAME_PREFIX}-mem-ctx-512.pth\" \"bf16\"\n", + "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-mem-ctx-512.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "7b3383b0", + "metadata": { + "execution": { + "iopub.execute_input": "2023-09-06T17:53:19.466424Z", + "iopub.status.busy": "2023-09-06T17:53:19.465359Z", + "iopub.status.idle": "2023-09-06T17:53:19.731128Z", + "shell.execute_reply": "2023-09-06T17:53:19.730200Z" + }, + "papermill": { + "duration": 1.231112, + "end_time": "2023-09-06T17:53:19.733404", + "exception": false, + "start_time": "2023-09-06T17:53:18.502292", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/../memory_script/eval_v5_memory_guided.py': [Errno 2] No such file or directory\r\n" + ] + } + ], + "source": [ + "# Lets do a quick memory test\n", + "!python3 ../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-mem-ctx-512.pth\"" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "papermill": { + "default_parameters": {}, + "duration": 1715.908096, + "end_time": "2023-09-06T17:53:21.033393", + "environment_variables": {}, + "exception": null, + "input_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/part2.ipynb", + "output_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/part2.ipynb", + "parameters": {}, + "start_time": "2023-09-06T17:24:45.125297", + "version": "2.4.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file