diff --git "a/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/part2.ipynb" "b/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/part2.ipynb" --- "a/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/part2.ipynb" +++ "b/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/part2.ipynb" @@ -1,3 +1,7092 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:12f10826a59669d9dcf90268a0377596a642007f5642c452e87f76612a62939d -size 52757309 +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "6303a399", + "metadata": { + "papermill": { + "duration": 0.005464, + "end_time": "2023-09-06T17:17:07.017280", + "exception": false, + "start_time": "2023-09-06T17:17:07.011816", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# RWKV v5\n", + "\n", + "Simple memory training for a small model\n", + "\n", + "**Note:** This project assumes you have the rwkv-infctx conda env setup" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "f39c274d", + "metadata": { + "papermill": { + "duration": 0.004562, + "end_time": "2023-09-06T17:17:07.026887", + "exception": false, + "start_time": "2023-09-06T17:17:07.022325", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Basic Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "339857f4", + "metadata": { + "execution": { + "iopub.execute_input": "2023-09-06T17:17:07.038414Z", + "iopub.status.busy": "2023-09-06T17:17:07.037888Z", + "iopub.status.idle": "2023-09-06T17:17:08.034427Z", + "shell.execute_reply": "2023-09-06T17:17:08.033578Z" + }, + "papermill": { + "duration": 1.005021, + "end_time": "2023-09-06T17:17:08.036897", + "exception": false, + "start_time": "2023-09-06T17:17:07.031876", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CITATION.cff RWKV-v4wavenet\t RWKV-v5headsize32 checkpoint\tnotebook\r\n", + "LICENSE RWKV-v5\t\t RWKV-v5r2\t 
datapath\toutput\r\n", + "README.md RWKV-v5altwavenet RWKV-v5rstack\t docker\r\n", + "RWKV-v4neo RWKV-v5headsize2x RWKV-v5wavenet model\r\n" + ] + } + ], + "source": [ + "# First, let's set up the various directories, and init the model\n", + "!ls ../../../../../\n", + "!mkdir -p ../../../../../model/\n", + "!mkdir -p ../../../../../datapath/\n", + "!mkdir -p ../../../../../checkpoint/" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "7b45aafc", + "metadata": { + "execution": { + "iopub.execute_input": "2023-09-06T17:17:08.049253Z", + "iopub.status.busy": "2023-09-06T17:17:08.048613Z", + "iopub.status.idle": "2023-09-06T17:17:11.256715Z", + "shell.execute_reply": "2023-09-06T17:17:11.255796Z" + }, + "papermill": { + "duration": 3.217287, + "end_time": "2023-09-06T17:17:11.259293", + "exception": false, + "start_time": "2023-09-06T17:17:08.042006", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\r\n", + "\u001b[0m" + ] + } + ], + "source": [ + "# Additional dependencies for eval stuff\n", + "!pip install -q aiocsv aiofiles" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "5a6fa0ec", + "metadata": { + "execution": { + "iopub.execute_input": "2023-09-06T17:17:11.271967Z", + "iopub.status.busy": "2023-09-06T17:17:11.271338Z", + "iopub.status.idle": "2023-09-06T17:17:11.279590Z", + "shell.execute_reply": "2023-09-06T17:17:11.278754Z" + }, + "papermill": { + "duration": 0.016902, + "end_time": "2023-09-06T17:17:11.281556", + "exception": false, + "start_time": "2023-09-06T17:17:11.264654", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DEEPSPEED_STRAT: deepspeed_stage_1\n", + "ENABLE_WANDB: True\n", + "GPU_DEVICES: auto\n", + "DIR_NAME: L6-D2048-E1e-1-ctx4k\n", + "NOTEBOOK_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k\n", + "INFERENCE_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5\n", + "TRAINER_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5\n", + "PROJECT_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer\n" + ] + } + ], + "source": [ + "DEEPSPEED_STRAT=\"deepspeed_stage_1\"\n", + "GPU_DEVICES=\"auto\"\n", + "ENABLE_WANDB=True\n", + "\n", + "# Layer count and embed dim to start with\n", + "LAYER_COUNT=6\n", + "EMBED_DIM=2048\n", + "\n", + "EMBED_SCALE=0.1\n", + "EMBED_SCALE_LABEL=str(EMBED_SCALE).replace(\".\", \"_\")\n", + "\n", + "WANDB_PREFIX=f\"v5r3-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE}\"\n", + "FILENAME_PREFIX=f\"v5r3-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE_LABEL}\"\n", + "\n", + "print(\"DEEPSPEED_STRAT:\", DEEPSPEED_STRAT)\n", + "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n", + 
"print(\"GPU_DEVICES:\", GPU_DEVICES)\n", + "\n", + "if ENABLE_WANDB:\n", + " WANDB_MODE=\"online\"\n", + "else:\n", + " WANDB_MODE=\"disabled\"\n", + "\n", + "# Computing the notebook, and various paths\n", + "import os\n", + "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n", + "CONFIG_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"../\"))\n", + "PROJECT_DIR=os.path.abspath(os.path.join(CONFIG_DIR, \"../../../../\"))\n", + "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", + "INFERENCE_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", + "\n", + "# Get the notebook dir name\n", + "DIR_NAME=os.path.basename(NOTEBOOK_DIR)\n", + "\n", + "# Log names and dir\n", + "print(\"DIR_NAME:\", DIR_NAME)\n", + "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n", + "print(\"INFERENCE_DIR:\", INFERENCE_DIR)\n", + "print(\"TRAINER_DIR:\", TRAINER_DIR)\n", + "print(\"PROJECT_DIR:\", PROJECT_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "5715cb46", + "metadata": { + "execution": { + "iopub.execute_input": "2023-09-06T17:17:11.294198Z", + "iopub.status.busy": "2023-09-06T17:17:11.293662Z", + "iopub.status.idle": "2023-09-06T17:17:31.891567Z", + "shell.execute_reply": "2023-09-06T17:17:31.890659Z" + }, + "papermill": { + "duration": 20.606807, + "end_time": "2023-09-06T17:17:31.893643", + "exception": false, + "start_time": "2023-09-06T17:17:11.286836", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2023-09-06 17:17:11-- https://huggingface.co/rwkv-x-dev/rwkv-x-playground/resolve/main/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/v5r3-L6-D2048-E0_1-enwiki-4k.pth\r\n", + "Resolving huggingface.co (huggingface.co)... 18.154.227.87, 18.154.227.67, 18.154.227.69, ...\r\n", + "Connecting to huggingface.co (huggingface.co)|18.154.227.87|:443... connected.\r\n", + "HTTP request sent, awaiting response... 
302 Found\r\n", + "Location: https://cdn-lfs.huggingface.co/repos/2e/f7/2ef78555202aa92abdbdf476ce3d0fd5a8b15f7245edf0b80d4d30572355f30d/7eb7abfda2e4cfb2a961ba4d52564f9b330830ba1a836966556e28753468ea1e?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27v5r3-L6-D2048-E0_1-enwiki-4k.pth%3B+filename%3D%22v5r3-L6-D2048-E0_1-enwiki-4k.pth%22%3B&Expires=1694279831&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTY5NDI3OTgzMX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy8yZS9mNy8yZWY3ODU1NTIwMmFhOTJhYmRiZGY0NzZjZTNkMGZkNWE4YjE1ZjcyNDVlZGYwYjgwZDRkMzA1NzIzNTVmMzBkLzdlYjdhYmZkYTJlNGNmYjJhOTYxYmE0ZDUyNTY0ZjliMzMwODMwYmExYTgzNjk2NjU1NmUyODc1MzQ2OGVhMWU%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=iCSVplgSsG5AjmmXo591m6nLTcfvPpRlnGWiePyk13mX8KhpZ9pdk-CgfNOOMeUtGGTe7Dax%7EqOKvdtmYwBAGLh94g2O8F3Enb9ju0hOqXZa8Z0LoWk7-vfWY-DupCpdaUAS-vgVDxUcUL-VULBW2JnqqXO0Vzm2TaBAOxRSY6u86wfuSrMzfuxu7vPtGKnxU2tO8CFkltBdkumTlWJ8KSHLx8A0BfIn23aR2Wzq77%7EmXJ7dJvBt19%7EiS5p6m-ITw0yXoKhohGLmiubk4%7EmK543ibTfPBbRtxjkxzepwDwuOCLomwXnaGvmVfBjnrHa1ETJTjkro9qITPESxU6TOxg__&Key-Pair-Id=KVTP0A1DKRTAX [following]\r\n", + "--2023-09-06 17:17:11-- 
https://cdn-lfs.huggingface.co/repos/2e/f7/2ef78555202aa92abdbdf476ce3d0fd5a8b15f7245edf0b80d4d30572355f30d/7eb7abfda2e4cfb2a961ba4d52564f9b330830ba1a836966556e28753468ea1e?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27v5r3-L6-D2048-E0_1-enwiki-4k.pth%3B+filename%3D%22v5r3-L6-D2048-E0_1-enwiki-4k.pth%22%3B&Expires=1694279831&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTY5NDI3OTgzMX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy8yZS9mNy8yZWY3ODU1NTIwMmFhOTJhYmRiZGY0NzZjZTNkMGZkNWE4YjE1ZjcyNDVlZGYwYjgwZDRkMzA1NzIzNTVmMzBkLzdlYjdhYmZkYTJlNGNmYjJhOTYxYmE0ZDUyNTY0ZjliMzMwODMwYmExYTgzNjk2NjU1NmUyODc1MzQ2OGVhMWU%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=iCSVplgSsG5AjmmXo591m6nLTcfvPpRlnGWiePyk13mX8KhpZ9pdk-CgfNOOMeUtGGTe7Dax%7EqOKvdtmYwBAGLh94g2O8F3Enb9ju0hOqXZa8Z0LoWk7-vfWY-DupCpdaUAS-vgVDxUcUL-VULBW2JnqqXO0Vzm2TaBAOxRSY6u86wfuSrMzfuxu7vPtGKnxU2tO8CFkltBdkumTlWJ8KSHLx8A0BfIn23aR2Wzq77%7EmXJ7dJvBt19%7EiS5p6m-ITw0yXoKhohGLmiubk4%7EmK543ibTfPBbRtxjkxzepwDwuOCLomwXnaGvmVfBjnrHa1ETJTjkro9qITPESxU6TOxg__&Key-Pair-Id=KVTP0A1DKRTAX\r\n", + "Resolving cdn-lfs.huggingface.co (cdn-lfs.huggingface.co)... 108.138.64.121, 108.138.64.49, 108.138.64.36, ...\r\n", + "Connecting to cdn-lfs.huggingface.co (cdn-lfs.huggingface.co)|108.138.64.121|:443... connected.\r\n", + "HTTP request sent, awaiting response... 
" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "200 OK\r\n", + "Length: 1066536937 (1017M) [binary/octet-stream]\r\n", + "Saving to: ‘v5r3-L6-D2048-E0_1-enwiki-4k.pth’\r\n", + "\r\n", + "\r", + " v5r3-L6-D 0%[ ] 0 --.-KB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D2 1%[ ] 15.26M 53.2MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D20 3%[ ] 30.52M 46.6MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D204 4%[ ] 45.78M 48.4MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D2048 5%[> ] 60.52M 52.7MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D2048- 7%[> ] 74.46M 46.7MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D2048-E 8%[> ] 91.03M 50.4MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D2048-E0 10%[=> ] 101.80M 47.0MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D2048-E0_ 10%[=> ] 106.81M 44.8MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D2048-E0_1 11%[=> ] 121.56M 46.7MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "v5r3-L6-D2048-E0_1- 13%[=> ] 134.53M 48.0MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "5r3-L6-D2048-E0_1-e 14%[=> ] 151.99M 50.6MB/s eta 17s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "r3-L6-D2048-E0_1-en 15%[==> ] 155.00M 48.4MB/s eta 17s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "3-L6-D2048-E0_1-enw 16%[==> ] 167.85M 48.7MB/s eta 17s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"\r", + "-L6-D2048-E0_1-enwi 18%[==> ] 189.61M 52.0MB/s eta 17s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "L6-D2048-E0_1-enwik 19%[==> ] 202.61M 52.7MB/s eta 17s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "6-D2048-E0_1-enwiki 21%[===> ] 213.62M 48.7MB/s eta 16s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "-D2048-E0_1-enwiki- 22%[===> ] 226.41M 50.0MB/s eta 16s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "D2048-E0_1-enwiki-4 23%[===> ] 243.63M 50.1MB/s eta 16s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "2048-E0_1-enwiki-4k 24%[===> ] 250.70M 50.7MB/s eta 16s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "048-E0_1-enwiki-4k. 25%[====> ] 259.40M 49.3MB/s eta 16s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "48-E0_1-enwiki-4k.p 27%[====> ] 274.66M 52.3MB/s eta 15s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "8-E0_1-enwiki-4k.pt 27%[====> ] 281.34M 50.0MB/s eta 15s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "-E0_1-enwiki-4k.pth 28%[====> ] 289.92M 50.7MB/s eta 15s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "E0_1-enwiki-4k.pth 30%[=====> ] 305.18M 50.2MB/s eta 15s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "0_1-enwiki-4k.pth 31%[=====> ] 320.43M 51.6MB/s eta 15s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "_1-enwiki-4k.pth 32%[=====> ] 325.55M 47.9MB/s eta 14s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "1-enwiki-4k.pth 33%[=====> ] 335.69M 45.4MB/s eta 14s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "-enwiki-4k.pth 
34%[=====> ] 350.95M 46.9MB/s eta 14s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "enwiki-4k.pth 37%[======> ] 377.49M 49.4MB/s eta 14s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "nwiki-4k.pth 38%[======> ] 396.22M 50.4MB/s eta 12s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "wiki-4k.pth 40%[=======> ] 411.47M 51.8MB/s eta 12s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "iki-4k.pth 41%[=======> ] 423.97M 52.9MB/s eta 12s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "ki-4k.pth 42%[=======> ] 428.21M 50.3MB/s eta 12s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "i-4k.pth 43%[=======> ] 440.68M 50.5MB/s eta 12s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "-4k.pth 43%[=======> ] 442.66M 50.3MB/s eta 12s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "4k.pth 44%[=======> ] 456.72M 49.8MB/s eta 12s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "k.pth 46%[========> ] 472.50M 51.1MB/s eta 12s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + ".pth 47%[========> ] 480.25M 51.7MB/s eta 12s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "pth 48%[========> ] 488.28M 50.6MB/s eta 12s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "th 50%[=========> ] 511.80M 53.9MB/s eta 10s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "h 51%[=========> ] 528.83M 53.4MB/s eta 10s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " 52%[=========> ] 534.05M 54.8MB/s eta 10s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v 
54%[=========> ] 549.31M 57.4MB/s eta 10s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5 55%[==========> ] 566.76M 59.5MB/s eta 10s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r 56%[==========> ] 573.51M 56.4MB/s eta 9s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3 57%[==========> ] 579.83M 49.8MB/s eta 9s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3- 58%[==========> ] 594.57M 48.9MB/s eta 9s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L 60%[===========> ] 610.35M 50.6MB/s eta 9s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6 61%[===========> ] 625.61M 51.2MB/s eta 8s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6- 63%[===========> ] 640.87M 53.4MB/s eta 8s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D 64%[===========> ] 656.13M 53.0MB/s eta 8s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D2 66%[============> ] 671.38M 53.7MB/s eta 8s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D20 67%[============> ] 686.13M 56.4MB/s eta 6s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D204 67%[============> ] 686.77M 52.2MB/s eta 6s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D2048 69%[============> ] 701.90M 52.3MB/s eta 6s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D2048- 70%[=============> ] 717.16M 51.2MB/s eta 6s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D2048-E 72%[=============> ] 732.42M 52.4MB/s eta 6s " + ] + }, + 
{ + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D2048-E0 73%[=============> ] 742.67M 51.0MB/s eta 6s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D2048-E0_ 74%[=============> ] 756.60M 50.1MB/s eta 6s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5r3-L6-D2048-E0_1 75%[==============> ] 771.87M 52.0MB/s eta 6s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "v5r3-L6-D2048-E0_1- 76%[==============> ] 781.29M 53.7MB/s eta 6s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "5r3-L6-D2048-E0_1-e 78%[==============> ] 793.46M 52.7MB/s eta 4s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "r3-L6-D2048-E0_1-en 79%[==============> ] 808.71M 51.9MB/s eta 4s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "3-L6-D2048-E0_1-enw 81%[===============> ] 823.97M 52.1MB/s eta 4s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "-L6-D2048-E0_1-enwi 82%[===============> ] 840.69M 53.9MB/s eta 4s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "L6-D2048-E0_1-enwik 84%[===============> ] 854.49M 52.7MB/s eta 4s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "6-D2048-E0_1-enwiki 85%[================> ] 869.75M 52.5MB/s eta 3s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "-D2048-E0_1-enwiki- 87%[================> ] 885.01M 53.8MB/s eta 3s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "D2048-E0_1-enwiki-4 88%[================> ] 899.75M 53.7MB/s eta 3s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "2048-E0_1-enwiki-4k 89%[================> ] 906.85M 51.7MB/s eta 3s " + ] + }, + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "\r", + "048-E0_1-enwiki-4k. 90%[=================> ] 915.66M 53.2MB/s eta 2s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "48-E0_1-enwiki-4k.p 91%[=================> ] 930.27M 53.4MB/s eta 2s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "8-E0_1-enwiki-4k.pt 92%[=================> ] 945.53M 54.0MB/s eta 2s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "-E0_1-enwiki-4k.pth 94%[=================> ] 960.79M 51.7MB/s eta 2s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "E0_1-enwiki-4k.pth 96%[==================> ] 976.55M 53.0MB/s eta 1s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "0_1-enwiki-4k.pth 97%[==================> ] 986.81M 50.5MB/s eta 1s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "_1-enwiki-4k.pth 97%[==================> ] 991.82M 49.7MB/s eta 1s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "1-enwiki-4k.pth 99%[==================> ] 1007M 48.7MB/s eta 1s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "v5r3-L6-D2048-E0_1- 100%[===================>] 1017M 50.0MB/s in 20s \r\n", + "\r\n", + "2023-09-06 17:17:31 (51.1 MB/s) - ‘v5r3-L6-D2048-E0_1-enwiki-4k.pth’ saved [1066536937/1066536937]\r\n", + "\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 1018M\r\n", + "drwxr-xr-x 2 root root 4.0K Sep 6 17:17 .\r\n", + "drwxr-xr-x 20 root root 4.0K Sep 6 17:17 ..\r\n", + "-rw-r--r-- 1 root root 1018M Sep 6 17:07 v5r3-L6-D2048-E0_1-enwiki-4k.pth\r\n" + ] + } + ], + "source": [ + "# Download the model directly (stop gap till HF sync issues is resolved)\n", + "!cd \"{TRAINER_DIR}\" && cd \"../model/\" && \\\n", + " wget -nc 
\"https://huggingface.co/rwkv-x-dev/rwkv-x-playground/resolve/main/experiment/rwkv-x-exp/v5-r3-memory/{DIR_NAME}/{FILENAME_PREFIX}-enwiki-4k.pth\"\n", + "\n", + "!cd \"{TRAINER_DIR}\" && cd \"../model/\" && \\\n", + " ls -alh ." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "4e6c3e65", + "metadata": { + "papermill": { + "duration": 0.008152, + "end_time": "2023-09-06T17:17:31.910423", + "exception": false, + "start_time": "2023-09-06T17:17:31.902271", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Enwiki Stage 2 : Basic Instruct Tuning" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "b243d3f6", + "metadata": { + "execution": { + "iopub.execute_input": "2023-09-06T17:17:31.929289Z", + "iopub.status.busy": "2023-09-06T17:17:31.928880Z", + "iopub.status.idle": "2023-09-06T17:17:44.337863Z", + "shell.execute_reply": "2023-09-06T17:17:44.336545Z" + }, + "papermill": { + "duration": 12.421926, + "end_time": "2023-09-06T17:17:44.340880", + "exception": false, + "start_time": "2023-09-06T17:17:31.918954", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "Downloading readme: 0%| | 0.00/7.79k [00:00=12.1), as this is known to have freeze issues\r\n", + "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\r\n", + "# - When resuming from checkpoint, the estimated time is inaccurate\r\n", + "#\r\n", + "\r\n", + "[RWKV.model] Configuring optimizer with\r\n", + " - lr_init: 4.000e-04 (0.0004)\r\n", + " - lr_final: 3.000e-04 (0.0003)\r\n", + "\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...\r\n", + "Detected CUDA files, patching ldflags\r\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu118/fused_adam/build.ninja...\r\n", + "Building extension 
module fused_adam...\r\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ninja: no work to do.\r\n", + "Loading extension module fused_adam...\r\n", + "Time to load fused_adam op: 0.08282637596130371 seconds\r\n", + "Loading `train_dataloader` to estimate number of stepping batches.\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rank: 0 partition count [1, 1] and sizes[(533245952, False), (384, False)] \r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r\n", + " | Name | Type | Params\r\n", + "--------------------------------------\r\n", + "0 | emb | Embedding | 102 M \r\n", + "1 | blocks | ModuleList | 327 M \r\n", + "2 | ln_out | LayerNorm | 4.1 K \r\n", + "3 | head | Linear | 102 M \r\n", + "--------------------------------------\r\n", + "533 M Trainable params\r\n", + "0 Non-trainable params\r\n", + "533 M Total params\r\n", + "2,132.985 Total estimated model params size (MB)\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "Training: 0it [00:00, ?it/s]\r", + "Training: 0%| | 0/14932 [00:00