diff --git "a/experiment/rwkv-x-exp/multi-size-train/v5-L6+6-D2048-overwrite-naive.ipynb" "b/experiment/rwkv-x-exp/multi-size-train/v5-L6+6-D2048-overwrite-naive.ipynb" new file mode 100644--- /dev/null +++ "b/experiment/rwkv-x-exp/multi-size-train/v5-L6+6-D2048-overwrite-naive.ipynb" @@ -0,0 +1,252616 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "5151cc25", + "metadata": { + "papermill": { + "duration": 0.004632, + "end_time": "2023-10-01T09:57:46.594149", + "exception": false, + "start_time": "2023-10-01T09:57:46.589517", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# RWKV v5 multi-size training experiment\n", + "\n", + "**Note:** This project assumes you have the rwkv-infctx conda env set up" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c67b1a80", + "metadata": { + "papermill": { + "duration": 0.00306, + "end_time": "2023-10-01T09:57:46.600575", + "exception": false, + "start_time": "2023-10-01T09:57:46.597515", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Basic Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "40388689", + "metadata": { + "execution": { + "iopub.execute_input": "2023-10-01T09:57:46.605504Z", + "iopub.status.busy": "2023-10-01T09:57:46.605241Z", + "iopub.status.idle": "2023-10-01T09:57:47.263577Z", + "shell.execute_reply": "2023-10-01T09:57:47.262750Z" + }, + "papermill": { + "duration": 0.662999, + "end_time": "2023-10-01T09:57:47.265535", + "exception": false, + "start_time": "2023-10-01T09:57:46.602536", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# First, let's set up the various directories, and init the model\n", + "!mkdir -p ../../../../model/\n", + "!mkdir -p ../../../../datapath/\n", + "!mkdir -p ../../../../checkpoint/" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "b25172dd", + "metadata": { + "execution": { + "iopub.execute_input": 
"2023-10-01T09:57:47.273586Z", + "iopub.status.busy": "2023-10-01T09:57:47.273334Z", + "iopub.status.idle": "2023-10-01T09:57:47.281549Z", + "shell.execute_reply": "2023-10-01T09:57:47.281025Z" + }, + "papermill": { + "duration": 0.01381, + "end_time": "2023-10-01T09:57:47.282899", + "exception": false, + "start_time": "2023-10-01T09:57:47.269089", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DEEPSPEED_STRAT: deepspeed_stage_2_offload\n", + "ENABLE_WANDB: True\n", + "GPU_DEVICES: auto\n", + "NOTEBOOK_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train\n", + "INFERENCE_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5\n", + "TRAINER_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5\n", + "PROJECT_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer\n" + ] + } + ], + "source": [ + "DEEPSPEED_STRAT=\"deepspeed_stage_2_offload\"\n", + "GPU_DEVICES=\"auto\"\n", + "ENABLE_WANDB=True\n", + "\n", + "EMBED_SCALE=0.01\n", + "EMBED_SCALE_LABEL=str(EMBED_SCALE).replace(\".\", \"_\")\n", + "\n", + "EMBED_SIZE=2048\n", + "\n", + "WANDB_PREFIX=f\"[Multi-size] v5-L6+6-D{EMBED_SIZE}-E{EMBED_SCALE}\"\n", + "FILENAME_PREFIX=f\"v5-L6+6-D{EMBED_SIZE}-E{EMBED_SCALE_LABEL}\"\n", + "\n", + "print(\"DEEPSPEED_STRAT:\", DEEPSPEED_STRAT)\n", + "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n", + "print(\"GPU_DEVICES:\", GPU_DEVICES)\n", + "\n", + "if ENABLE_WANDB:\n", + " WANDB_MODE=\"online\"\n", + "else:\n", + " WANDB_MODE=\"disabled\"\n", + "\n", + "# Computing the notebook, and various paths\n", + "import os\n", + "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n", + "PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"../../../../\"))\n", + "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", + 
"INFERENCE_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", + "\n", + "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n", + "print(\"INFERENCE_DIR:\", INFERENCE_DIR)\n", + "print(\"TRAINER_DIR:\", TRAINER_DIR)\n", + "print(\"PROJECT_DIR:\", PROJECT_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b519f60b", + "metadata": { + "execution": { + "iopub.execute_input": "2023-10-01T09:57:47.290646Z", + "iopub.status.busy": "2023-10-01T09:57:47.290505Z", + "iopub.status.idle": "2023-10-01T09:58:12.652070Z", + "shell.execute_reply": "2023-10-01T09:58:12.651335Z" + }, + "papermill": { + "duration": 25.367302, + "end_time": "2023-10-01T09:58:12.653786", + "exception": false, + "start_time": "2023-10-01T09:57:47.286484", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2023-10-01 09:57:47-- https://huggingface.co/rwkv-x-dev/rwkv-x-playground/resolve/main/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-enwiki-4k-p1.pth\r\n", + "Resolving huggingface.co (huggingface.co)... 13.33.33.102, 13.33.33.55, 13.33.33.110, ...\r\n", + "Connecting to huggingface.co (huggingface.co)|13.33.33.102|:443... connected.\r\n", + "HTTP request sent, awaiting response... 
" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "302 Found\r\n", + "Location: https://cdn-lfs.huggingface.co/repos/2e/f7/2ef78555202aa92abdbdf476ce3d0fd5a8b15f7245edf0b80d4d30572355f30d/235d88b0aa939596392f2b5734a426940535816aa13106498974a809051a4c75?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27v5-L6-D2048-E0_01-enwiki-4k-p1.pth%3B+filename%3D%22v5-L6-D2048-E0_01-enwiki-4k-p1.pth%22%3B&Expires=1696413467&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTY5NjQxMzQ2N319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy8yZS9mNy8yZWY3ODU1NTIwMmFhOTJhYmRiZGY0NzZjZTNkMGZkNWE4YjE1ZjcyNDVlZGYwYjgwZDRkMzA1NzIzNTVmMzBkLzIzNWQ4OGIwYWE5Mzk1OTYzOTJmMmI1NzM0YTQyNjk0MDUzNTgxNmFhMTMxMDY0OTg5NzRhODA5MDUxYTRjNzU%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=wDUzeqxdQEcRlooaN1Z-vyIWpSMoZX7qd8txk8fcev82ujeAoDVDmnWcsCf8-QivzrmkDKmtpOGXXiSZopdwIgdPA2Bq1z-kOI62MVdmAEOsgV-BPGd6yLtFiIrTyAKsz7jxgD1b6Q0gg1y6-b9rFXvnS4HMZwCPcJ1m6hKL40nz-sKHM5d9Yy38NYzj-BxuFU6GqgDqvpCe1JSioY9EdMqfZkxAHr4PqLzd71PmzMhQeTUEVpMARnctaCvfa1CpGNuVVNuFT8-rDprFwyWuUJEj%7EvUB68Gu6FIgldLq3G4x-om%7EWJWOOX6JffH-x3-gcWjVeBdVe7CHk080IfyaoA__&Key-Pair-Id=KVTP0A1DKRTAX [following]\r\n", + "--2023-10-01 09:57:47-- 
https://cdn-lfs.huggingface.co/repos/2e/f7/2ef78555202aa92abdbdf476ce3d0fd5a8b15f7245edf0b80d4d30572355f30d/235d88b0aa939596392f2b5734a426940535816aa13106498974a809051a4c75?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27v5-L6-D2048-E0_01-enwiki-4k-p1.pth%3B+filename%3D%22v5-L6-D2048-E0_01-enwiki-4k-p1.pth%22%3B&Expires=1696413467&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTY5NjQxMzQ2N319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy8yZS9mNy8yZWY3ODU1NTIwMmFhOTJhYmRiZGY0NzZjZTNkMGZkNWE4YjE1ZjcyNDVlZGYwYjgwZDRkMzA1NzIzNTVmMzBkLzIzNWQ4OGIwYWE5Mzk1OTYzOTJmMmI1NzM0YTQyNjk0MDUzNTgxNmFhMTMxMDY0OTg5NzRhODA5MDUxYTRjNzU%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=wDUzeqxdQEcRlooaN1Z-vyIWpSMoZX7qd8txk8fcev82ujeAoDVDmnWcsCf8-QivzrmkDKmtpOGXXiSZopdwIgdPA2Bq1z-kOI62MVdmAEOsgV-BPGd6yLtFiIrTyAKsz7jxgD1b6Q0gg1y6-b9rFXvnS4HMZwCPcJ1m6hKL40nz-sKHM5d9Yy38NYzj-BxuFU6GqgDqvpCe1JSioY9EdMqfZkxAHr4PqLzd71PmzMhQeTUEVpMARnctaCvfa1CpGNuVVNuFT8-rDprFwyWuUJEj%7EvUB68Gu6FIgldLq3G4x-om%7EWJWOOX6JffH-x3-gcWjVeBdVe7CHk080IfyaoA__&Key-Pair-Id=KVTP0A1DKRTAX\r\n", + "Resolving cdn-lfs.huggingface.co (cdn-lfs.huggingface.co)... 18.155.68.98, 18.155.68.94, 18.155.68.73, ...\r\n", + "Connecting to cdn-lfs.huggingface.co (cdn-lfs.huggingface.co)|18.155.68.98|:443... connected.\r\n", + "HTTP request sent, awaiting response... 
200 OK\r\n", + "Length: 1066537217 (1017M) [binary/octet-stream]\r\n", + "Saving to: ‘v5-L6-D2048-E0_01-enwiki-4k-p1.pth’\r\n", + "\r\n", + "\r", + " v5-L6-D20 0%[ ] 0 --.-KB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5-L6-D204 2%[ ] 20.95M 105MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5-L6-D2048 4%[ ] 43.35M 108MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5-L6-D2048- 6%[> ] 65.74M 110MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5-L6-D2048-E 8%[> ] 88.14M 110MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5-L6-D2048-E0 10%[=> ] 110.54M 111MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5-L6-D2048-E0_ 13%[=> ] 132.95M 111MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5-L6-D2048-E0_0 15%[==> ] 155.35M 111MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5-L6-D2048-E0_01 17%[==> ] 177.74M 111MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5-L6-D2048-E0_01- 19%[==> ] 200.15M 111MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "v5-L6-D2048-E0_01-e 21%[===> ] 222.54M 111MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "5-L6-D2048-E0_01-en 24%[===> ] 244.94M 111MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "-L6-D2048-E0_01-enw 26%[====> ] 267.34M 111MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "L6-D2048-E0_01-enwi 28%[====> ] 289.74M 111MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "6-D2048-E0_01-enwik 30%[=====> ] 312.14M 111MB/s " + ] + }, + { + "name": "stdout", 
+ "output_type": "stream", + "text": [ + "\r", + "-D2048-E0_01-enwiki 32%[=====> ] 334.54M 111MB/s eta 6s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "D2048-E0_01-enwiki- 35%[======> ] 356.92M 112MB/s eta 6s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "2048-E0_01-enwiki-4 37%[======> ] 379.31M 112MB/s eta 6s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "048-E0_01-enwiki-4k 39%[======> ] 401.71M 112MB/s eta 6s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "48-E0_01-enwiki-4k- 41%[=======> ] 423.51M 112MB/s eta 6s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "8-E0_01-enwiki-4k-p 43%[=======> ] 445.91M 112MB/s eta 5s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "-E0_01-enwiki-4k-p1 46%[========> ] 468.31M 112MB/s eta 5s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "E0_01-enwiki-4k-p1. 
48%[========> ] 490.56M 112MB/s eta 5s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "0_01-enwiki-4k-p1.p 50%[=========> ] 512.22M 111MB/s eta 5s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "_01-enwiki-4k-p1.pt 52%[=========> ] 534.13M 111MB/s eta 5s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "01-enwiki-4k-p1.pth 54%[=========> ] 556.52M 111MB/s eta 4s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "1-enwiki-4k-p1.pth 56%[==========> ] 578.93M 111MB/s eta 4s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "-enwiki-4k-p1.pth 59%[==========> ] 601.33M 111MB/s eta 4s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "enwiki-4k-p1.pth 61%[===========> ] 623.74M 111MB/s eta 4s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "nwiki-4k-p1.pth 63%[===========> ] 646.15M 111MB/s eta 4s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "wiki-4k-p1.pth 65%[============> ] 668.55M 111MB/s eta 3s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "iki-4k-p1.pth 67%[============> ] 690.95M 111MB/s eta 3s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "ki-4k-p1.pth 70%[=============> ] 713.36M 111MB/s eta 3s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "i-4k-p1.pth 72%[=============> ] 735.75M 111MB/s eta 3s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "-4k-p1.pth 74%[=============> ] 758.16M 112MB/s eta 3s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "4k-p1.pth 76%[==============> ] 780.57M 112MB/s eta 2s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "k-p1.pth 
78%[==============> ] 802.97M 112MB/s eta 2s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "-p1.pth 81%[===============> ] 825.33M 112MB/s eta 2s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "p1.pth 83%[===============> ] 847.73M 112MB/s eta 2s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "1.pth 85%[================> ] 870.13M 112MB/s eta 2s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + ".pth 87%[================> ] 892.51M 112MB/s eta 1s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "pth 89%[================> ] 914.92M 112MB/s eta 1s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "th 92%[=================> ] 937.32M 112MB/s eta 1s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "h 94%[=================> ] 959.72M 112MB/s eta 1s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " 96%[==================> ] 982.12M 112MB/s eta 1s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v 98%[==================> ] 1005M 112MB/s eta 0s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "v5-L6-D2048-E0_01-e 100%[===================>] 1017M 112MB/s in 9.1s \r\n", + "\r\n", + "2023-10-01 09:57:56 (112 MB/s) - ‘v5-L6-D2048-E0_01-enwiki-4k-p1.pth’ saved [1066537217/1066537217]\r\n", + "\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2023-10-01 09:57:57-- https://huggingface.co/rwkv-x-dev/rwkv-x-playground/resolve/main/experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-E0_01-neox-v5base-init.pth\r\n", + "Resolving huggingface.co (huggingface.co)... 13.33.33.20, 13.33.33.110, 13.33.33.55, ...\r\n", + "Connecting to huggingface.co (huggingface.co)|13.33.33.20|:443... 
connected.\r\n", + "HTTP request sent, awaiting response... " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "302 Found\r\n", + "Location: https://cdn-lfs.huggingface.co/repos/2e/f7/2ef78555202aa92abdbdf476ce3d0fd5a8b15f7245edf0b80d4d30572355f30d/06105d96413046fce0ec189b9c4685a813cfa7147300851c5d2afc7b5adbcb38?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27v5-L12-D2048-E0_01-neox-v5base-init.pth%3B+filename%3D%22v5-L12-D2048-E0_01-neox-v5base-init.pth%22%3B&Expires=1696413477&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTY5NjQxMzQ3N319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy8yZS9mNy8yZWY3ODU1NTIwMmFhOTJhYmRiZGY0NzZjZTNkMGZkNWE4YjE1ZjcyNDVlZGYwYjgwZDRkMzA1NzIzNTVmMzBkLzA2MTA1ZDk2NDEzMDQ2ZmNlMGVjMTg5YjljNDY4NWE4MTNjZmE3MTQ3MzAwODUxYzVkMmFmYzdiNWFkYmNiMzg%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=LWC6SrRhPM60aed7LTQgbpbgyfPH%7ErKagx7ILRIli596Au0p-L3ORW3oFZtLHnd8Z%7EE8dbw7cjlUxBZymFfB7nuDeJavLCXDTilLOUn0MiMc768TEbjPtNxQTAFpCt-HkgaQcpxegor6yPaOyS0boCdX0%7Egs4H%7Evxpd5KhIJSH1UGQ9B6Za7pGhPJd2N0g-Zpg3EBoFy8BJ0zEtsvioSc7RdRNHs3FuaHZgQEweYVcw-peyRRlJaMBgQLPob8YmeruLHGiVCYOoVUw6YYtE85ENwcnqxnic8YWYCZgjY0WbNWlzu6bx2V62TP5ZjaKl1hkQYhgSq68RB5Z0LEFWvuw__&Key-Pair-Id=KVTP0A1DKRTAX [following]\r\n", + "--2023-10-01 09:57:57-- 
https://cdn-lfs.huggingface.co/repos/2e/f7/2ef78555202aa92abdbdf476ce3d0fd5a8b15f7245edf0b80d4d30572355f30d/06105d96413046fce0ec189b9c4685a813cfa7147300851c5d2afc7b5adbcb38?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27v5-L12-D2048-E0_01-neox-v5base-init.pth%3B+filename%3D%22v5-L12-D2048-E0_01-neox-v5base-init.pth%22%3B&Expires=1696413477&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTY5NjQxMzQ3N319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy8yZS9mNy8yZWY3ODU1NTIwMmFhOTJhYmRiZGY0NzZjZTNkMGZkNWE4YjE1ZjcyNDVlZGYwYjgwZDRkMzA1NzIzNTVmMzBkLzA2MTA1ZDk2NDEzMDQ2ZmNlMGVjMTg5YjljNDY4NWE4MTNjZmE3MTQ3MzAwODUxYzVkMmFmYzdiNWFkYmNiMzg%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=LWC6SrRhPM60aed7LTQgbpbgyfPH%7ErKagx7ILRIli596Au0p-L3ORW3oFZtLHnd8Z%7EE8dbw7cjlUxBZymFfB7nuDeJavLCXDTilLOUn0MiMc768TEbjPtNxQTAFpCt-HkgaQcpxegor6yPaOyS0boCdX0%7Egs4H%7Evxpd5KhIJSH1UGQ9B6Za7pGhPJd2N0g-Zpg3EBoFy8BJ0zEtsvioSc7RdRNHs3FuaHZgQEweYVcw-peyRRlJaMBgQLPob8YmeruLHGiVCYOoVUw6YYtE85ENwcnqxnic8YWYCZgjY0WbNWlzu6bx2V62TP5ZjaKl1hkQYhgSq68RB5Z0LEFWvuw__&Key-Pair-Id=KVTP0A1DKRTAX\r\n", + "Resolving cdn-lfs.huggingface.co (cdn-lfs.huggingface.co)... 18.155.68.128, 18.155.68.73, 18.155.68.94, ...\r\n", + "Connecting to cdn-lfs.huggingface.co (cdn-lfs.huggingface.co)|18.155.68.128|:443... connected.\r\n", + "HTTP request sent, awaiting response... 
" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "200 OK\r\n", + "Length: 1721189797 (1.6G) [binary/octet-stream]\r\n", + "Saving to: ‘v5-L12-D2048-E0_01-neox-v5base-init.pth’\r\n", + "\r\n", + "\r", + " v5-L12-D2 0%[ ] 0 --.-KB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5-L12-D20 1%[ ] 20.90M 104MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5-L12-D204 2%[ ] 43.30M 108MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5-L12-D2048 3%[ ] 64.92M 108MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5-L12-D2048- 5%[> ] 88.08M 110MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5-L12-D2048-E 6%[> ] 110.44M 110MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5-L12-D2048-E0 8%[> ] 132.85M 111MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5-L12-D2048-E0_ 9%[> ] 155.21M 111MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5-L12-D2048-E0_0 10%[=> ] 177.58M 111MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5-L12-D2048-E0_01 12%[=> ] 199.98M 111MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "v5-L12-D2048-E0_01- 13%[=> ] 222.38M 111MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "5-L12-D2048-E0_01-n 14%[=> ] 244.79M 111MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "-L12-D2048-E0_01-ne 16%[==> ] 267.19M 111MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "L12-D2048-E0_01-neo 17%[==> ] 289.60M 111MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "12-D2048-E0_01-neox 
19%[==> ] 311.98M 111MB/s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "2-D2048-E0_01-neox- 20%[===> ] 334.37M 111MB/s eta 12s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "-D2048-E0_01-neox-v 21%[===> ] 356.77M 112MB/s eta 12s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "D2048-E0_01-neox-v5 23%[===> ] 379.18M 112MB/s eta 12s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "2048-E0_01-neox-v5b 24%[===> ] 401.58M 112MB/s eta 12s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "048-E0_01-neox-v5ba 25%[====> ] 423.98M 112MB/s eta 12s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "48-E0_01-neox-v5bas 27%[====> ] 446.38M 112MB/s eta 11s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "8-E0_01-neox-v5base 28%[====> ] 468.77M 112MB/s eta 11s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "-E0_01-neox-v5base- 29%[====> ] 491.18M 112MB/s eta 11s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "E0_01-neox-v5base-i 31%[=====> ] 513.54M 112MB/s eta 11s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "0_01-neox-v5base-in 32%[=====> ] 535.98M 112MB/s eta 11s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "_01-neox-v5base-ini 34%[=====> ] 558.37M 112MB/s eta 10s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "01-neox-v5base-init 35%[======> ] 580.44M 112MB/s eta 10s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "1-neox-v5base-init. 
36%[======> ] 602.78M 112MB/s eta 10s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "-neox-v5base-init.p 38%[======> ] 625.17M 112MB/s eta 10s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "neox-v5base-init.pt 39%[======> ] 647.58M 112MB/s eta 10s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "eox-v5base-init.pth 40%[=======> ] 669.98M 112MB/s eta 9s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "ox-v5base-init.pth 42%[=======> ] 692.39M 112MB/s eta 9s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "x-v5base-init.pth 43%[=======> ] 714.14M 112MB/s eta 9s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "-v5base-init.pth 44%[=======> ] 736.54M 112MB/s eta 9s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "v5base-init.pth 46%[========> ] 758.73M 112MB/s eta 9s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "5base-init.pth 47%[========> ] 781.34M 112MB/s eta 8s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "base-init.pth 48%[========> ] 803.74M 112MB/s eta 8s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "ase-init.pth 50%[=========> ] 826.13M 112MB/s eta 8s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "se-init.pth 51%[=========> ] 848.54M 112MB/s eta 8s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "e-init.pth 53%[=========> ] 870.95M 112MB/s eta 8s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "-init.pth 54%[=========> ] 893.33M 112MB/s eta 7s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "init.pth 55%[==========> ] 915.74M 112MB/s eta 7s " + ] + }, 
+ { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "nit.pth 57%[==========> ] 938.13M 112MB/s eta 7s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "it.pth 58%[==========> ] 960.54M 112MB/s eta 7s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "t.pth 59%[==========> ] 982.94M 112MB/s eta 7s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + ".pth 61%[===========> ] 1005M 112MB/s eta 6s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "pth 62%[===========> ] 1.00G 112MB/s eta 6s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "th 63%[===========> ] 1.03G 112MB/s eta 6s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "h 65%[============> ] 1.05G 112MB/s eta 6s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " 66%[============> ] 1.07G 112MB/s eta 6s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v 68%[============> ] 1.09G 112MB/s eta 5s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5 68%[============> ] 1.09G 105MB/s eta 5s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5- 69%[============> ] 1.11G 104MB/s eta 5s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5-L 70%[=============> ] 1.13G 104MB/s eta 5s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5-L1 71%[=============> ] 1.15G 101MB/s eta 5s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5-L12 72%[=============> ] 1.17G 99.9MB/s eta 4s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5-L12- 74%[=============> ] 1.19G 100MB/s eta 4s " + ] + }, + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "\r", + " v5-L12-D 75%[==============> ] 1.20G 98.0MB/s eta 4s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5-L12-D2 76%[==============> ] 1.22G 95.9MB/s eta 4s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5-L12-D20 77%[==============> ] 1.24G 95.6MB/s eta 4s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5-L12-D204 78%[==============> ] 1.26G 95.9MB/s eta 3s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5-L12-D2048 80%[===============> ] 1.28G 95.3MB/s eta 3s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5-L12-D2048- 81%[===============> ] 1.30G 95.6MB/s eta 3s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5-L12-D2048-E 82%[===============> ] 1.33G 95.9MB/s eta 3s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5-L12-D2048-E0 84%[===============> ] 1.35G 95.3MB/s eta 3s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5-L12-D2048-E0_ 85%[================> ] 1.37G 95.6MB/s eta 2s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5-L12-D2048-E0_0 86%[================> ] 1.39G 97.1MB/s eta 2s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + " v5-L12-D2048-E0_01 88%[================> ] 1.41G 103MB/s eta 2s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "v5-L12-D2048-E0_01- 89%[================> ] 1.44G 103MB/s eta 2s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "5-L12-D2048-E0_01-n 90%[=================> ] 1.46G 104MB/s eta 2s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "-L12-D2048-E0_01-ne 92%[=================> ] 1.48G 
107MB/s eta 1s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "L12-D2048-E0_01-neo 93%[=================> ] 1.50G 107MB/s eta 1s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "12-D2048-E0_01-neox 95%[==================> ] 1.52G 109MB/s eta 1s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "2-D2048-E0_01-neox- 96%[==================> ] 1.54G 111MB/s eta 1s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "-D2048-E0_01-neox-v 97%[==================> ] 1.57G 112MB/s eta 1s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "D2048-E0_01-neox-v5 99%[==================> ] 1.59G 112MB/s eta 0s " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "v5-L12-D2048-E0_01- 100%[===================>] 1.60G 112MB/s in 15s \r\n", + "\r\n", + "2023-10-01 09:58:12 (108 MB/s) - ‘v5-L12-D2048-E0_01-neox-v5base-init.pth’ saved [1721189797/1721189797]\r\n", + "\r\n" + ] + } + ], + "source": [ + "# Get the init L12 model, and download the L6 model\n", + "!cd \"{PROJECT_DIR}/model/\" && wget -nc \"https://huggingface.co/rwkv-x-dev/rwkv-x-playground/resolve/main/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-enwiki-4k-p1.pth\"\n", + "!cd \"{PROJECT_DIR}/model/\" && wget -nc \"https://huggingface.co/rwkv-x-dev/rwkv-x-playground/resolve/main/experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-E0_01-neox-v5base-init.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "cb1e8942", + "metadata": { + "execution": { + "iopub.execute_input": "2023-10-01T09:58:12.673746Z", + "iopub.status.busy": "2023-10-01T09:58:12.673380Z", + "iopub.status.idle": "2023-10-01T09:58:15.950961Z", + "shell.execute_reply": "2023-10-01T09:58:15.950219Z" + }, + "papermill": { + "duration": 3.286901, + "end_time": "2023-10-01T09:58:15.953065", + "exception": false, + "start_time": 
"2023-10-01T09:58:12.666164", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---- Merging model ----\r\n", + "Baseline model path: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/model/v5-L12-D2048-E0_01-neox-v5base-init.pth\r\n", + "Source model path: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/model/v5-L6-D2048-E0_01-enwiki-4k-p1.pth\r\n", + "Output model path: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/model/v5-L6+6-D2048-E0_01-overwrite-naive-p1.pth\r\n", + "Merge mode: overwrite\r\n", + "---- ----- ----\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Merging blocks.0.att.gate.weight ...\r\n", + "Merging blocks.0.att.key.weight ...\r\n", + "Merging blocks.0.att.ln_x.bias ...\r\n", + "Merging blocks.0.att.ln_x.weight ...\r\n", + "Merging blocks.0.att.output.weight ...\r\n", + "Merging blocks.0.att.receptance.weight ...\r\n", + "Merging blocks.0.att.time_mix_g ...\r\n", + "Merging blocks.0.att.time_mix_k ...\r\n", + "Merging blocks.0.att.time_mix_r ...\r\n", + "Merging blocks.0.att.time_mix_v ...\r\n", + "Merging blocks.0.att.value.weight ...\r\n", + "Merging blocks.0.ffn.key.weight ...\r\n", + "Merging blocks.0.ffn.receptance.weight ...\r\n", + "Merging blocks.0.ffn.time_mix_k ...\r\n", + "Merging blocks.0.ffn.time_mix_r ...\r\n", + "Merging blocks.0.ffn.value.weight ...\r\n", + "Merging blocks.0.ln0.bias ...\r\n", + "Merging blocks.0.ln0.weight ...\r\n", + "Merging blocks.0.ln1.bias ...\r\n", + "Merging blocks.0.ln1.weight ...\r\n", + "Merging blocks.0.ln2.bias ...\r\n", + "Merging blocks.0.ln2.weight ...\r\n", + "Merging blocks.1.att.gate.weight ...\r\n", + "Merging blocks.1.att.key.weight ...\r\n", + "Merging blocks.1.att.ln_x.bias ...\r\n", + "Merging blocks.1.att.ln_x.weight ...\r\n", + "Merging blocks.1.att.output.weight ...\r\n", + "Merging blocks.1.att.receptance.weight ...\r\n", + "Merging 
blocks.1.att.time_mix_g ...\r\n", + "Merging blocks.1.att.time_mix_k ...\r\n", + "Merging blocks.1.att.time_mix_r ...\r\n", + "Merging blocks.1.att.time_mix_v ...\r\n", + "Merging blocks.1.att.value.weight ...\r\n", + "Merging blocks.1.ffn.key.weight ...\r\n", + "Merging blocks.1.ffn.receptance.weight ...\r\n", + "Merging blocks.1.ffn.time_mix_k ...\r\n", + "Merging blocks.1.ffn.time_mix_r ...\r\n", + "Merging blocks.1.ffn.value.weight ...\r\n", + "Merging blocks.1.ln1.bias ...\r\n", + "Merging blocks.1.ln1.weight ...\r\n", + "Merging blocks.1.ln2.bias ...\r\n", + "Merging blocks.1.ln2.weight ...\r\n", + "Merging blocks.2.att.gate.weight ...\r\n", + "Merging blocks.2.att.key.weight ...\r\n", + "Merging blocks.2.att.ln_x.bias ...\r\n", + "Merging blocks.2.att.ln_x.weight ...\r\n", + "Merging blocks.2.att.output.weight ...\r\n", + "Merging blocks.2.att.receptance.weight ...\r\n", + "Merging blocks.2.att.time_mix_g ...\r\n", + "Merging blocks.2.att.time_mix_k ...\r\n", + "Merging blocks.2.att.time_mix_r ...\r\n", + "Merging blocks.2.att.time_mix_v ...\r\n", + "Merging blocks.2.att.value.weight ...\r\n", + "Merging blocks.2.ffn.key.weight ...\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Merging blocks.2.ffn.receptance.weight ...\r\n", + "Merging blocks.2.ffn.time_mix_k ...\r\n", + "Merging blocks.2.ffn.time_mix_r ...\r\n", + "Merging blocks.2.ffn.value.weight ...\r\n", + "Merging blocks.2.ln1.bias ...\r\n", + "Merging blocks.2.ln1.weight ...\r\n", + "Merging blocks.2.ln2.bias ...\r\n", + "Merging blocks.2.ln2.weight ...\r\n", + "Merging blocks.3.att.gate.weight ...\r\n", + "Merging blocks.3.att.key.weight ...\r\n", + "Merging blocks.3.att.ln_x.bias ...\r\n", + "Merging blocks.3.att.ln_x.weight ...\r\n", + "Merging blocks.3.att.output.weight ...\r\n", + "Merging blocks.3.att.receptance.weight ...\r\n", + "Merging blocks.3.att.time_mix_g ...\r\n", + "Merging blocks.3.att.time_mix_k ...\r\n", + "Merging blocks.3.att.time_mix_r ...\r\n", 
+ "Merging blocks.3.att.time_mix_v ...\r\n", + "Merging blocks.3.att.value.weight ...\r\n", + "Merging blocks.3.ffn.key.weight ...\r\n", + "Merging blocks.3.ffn.receptance.weight ...\r\n", + "Merging blocks.3.ffn.time_mix_k ...\r\n", + "Merging blocks.3.ffn.time_mix_r ...\r\n", + "Merging blocks.3.ffn.value.weight ...\r\n", + "Merging blocks.3.ln1.bias ...\r\n", + "Merging blocks.3.ln1.weight ...\r\n", + "Merging blocks.3.ln2.bias ...\r\n", + "Merging blocks.3.ln2.weight ...\r\n", + "Merging blocks.4.att.gate.weight ...\r\n", + "Merging blocks.4.att.key.weight ...\r\n", + "Merging blocks.4.att.ln_x.bias ...\r\n", + "Merging blocks.4.att.ln_x.weight ...\r\n", + "Merging blocks.4.att.output.weight ...\r\n", + "Merging blocks.4.att.receptance.weight ...\r\n", + "Merging blocks.4.att.time_mix_g ...\r\n", + "Merging blocks.4.att.time_mix_k ...\r\n", + "Merging blocks.4.att.time_mix_r ...\r\n", + "Merging blocks.4.att.time_mix_v ...\r\n", + "Merging blocks.4.att.value.weight ...\r\n", + "Merging blocks.4.ffn.key.weight ...\r\n", + "Merging blocks.4.ffn.receptance.weight ...\r\n", + "Merging blocks.4.ffn.time_mix_k ...\r\n", + "Merging blocks.4.ffn.time_mix_r ...\r\n", + "Merging blocks.4.ffn.value.weight ...\r\n", + "Merging blocks.4.ln1.bias ...\r\n", + "Merging blocks.4.ln1.weight ...\r\n", + "Merging blocks.4.ln2.bias ...\r\n", + "Merging blocks.4.ln2.weight ...\r\n", + "Merging blocks.5.att.gate.weight ...\r\n", + "Merging blocks.5.att.key.weight ...\r\n", + "Merging blocks.5.att.ln_x.bias ...\r\n", + "Merging blocks.5.att.ln_x.weight ...\r\n", + "Merging blocks.5.att.output.weight ...\r\n", + "Merging blocks.5.att.receptance.weight ...\r\n", + "Merging blocks.5.att.time_mix_g ...\r\n", + "Merging blocks.5.att.time_mix_k ...\r\n", + "Merging blocks.5.att.time_mix_r ...\r\n", + "Merging blocks.5.att.time_mix_v ...\r\n", + "Merging blocks.5.att.value.weight ...\r\n", + "Merging blocks.5.ffn.key.weight ...\r\n", + "Merging blocks.5.ffn.receptance.weight ...\r\n", + 
"Merging blocks.5.ffn.time_mix_k ...\r\n", + "Merging blocks.5.ffn.time_mix_r ...\r\n", + "Merging blocks.5.ffn.value.weight ...\r\n", + "Merging blocks.5.ln1.bias ...\r\n", + "Merging blocks.5.ln1.weight ...\r\n", + "Merging blocks.5.ln2.bias ...\r\n", + "Merging blocks.5.ln2.weight ...\r\n", + "Merging emb.weight ...\r\n", + "Merging head.weight ...\r\n", + "Merging ln_out.bias ...\r\n", + "Merging ln_out.weight ...\r\n", + "Merging blocks.0.att.time_decay ...\r\n", + "Merging blocks.0.att.time_faaaa ...\r\n", + "Merging blocks.1.att.time_decay ...\r\n", + "Merging blocks.1.att.time_faaaa ...\r\n", + "Merging blocks.2.att.time_decay ...\r\n", + "Merging blocks.2.att.time_faaaa ...\r\n", + "Merging blocks.3.att.time_decay ...\r\n", + "Merging blocks.3.att.time_faaaa ...\r\n", + "Merging blocks.4.att.time_decay ...\r\n", + "Merging blocks.4.att.time_faaaa ...\r\n", + "Merging blocks.5.att.time_decay ...\r\n", + "Merging blocks.5.att.time_faaaa ...\r\n" + ] + } + ], + "source": [ + "# Lets build the merged model\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python3 model_merge.py \\\n", + " --merge-mode=\"overwrite\" \\\n", + " \"{PROJECT_DIR}/model/v5-L12-D2048-E0_01-neox-v5base-init.pth\" \\\n", + " \"{PROJECT_DIR}/model/v5-L6-D2048-E0_01-enwiki-4k-p1.pth\" \\\n", + " \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-overwrite-naive-p1.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "76f85027", + "metadata": { + "execution": { + "iopub.execute_input": "2023-10-01T09:58:15.973538Z", + "iopub.status.busy": "2023-10-01T09:58:15.973079Z", + "iopub.status.idle": "2023-10-01T09:58:31.617840Z", + "shell.execute_reply": "2023-10-01T09:58:31.617035Z" + }, + "papermill": { + "duration": 15.65472, + "end_time": "2023-10-01T09:58:31.619907", + "exception": false, + "start_time": "2023-10-01T09:58:15.965187", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "Saving the dataset 
(0/2 shards): 0%| | 0/27202 [00:00=12.1), as this is known to have freeze issues\r\n", + "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\r\n", + "# - When resuming from checkpoint, the estimated time is inaccurate\r\n", + "#\r\n", + "\r\n", + "[RWKV.model] Configuring optimizer with\r\n", + " - lr_init: 5.000e-04 (0.0005)\r\n", + " - lr_final: 4.500e-04 (0.00045)\r\n", + "\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...\r\n", + "Creating extension directory /root/.cache/torch_extensions/py310_cu118/cpu_adam...\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Detected CUDA files, patching ldflags\r\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu118/cpu_adam/build.ninja...\r\n", + "Building extension module cpu_adam...\r\n", + "Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1/3] /usr/local/cuda/bin/nvcc -DTORCH_EXTENSION_NAME=cpu_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\\"_gcc\\\" -DPYBIND11_STDLIB=\\\"_libstdcpp\\\" -DPYBIND11_BUILD_ABI=\\\"_cxxabi1011\\\" -I/usr/local/lib/python3.10/dist-packages/deepspeed/ops/csrc/includes -I/usr/local/cuda/include -isystem /usr/local/lib/python3.10/dist-packages/torch/include -isystem /usr/local/lib/python3.10/dist-packages/torch/include/torch/csrc/api/include -isystem /usr/local/lib/python3.10/dist-packages/torch/include/TH -isystem /usr/local/lib/python3.10/dist-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /usr/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_89,code=compute_89 -gencode=arch=compute_89,code=sm_89 --compiler-options '-fPIC' -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_89,code=compute_89 -DBF16_AVAILABLE -c /usr/local/lib/python3.10/dist-packages/deepspeed/ops/csrc/common/custom_cuda_kernel.cu -o custom_cuda_kernel.cuda.o \r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2/3] c++ -MMD -MF cpu_adam.o.d -DTORCH_EXTENSION_NAME=cpu_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\\"_gcc\\\" -DPYBIND11_STDLIB=\\\"_libstdcpp\\\" -DPYBIND11_BUILD_ABI=\\\"_cxxabi1011\\\" -I/usr/local/lib/python3.10/dist-packages/deepspeed/ops/csrc/includes -I/usr/local/cuda/include -isystem /usr/local/lib/python3.10/dist-packages/torch/include -isystem /usr/local/lib/python3.10/dist-packages/torch/include/torch/csrc/api/include -isystem 
/usr/local/lib/python3.10/dist-packages/torch/include/TH -isystem /usr/local/lib/python3.10/dist-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /usr/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -O3 -std=c++17 -g -Wno-reorder -L/usr/local/cuda/lib64 -lcudart -lcublas -g -march=native -fopenmp -D__AVX256__ -D__ENABLE_CUDA__ -DBF16_AVAILABLE -c /usr/local/lib/python3.10/dist-packages/deepspeed/ops/csrc/adam/cpu_adam.cpp -o cpu_adam.o \r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[3/3] c++ cpu_adam.o custom_cuda_kernel.cuda.o -shared -lcurand -L/usr/local/lib/python3.10/dist-packages/torch/lib -lc10 -lc10_cuda -ltorch_cpu -ltorch_cuda -ltorch -ltorch_python -L/usr/local/cuda/lib64 -lcudart -o cpu_adam.so\r\n", + "Loading extension module cpu_adam...\r\n", + "Time to load cpu_adam op: 23.124703645706177 seconds\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading `train_dataloader` to estimate number of stepping batches.\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rank: 0 partition count [1, 1] and sizes[(860549120, False), (768, False)] \r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r\n", + " | Name | Type | Params\r\n", + "--------------------------------------\r\n", + "0 | emb | Embedding | 102 M \r\n", + "1 | blocks | ModuleList | 654 M \r\n", + "2 | ln_out | LayerNorm | 4.1 K \r\n", + "3 | head | Linear | 102 M \r\n", + "--------------------------------------\r\n", + "860 M Trainable params\r\n", + "0 Non-trainable params\r\n", + "860 M Total params\r\n", + "3,442.200 Total estimated model params size (MB)\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "Training: 0it [00:00, ?it/s]\r", + "Training: 0%| | 0/27202 [00:00\r\n", + " model = SimpleRWKV(MODEL_PATH, device=DEVICE)\r\n", + " File 
\"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 1420, in __init__\r\n", + " self.model = RWKV(**model_config)\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 566, in __init__\r\n", + " raise ValueError(f\"load_model file '{load_model}' does not exist\")\r\n", + "ValueError: load_model file '../model/v5-L6+6-D2048-E0_01-baseline-p2.pth' does not exist\r\n" + ] + } + ], + "source": [ + "# # Lets do a quick dragon prompt validation\n", + "!cd \"{INFERENCE_DIR}\" && \\\n", + " python3 dragon_test.py \"../model/{FILENAME_PREFIX}-baseline-p2.pth\" \"cuda fp32\"" + ] + }, + { + "cell_type": "markdown", + "id": "19af35c8", + "metadata": { + "papermill": { + "duration": 1.17054, + "end_time": "2023-10-01T15:22:07.809266", + "exception": false, + "start_time": "2023-10-01T15:22:06.638726", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Enwiki Stage 3 : Baseline training" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "841e33e5", + "metadata": { + "execution": { + "iopub.execute_input": "2023-10-01T15:22:10.148567Z", + "iopub.status.busy": "2023-10-01T15:22:10.148263Z", + "iopub.status.idle": "2023-10-01T15:22:27.626409Z", + "shell.execute_reply": "2023-10-01T15:22:27.625595Z" + }, + "papermill": { + "duration": 18.651033, + "end_time": "2023-10-01T15:22:27.628404", + "exception": false, + "start_time": "2023-10-01T15:22:08.977371", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-10-01 15:22:12,971] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py:484: UserWarning: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/enwiki-4k-part3.yaml', '--trainer.logger.init_args.name=[Multi-size] v5-L6+6-D2048-E0.01 - Overwrite Merge Part 3 (train-ctx=4k, deepspeed_stage_2_offload)', '--trainer.strategy=deepspeed_stage_2_offload', '--trainer.devices=auto', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-L6+6-D2048-E0_01-baseline-p3/', '--model.load_model=../model/v5-L6+6-D2048-E0_01-baseline-p2.pth', '--model.ctx_len=4096', '--model.bptt_learning_range=1'], args=['fit', '-c', '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/enwiki-4k-part3.yaml', '--trainer.logger.init_args.name=[Multi-size] v5-L6+6-D2048-E0.01 - Overwrite Merge Part 3 (train-ctx=4k, deepspeed_stage_2_offload)', '--trainer.strategy=deepspeed_stage_2_offload', '--trainer.devices=auto', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-L6+6-D2048-E0_01-baseline-p3/', '--model.load_model=../model/v5-L6+6-D2048-E0_01-baseline-p2.pth', '--model.ctx_len=4096', '--model.bptt_learning_range=1'].\r\n", + " rank_zero_warn(\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/lightning/fabric/utilities/seed.py:39: UserWarning: No seed found, seed set to 4191519895\r\n", + " rank_zero_warn(f\"No seed found, seed set to {seed}\")\r\n", + "Global seed set to 4191519895\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mpicocreator\u001b[0m (\u001b[33mrwkv-x-dev\u001b[0m). 
Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.15.11\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m./wandb/run-20231001_152215-4zohh46b\u001b[0m\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33m[Multi-size] v5-L6+6-D2048-E0.01 - Overwrite Merge Part 3 (train-ctx=4k, deepspeed_stage_2_offload)\u001b[0m\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: ⭐️ View project at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments\u001b[0m\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/runs/4zohh46b\u001b[0m\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Traceback (most recent call last):\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 278, in \r\n", + " cli_main()\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 253, in cli_main\r\n", + " LightningCLI(\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 350, in __init__\r\n", + " self.instantiate_classes()\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 499, in instantiate_classes\r\n", + " self.config_init = self.parser.instantiate_classes(self.config)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_deprecated.py\", line 139, in patched_instantiate_classes\r\n", + " cfg = self._unpatched_instantiate_classes(cfg, **kwargs)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_core.py\", line 1130, in instantiate_classes\r\n", + " 
cfg[subcommand] = subparser.instantiate_classes(cfg[subcommand], instantiate_groups=instantiate_groups)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_deprecated.py\", line 139, in patched_instantiate_classes\r\n", + " cfg = self._unpatched_instantiate_classes(cfg, **kwargs)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_core.py\", line 1124, in instantiate_classes\r\n", + " component.instantiate_class(component, cfg)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_signatures.py\", line 561, in group_instantiate_class\r\n", + " parent[key] = group.group_class(**value)\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 566, in __init__\r\n", + " raise ValueError(f\"load_model file '{load_model}' does not exist\")\r\n", + "ValueError: load_model file '../model/v5-L6+6-D2048-E0_01-baseline-p2.pth' does not exist\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Waiting for W&B process to finish... 
\u001b[31m(failed 1).\u001b[0m Press Control-C to abort syncing.\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run \u001b[33m[Multi-size] v5-L6+6-D2048-E0.01 - Overwrite Merge Part 3 (train-ctx=4k, deepspeed_stage_2_offload)\u001b[0m at: \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/runs/4zohh46b\u001b[0m\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: ️⚡ View job at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjk0OTk4MDcy/version_details/v10\u001b[0m\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Synced 5 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Find logs at: \u001b[35m\u001b[1m./wandb/run-20231001_152215-4zohh46b/logs\u001b[0m\r\n" + ] + } + ], + "source": [ + "# Start the foundation model training\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", + " python3 lightning_trainer.py fit \\\n", + " -c \"{NOTEBOOK_DIR}/enwiki-4k-part3.yaml\" \\\n", + " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Overwrite Merge Part 3 (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n", + " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", + " --trainer.devices=\"{GPU_DEVICES}\" \\\n", + " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-baseline-p3/\" \\\n", + " --model.load_model=\"../model/{FILENAME_PREFIX}-baseline-p2.pth\" \\\n", + " --model.ctx_len=4096 \\\n", + " --model.bptt_learning_range=1" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "6dbac364", + "metadata": { + "execution": { + "iopub.execute_input": "2023-10-01T15:22:29.905239Z", + "iopub.status.busy": "2023-10-01T15:22:29.904937Z", + "iopub.status.idle": "2023-10-01T15:22:32.368148Z", + "shell.execute_reply": "2023-10-01T15:22:32.367303Z" + }, + "papermill": { + "duration": 3.639326, + "end_time": "2023-10-01T15:22:32.369962", + 
"exception": false, + "start_time": "2023-10-01T15:22:28.730636", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-10-01 15:22:31,492] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Traceback (most recent call last):\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 651, in \r\n", + " convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, output_file, save_dtype=args.dtype)\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 542, in convert_zero_checkpoint_to_fp32_state_dict\r\n", + " state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 516, in get_fp32_state_dict_from_zero_checkpoint\r\n", + " raise ValueError(f\"Unable to find 'latest' file at {latest_path}\")\r\n", + "ValueError: Unable to find 'latest' file at ../checkpoint/v5-L6+6-D2048-E0_01-baseline-p3/last.ckpt/latest\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ls: cannot access '../model/v5-L6+6-D2048-E0_01-baseline-p3.pth': No such file or directory\r\n" + ] + } + ], + "source": [ + "# Lets export the model from the checkpoint\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python3 export_checkpoint.py \"../checkpoint/{FILENAME_PREFIX}-baseline-p3/last.ckpt\" \"../model/{FILENAME_PREFIX}-baseline-p3.pth\" \"bf16\"\n", + "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-baseline-p3.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "fceeae0b", + "metadata": { + "execution": { + "iopub.execute_input": "2023-10-01T15:22:34.653212Z", + "iopub.status.busy": 
"2023-10-01T15:22:34.652788Z", + "iopub.status.idle": "2023-10-01T15:22:38.358063Z", + "shell.execute_reply": "2023-10-01T15:22:38.357193Z" + }, + "papermill": { + "duration": 4.813211, + "end_time": "2023-10-01T15:22:38.360065", + "exception": false, + "start_time": "2023-10-01T15:22:33.546854", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-10-01 15:22:37,346] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n", + "Traceback (most recent call last):\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/dragon_test.py\", line 52, in \r\n", + " model = SimpleRWKV(MODEL_PATH, device=DEVICE)\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 1420, in __init__\r\n", + " self.model = RWKV(**model_config)\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 566, in __init__\r\n", + " raise ValueError(f\"load_model file '{load_model}' does not exist\")\r\n", + "ValueError: load_model file '../model/v5-L6+6-D2048-E0_01-baseline-p3.pth' does not exist\r\n" + ] + } + ], + "source": [ + "# # Lets do a quick dragon prompt validation\n", + "!cd \"{INFERENCE_DIR}\" && \\\n", + " python3 dragon_test.py \"../model/{FILENAME_PREFIX}-baseline-p3.pth\" \"cuda fp32\"" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + 
"papermill": { + "default_parameters": {}, + "duration": 19494.071363, + "end_time": "2023-10-01T15:22:39.767052", + "environment_variables": {}, + "exception": null, + "input_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/v5-L6+6-D2048-overwrite-naive.ipynb", + "output_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/experiment/rwkv-x-exp/multi-size-train/v5-L6+6-D2048-overwrite-naive.ipynb", + "parameters": {}, + "start_time": "2023-10-01T09:57:45.695689", + "version": "2.4.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file