{ "cells": [ { "attachments": {}, "cell_type": "markdown", "id": "896634be", "metadata": { "papermill": { "duration": 0.003834, "end_time": "2023-08-26T10:50:44.293199", "exception": false, "start_time": "2023-08-26T10:50:44.289365", "status": "completed" }, "tags": [] }, "source": [ "# RWKV v5-headsize32 / embedding init-range 1e-01 / 4k\n", "\n", "- 6 layers\n", "- 4096 embedding size\n", "\n", "Going through the modified memory training for v5 models, across various initial embedding model weights\n", "\n", "**Note:** This project assumes you have the rwkv-infctx conda env set up" ] }, { "attachments": {}, "cell_type": "markdown", "id": "d84fe08b", "metadata": { "papermill": { "duration": 0.002275, "end_time": "2023-08-26T10:50:44.297946", "exception": false, "start_time": "2023-08-26T10:50:44.295671", "status": "completed" }, "tags": [] }, "source": [ "# Basic Setup" ] }, { "cell_type": "code", "execution_count": 1, "id": "cfbcf7a9", "metadata": { "execution": { "iopub.execute_input": "2023-08-26T10:50:44.304015Z", "iopub.status.busy": "2023-08-26T10:50:44.303474Z", "iopub.status.idle": "2023-08-26T10:50:45.020477Z", "shell.execute_reply": "2023-08-26T10:50:45.019638Z" }, "papermill": { "duration": 0.722053, "end_time": "2023-08-26T10:50:45.022408", "exception": false, "start_time": "2023-08-26T10:50:44.300355", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# First, let's set up the various directories, before initializing the model\n", "!mkdir -p ../../../../model/\n", "!mkdir -p ../../../../datapath/\n", "!mkdir -p ../../../../checkpoint/" ] }, { "cell_type": "code", "execution_count": 2, "id": "023aaaac", "metadata": { "execution": { "iopub.execute_input": "2023-08-26T10:50:45.028347Z", "iopub.status.busy": "2023-08-26T10:50:45.028130Z", "iopub.status.idle": "2023-08-26T10:50:47.855321Z", "shell.execute_reply": "2023-08-26T10:50:47.854667Z" }, "papermill": { "duration": 2.83269, "end_time": "2023-08-26T10:50:47.857531", "exception": false, 
"start_time": "2023-08-26T10:50:45.024841", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\r\n", "\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.1.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\r\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\r\n" ] } ], "source": [ "# Additional dependencies for eval stuff\n", "!pip install -q aiocsv aiofiles" ] }, { "cell_type": "code", "execution_count": 3, "id": "c9023c51", "metadata": { "execution": { "iopub.execute_input": "2023-08-26T10:50:47.864761Z", "iopub.status.busy": "2023-08-26T10:50:47.864581Z", "iopub.status.idle": "2023-08-26T10:50:47.870388Z", "shell.execute_reply": "2023-08-26T10:50:47.869937Z" }, "papermill": { "duration": 0.010544, "end_time": "2023-08-26T10:50:47.871440", "exception": false, "start_time": "2023-08-26T10:50:47.860896", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DEEPSPEED_STRAT: deepspeed_stage_1\n", "ENABLE_WANDB: True\n", "GPU_DEVICES: auto\n", "NOTEBOOK_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-headsize32\n", "INFERENCE_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5headsize32\n", "TRAINER_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5headsize32\n", "PROJECT_DIR: 
/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer\n" ] } ], "source": [ "DEEPSPEED_STRAT=\"deepspeed_stage_1\"\n", "GPU_DEVICES=\"auto\"\n", "ENABLE_WANDB=True\n", "\n", "RWKV_WAVENET_LAYERS=1\n", "\n", "EMBED_SCALE=0.1\n", "EMBED_SCALE_LABEL=str(EMBED_SCALE).replace(\".\", \"_\")\n", "\n", "LAYER_COUNT=6\n", "EMBED_DIM=4096\n", "\n", "WANDB_PREFIX=f\"v5-hs32-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE}\"\n", "FILENAME_PREFIX=f\"v5-hs32-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE_LABEL}\"\n", "\n", "print(\"DEEPSPEED_STRAT:\", DEEPSPEED_STRAT)\n", "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n", "print(\"GPU_DEVICES:\", GPU_DEVICES)\n", "\n", "if ENABLE_WANDB:\n", " WANDB_MODE=\"online\"\n", "else:\n", " WANDB_MODE=\"disabled\"\n", "\n", "# Compute the notebook directory and the various project paths\n", "# (\"__file__\" is deliberately a string literal: abspath() resolves it against the working directory, so dirname() yields that directory)\n", "import os\n", "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n", "PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"../../../../\"))\n", "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5headsize32/\"))\n", "INFERENCE_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5headsize32/\"))\n", "\n", "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n", "print(\"INFERENCE_DIR:\", INFERENCE_DIR)\n", "print(\"TRAINER_DIR:\", TRAINER_DIR)\n", "print(\"PROJECT_DIR:\", PROJECT_DIR)" ] }, { "cell_type": "code", "execution_count": 4, "id": "e62f03e8", "metadata": { "execution": { "iopub.execute_input": "2023-08-26T10:50:47.877565Z", "iopub.status.busy": "2023-08-26T10:50:47.877406Z", "iopub.status.idle": "2023-08-26T10:52:17.326298Z", "shell.execute_reply": "2023-08-26T10:52:17.324780Z" }, "papermill": { "duration": 89.454427, "end_time": "2023-08-26T10:52:17.328534", "exception": false, "start_time": "2023-08-26T10:50:47.874107", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Setting ds_accelerator to cuda (auto detect)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[RWKV.model] 
Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n", "---- Initializing model ----\r\n", "No of layers: 6\r\n", "Embedding size: 4096\r\n", "Output model path: ../model/v5-hs32-L6-D4096-E0_1-neox-init.pth\r\n", "Vocab size: 50277\r\n", "Emb scale: 0.1\r\n", "Note: this process takes a significant time (and ram) for large models\r\n", "---- ----- ----\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "50277 4096 -0.1 emb.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.0.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.0.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.0.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.0.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "16384 4096 1.0 blocks.0.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.0.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 16384 0 blocks.0.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.1.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.1.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.1.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.1.att.output.weight\r\n", "16384 4096 1.0 blocks.1.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.1.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 16384 0 blocks.1.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 
blocks.2.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.2.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.2.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.2.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "16384 4096 1.0 blocks.2.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.2.ffn.receptance.weight\r\n", "4096 16384 0 blocks.2.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.3.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.3.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.3.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.3.att.output.weight\r\n", "16384 4096 1.0 blocks.3.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.3.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 16384 0 blocks.3.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.4.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.4.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.4.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.4.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "16384 4096 1.0 blocks.4.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.4.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ 
"4096 16384 0 blocks.4.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.5.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.5.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.5.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.5.att.output.weight\r\n", "16384 4096 1.0 blocks.5.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.5.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 16384 0 blocks.5.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "50277 4096 0.5 head.weight\r\n" ] } ], "source": [ "# Init the model\n", "!cd \"{TRAINER_DIR}\" && \\\n", " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", " python3 ./init_model.py \\\n", " --n_layer \"{LAYER_COUNT}\" --n_embd \"{EMBED_DIM}\" \\\n", " --emb-scale \"{EMBED_SCALE}\" \\\n", " --vocab_size neox --skip-if-exists \\\n", " \"../model/{FILENAME_PREFIX}-neox-init.pth\"" ] }, { "cell_type": "markdown", "id": "ffe9086d", "metadata": { "papermill": { "duration": 0.004561, "end_time": "2023-08-26T10:52:17.339630", "exception": false, "start_time": "2023-08-26T10:52:17.335069", "status": "completed" }, "tags": [] }, "source": [ "## Enwiki Stage 1 : Foundation 4k model training" ] }, { "cell_type": "code", "execution_count": 5, "id": "666731cf", "metadata": { "execution": { "iopub.execute_input": "2023-08-26T10:52:17.351244Z", "iopub.status.busy": "2023-08-26T10:52:17.350990Z", "iopub.status.idle": "2023-08-26T10:52:29.271727Z", "shell.execute_reply": "2023-08-26T10:52:29.270864Z" }, "papermill": { "duration": 11.929867, "end_time": "2023-08-26T10:52:29.274103", "exception": false, "start_time": "2023-08-26T10:52:17.344236", "status": "completed" }, "tags": [] }, 
"outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Found cached dataset parquet (/actions-runner/.cache/huggingface/datasets/teven___parquet/teven--enwiki_100k-1359e81b212c2dd6/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)\r\n", "\r", " 0%| | 0/1 [00:00=12.1), as this is known to have freeze issues\r\n", "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\r\n", "# - When resuming from checkpoint, the estimated time is inaccurate\r\n", "#LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\r\n", "\r\n", "\r\n", "[RWKV.model] Configuring optimizer with\r\n", " - lr_init: 6.000e-04 (0.0006)\r\n", " - lr_final: 4.000e-04 (0.0004)\r\n", "\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Detected CUDA files, patching ldflags\r\n", "Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/fused_adam/build.ninja...\r\n", "Building extension module fused_adam...\r\n", "Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "ninja: no work to do.\r\n", "Loading extension module fused_adam...\r\n", "Time to load fused_adam op: 0.07281303405761719 seconds\r\n", "Loading extension module fused_adam...\r\n", "Loading extension module fused_adam...\r\n", "Loading extension module fused_adam...\r\n", "Loading extension module fused_adam...\r\n", "Loading extension module fused_adam...\r\n", "Loading extension module fused_adam...\r\n", "Loading extension module fused_adam...\r\n", "Time to load fused_adam op: 0.10150337219238281 seconds\r\n", "Time to load fused_adam op: 0.10160708427429199 seconds\r\n", "Time to load fused_adam op: 0.10151147842407227 seconds\r\n", "Time to load fused_adam op: 0.10152792930603027 seconds\r\n", "Time to load fused_adam op: 0.10149931907653809 seconds\r\n", "Time to load fused_adam op: 0.10160160064697266 seconds\r\n", "Time to load fused_adam op: 0.10160374641418457 seconds\r\n", "Loading `train_dataloader` to estimate number of stepping batches.\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/utils/build.ninja...\r\n", 
"Building extension module utils...\r\n", "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "ninja: no work to do.\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.07712340354919434 seconds\r\n", "Loading extension module utils...\r\n", "Loading extension module utils...\r\n", "Loading extension module utils...\r\n", "Loading extension module utils...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.10222005844116211 seconds\r\n", "Time to load utils op: 0.10222220420837402 seconds\r\n", "Time to load utils op: 0.1022186279296875 seconds\r\n", "Loading extension module utils...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Time to load utils op: 0.10224509239196777 seconds\r\n", "Time to load utils op: 0.10342669486999512 seconds\r\n", "Time to load utils op: 0.10248851776123047 seconds\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.10252261161804199 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 6 partition count [8, 8] and sizes[(215097344, False), (192, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 4 partition count [8, 8] and sizes[(215097344, False), (192, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 2 partition count [8, 8] and sizes[(215097344, False), (192, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 5 partition count [8, 8] and sizes[(215097344, False), (192, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 3 partition count [8, 8] and sizes[(215097344, False), (192, False)] \r\n", "Rank: 7 partition count [8, 8] and sizes[(215097344, False), (192, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 0 partition count [8, 8] and 
sizes[(215097344, False), (192, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 1 partition count [8, 8] and sizes[(215097344, False), (192, False)] \r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0006635189056396484 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0006933212280273438 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0005958080291748047 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0005662441253662109 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0005939006805419922 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0006010532379150391 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded 
extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0006580352783203125 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.00098419189453125 seconds\r\n", "\r\n", " | Name | Type | Params\r\n", "--------------------------------------\r\n", "0 | emb | Embedding | 205 M \r\n", "1 | blocks | ModuleList | 1.3 B \r\n", "2 | ln_out | LayerNorm | 8.2 K \r\n", "3 | head | Linear | 205 M \r\n", "--------------------------------------\r\n", "1.7 B Trainable params\r\n", "0 Non-trainable params\r\n", "1.7 B Total params\r\n", "6,883.121 Total estimated model params size (MB)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "Training: 0it [00:00, ?it/s]\r", "Training: 0%| | 0/10186 [00:00\r\n", " asyncio.run(main_function())\r\n", " File \"/usr/lib/python3.11/asyncio/runners.py\", line 190, in run\r\n", " return runner.run(main)\r\n", " ^^^^^^^^^^^^^^^^\r\n", " File \"/usr/lib/python3.11/asyncio/runners.py\", line 118, in run\r\n", " return self._loop.run_until_complete(task)\r\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", " File \"/usr/lib/python3.11/asyncio/base_events.py\", line 653, in run_until_complete\r\n", " return future.result()\r\n", " ^^^^^^^^^^^^^^^\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-headsize32/../memory_script/eval_v5_memory_guided.py\", line 58, in main_function\r\n", " model = SimpleRWKV(model_path, device=\"cuda\")\r\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 1378, in __init__\r\n", " self.model = RWKV(**model_config)\r\n", " 
^^^^^^^^^^^^^^^^^^^^\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 650, in __init__\r\n", " self.load_state_dict(model_weights)\r\n", " File \"/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py\", line 2041, in load_state_dict\r\n", " raise RuntimeError('Error(s) in loading state_dict for {}:\\n\\t{}'.format(\r\n", "RuntimeError: Error(s) in loading state_dict for RWKV:\r\n", "\tsize mismatch for blocks.0.att.time_decay: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.0.att.time_faaaa: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.1.att.time_decay: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.1.att.time_faaaa: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.2.att.time_decay: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.2.att.time_faaaa: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.3.att.time_decay: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.3.att.time_faaaa: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.4.att.time_decay: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.4.att.time_faaaa: copying a param with shape 
torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.5.att.time_decay: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.5.att.time_faaaa: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).\r\n" ] } ], "source": [ "# Lets do a quick memory test\n", "!export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", " python3 ../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-enwiki-4k.pth\"" ] }, { "attachments": {}, "cell_type": "markdown", "id": "56b0cbbd", "metadata": { "papermill": { "duration": 0.559214, "end_time": "2023-08-26T18:35:12.774022", "exception": false, "start_time": "2023-08-26T18:35:12.214808", "status": "completed" }, "tags": [] }, "source": [ "# Enwiki Stage 2 : Basic Instruct Tuning" ] }, { "cell_type": "code", "execution_count": 10, "id": "866eecec", "metadata": { "execution": { "iopub.execute_input": "2023-08-26T18:35:13.981059Z", "iopub.status.busy": "2023-08-26T18:35:13.980635Z", "iopub.status.idle": "2023-08-26T18:35:20.814986Z", "shell.execute_reply": "2023-08-26T18:35:20.814067Z" }, "papermill": { "duration": 7.452047, "end_time": "2023-08-26T18:35:20.816515", "exception": false, "start_time": "2023-08-26T18:35:13.364468", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Found cached dataset parquet (/actions-runner/.cache/huggingface/datasets/c-s-ale___parquet/c-s-ale--dolly-15k-instruction-alpaca-format-9dfbb23260d63d9d/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)\r\n", "\r", " 0%| | 0/1 [00:00=12.1), as this is known to have freeze issues\r\n", "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\r\n", "# - When resuming from checkpoint, the 
estimated time is inaccurate\r\n", "#LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\r\n", "\r\n", "\r\n", "[RWKV.model] Configuring optimizer with\r\n", " - lr_init: 4.000e-04 (0.0004)\r\n", " - lr_final: 3.000e-04 (0.0003)\r\n", "\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Detected CUDA files, patching ldflags\r\n", "Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/fused_adam/build.ninja...\r\n", "Building extension module fused_adam...\r\n", "Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "ninja: no work to do.\r\n", "Loading extension module fused_adam...\r\n", "Time to load fused_adam op: 0.06924247741699219 seconds\r\n", "Loading extension module fused_adam...\r\n", "Loading extension module fused_adam...\r\n", "Loading extension module fused_adam...\r\n", "Loading extension module fused_adam...\r\n", "Loading extension module fused_adam...\r\n", "Loading extension module fused_adam...\r\n", "Time to load fused_adam op: 0.10156655311584473 seconds\r\n", "Loading extension module fused_adam...\r\n", "Time to load fused_adam op: 0.1015317440032959 seconds\r\n", "Time to load fused_adam op: 0.10159778594970703 seconds\r\n", "Time to load fused_adam op: 0.10166597366333008 seconds\r\n", "Time to load fused_adam op: 0.10168814659118652 seconds\r\n", "Time to load fused_adam op: 0.10166096687316895 seconds\r\n", "Time to load fused_adam op: 0.10368108749389648 seconds\r\n", "Loading `train_dataloader` to estimate number of stepping batches.\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/utils/build.ninja...\r\n", 
"Building extension module utils...\r\n", "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\r\n", "ninja: no work to do.\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.07047772407531738 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Loading extension module utils...\r\n", "Loading extension module utils...\r\n", "Loading extension module utils...\r\n", "Loading extension module utils...\r\n", "Loading extension module utils...\r\n", "Loading extension module utils...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.10281085968017578 seconds\r\n", "Time to load utils op: 0.10235595703125 seconds\r\n", "Time to load utils op: 0.1023414134979248 seconds\r\n", "Time to load utils op: 0.1021113395690918 seconds\r\n", "Time to load utils op: 0.10221982002258301 seconds\r\n", "Time to load utils op: 0.10251808166503906 seconds\r\n", "Time to load utils op: 0.10249519348144531 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 4 partition count [8, 8] and sizes[(215097344, False), (192, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 6 partition count [8, 8] and sizes[(215097344, False), (192, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 5 partition count [8, 8] and sizes[(215097344, False), (192, False)] \r\n", "Rank: 2 partition count [8, 8] and sizes[(215097344, False), (192, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 3 partition count [8, 8] and sizes[(215097344, False), (192, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 1 partition count [8, 8] and sizes[(215097344, False), (192, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 0 partition count [8, 8] and sizes[(215097344, False), (192, False)] \r\n" ] }, { "name": "stdout", 
"output_type": "stream", "text": [ "Rank: 7 partition count [8, 8] and sizes[(215097344, False), (192, False)] \r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Time to load utils op: 0.0005998611450195312 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0006024837493896484 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Time to load utils op: 0.0005958080291748047 seconds\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0007169246673583984 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0006222724914550781 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0011990070343017578 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension 
module utils...\r\n", "Time to load utils op: 0.0008442401885986328 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0008878707885742188 seconds\r\n", "\r\n", " | Name | Type | Params\r\n", "--------------------------------------\r\n", "0 | emb | Embedding | 205 M \r\n", "1 | blocks | ModuleList | 1.3 B \r\n", "2 | ln_out | LayerNorm | 8.2 K \r\n", "3 | head | Linear | 205 M \r\n", "--------------------------------------\r\n", "1.7 B Trainable params\r\n", "0 Non-trainable params\r\n", "1.7 B Total params\r\n", "6,883.121 Total estimated model params size (MB)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "Training: 0it [00:00, ?it/s]\r", "Training: 0%| | 0/1867 [00:00