{ "cells": [ { "attachments": {}, "cell_type": "markdown", "id": "896634be", "metadata": { "papermill": { "duration": 0.003834, "end_time": "2023-08-26T10:50:44.293199", "exception": false, "start_time": "2023-08-26T10:50:44.289365", "status": "completed" }, "tags": [] }, "source": [ "# RWKV v5-headsize32 / embedding init-range 1e-01 / 4k\n", "\n", "- 6 layers\n", "- 4096 embedding size\n", "\n", "Going through the modified memory training for v5 models, across various initial embedding model weights.\n", "\n", "**Note:** This project assumes you have the rwkv-infctx conda env set up." ] }, { "attachments": {}, "cell_type": "markdown", "id": "d84fe08b", "metadata": { "papermill": { "duration": 0.002275, "end_time": "2023-08-26T10:50:44.297946", "exception": false, "start_time": "2023-08-26T10:50:44.295671", "status": "completed" }, "tags": [] }, "source": [ "# Basic Setup" ] }, { "cell_type": "code", "execution_count": 1, "id": "cfbcf7a9", "metadata": { "execution": { "iopub.execute_input": "2023-08-26T10:50:44.304015Z", "iopub.status.busy": "2023-08-26T10:50:44.303474Z", "iopub.status.idle": "2023-08-26T10:50:45.020477Z", "shell.execute_reply": "2023-08-26T10:50:45.019638Z" }, "papermill": { "duration": 0.722053, "end_time": "2023-08-26T10:50:45.022408", "exception": false, "start_time": "2023-08-26T10:50:44.300355", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# First, let's set up the various directories and init the model\n", "!mkdir -p ../../../../model/\n", "!mkdir -p ../../../../datapath/\n", "!mkdir -p ../../../../checkpoint/" ] }, { "cell_type": "code", "execution_count": 2, "id": "023aaaac", "metadata": { "execution": { "iopub.execute_input": "2023-08-26T10:50:45.028347Z", "iopub.status.busy": "2023-08-26T10:50:45.028130Z", "iopub.status.idle": "2023-08-26T10:50:47.855321Z", "shell.execute_reply": "2023-08-26T10:50:47.854667Z" }, "papermill": { "duration": 2.83269, "end_time": "2023-08-26T10:50:47.857531", "exception": false,
"start_time": "2023-08-26T10:50:45.024841", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\r\n", "\u001b[0m" ] } ], "source": [ "# Additional dependencies for eval stuff\n", "!pip install -q aiocsv aiofiles" ] }, }, "papermill": { "duration": 0.010544, "end_time": "2023-08-26T10:50:47.871440", "exception": false, "start_time": "2023-08-26T10:50:47.860896", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DEEPSPEED_STRAT: deepspeed_stage_1\n", "ENABLE_WANDB: True\n", "GPU_DEVICES: auto\n", "NOTEBOOK_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-headsize32\n", "INFERENCE_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5headsize32\n", "TRAINER_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5headsize32\n", "PROJECT_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer\n" ] } ], "source": [ "DEEPSPEED_STRAT=\"deepspeed_stage_1\"\n", "GPU_DEVICES=\"auto\"\n", "ENABLE_WANDB=True\n", "\n", "RWKV_WAVENET_LAYERS=1\n", "\n", "EMBED_SCALE=0.1\n", "EMBED_SCALE_LABEL=str(EMBED_SCALE).replace(\".\", \"_\")\n", "\n", "LAYER_COUNT=6\n", "EMBED_DIM=4096\n", "\n", "WANDB_PREFIX=f\"v5-hs32-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE}\"\n", "FILENAME_PREFIX=f\"v5-hs32-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE_LABEL}\"\n", "\n", "print(\"DEEPSPEED_STRAT:\", DEEPSPEED_STRAT)\n", "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n", "print(\"GPU_DEVICES:\", GPU_DEVICES)\n", "\n", "if ENABLE_WANDB:\n", " WANDB_MODE=\"online\"\n", "else:\n", " WANDB_MODE=\"disabled\"\n", "\n", "# Computing the notebook, and various 
paths\n", "import os\n", "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n", "PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"../../../../\"))\n", "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5headsize32/\"))\n", "INFERENCE_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5headsize32/\"))\n", "\n", "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n", "print(\"INFERENCE_DIR:\", INFERENCE_DIR)\n", "print(\"TRAINER_DIR:\", TRAINER_DIR)\n", "print(\"PROJECT_DIR:\", PROJECT_DIR)" ] }, { "cell_type": "code", "execution_count": 4, "id": "e62f03e8", "metadata": { "execution": { "iopub.execute_input": "2023-08-26T10:50:47.877565Z", "iopub.status.busy": "2023-08-26T10:50:47.877406Z", "iopub.status.idle": "2023-08-26T10:52:17.326298Z", "shell.execute_reply": "2023-08-26T10:52:17.324780Z" }, "papermill": { "duration": 89.454427, "end_time": "2023-08-26T10:52:17.328534", "exception": false, "start_time": "2023-08-26T10:50:47.874107", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Setting ds_accelerator to cuda (auto detect)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n", "---- Initializing model ----\r\n", "No of layers: 6\r\n", "Embedding size: 4096\r\n", "Output model path: ../model/v5-hs32-L6-D4096-E0_1-neox-init.pth\r\n", "Vocab size: 50277\r\n", "Emb scale: 0.1\r\n", "Note: this process takes a significant time (and ram) for large models\r\n", "---- ----- ----\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "50277 4096 -0.1 emb.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.0.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.0.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.0.att.value.weight\r\n" ] }, 
{ "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.0.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "16384 4096 1.0 blocks.0.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.0.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 16384 0 blocks.0.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.1.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.1.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.1.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.1.att.output.weight\r\n", "16384 4096 1.0 blocks.1.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.1.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 16384 0 blocks.1.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.2.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.2.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.2.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.2.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "16384 4096 1.0 blocks.2.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.2.ffn.receptance.weight\r\n", "4096 16384 0 blocks.2.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.3.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 
blocks.3.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.3.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.3.att.output.weight\r\n", "16384 4096 1.0 blocks.3.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.3.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 16384 0 blocks.3.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.4.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.4.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.4.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.4.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "16384 4096 1.0 blocks.4.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.4.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 16384 0 blocks.4.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.5.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.5.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.5.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.5.att.output.weight\r\n", "16384 4096 1.0 blocks.5.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.5.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 16384 0 blocks.5.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "50277 
4096 0.5 head.weight\r\n" ] } ], "source": [ "# Init the model\n", "!cd \"{TRAINER_DIR}\" && \\\n", " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", " python3 ./init_model.py \\\n", " --n_layer \"{LAYER_COUNT}\" --n_embd \"{EMBED_DIM}\" \\\n", " --emb-scale \"{EMBED_SCALE}\" \\\n", " --vocab_size neox --skip-if-exists \\\n", " \"../model/{FILENAME_PREFIX}-neox-init.pth\"" ] }, { "cell_type": "markdown", "id": "ffe9086d", "metadata": { "papermill": { "duration": 0.004561, "end_time": "2023-08-26T10:52:17.339630", "exception": false, "start_time": "2023-08-26T10:52:17.335069", "status": "completed" }, "tags": [] }, "source": [ "## Enwiki Stage 1 : Foundation 4k model training" ] }, { "cell_type": "code", "execution_count": 5, "id": "666731cf", "metadata": { "execution": { "iopub.execute_input": "2023-08-26T10:52:17.351244Z", "iopub.status.busy": "2023-08-26T10:52:17.350990Z", "iopub.status.idle": "2023-08-26T10:52:29.271727Z", "shell.execute_reply": "2023-08-26T10:52:29.270864Z" }, "papermill": { "duration": 11.929867, "end_time": "2023-08-26T10:52:29.274103", "exception": false, "start_time": "2023-08-26T10:52:17.344236", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Found cached dataset parquet (/actions-runner/.cache/huggingface/datasets/teven___parquet/teven--enwiki_100k-1359e81b212c2dd6/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)\r\n", "\r", " 0%| | 0/1 [00:00=12.1), as this is known to have freeze issues\r\n", "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\r\n", "# - When resuming from checkpoint, the estimated time is inaccurate\r\n", "#LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\r\n", "\r\n", "\r\n", "[RWKV.model] Configuring optimizer with\r\n", " - lr_init: 6.000e-04 (0.0006)\r\n", " - lr_final: 4.000e-04 (0.0004)\r\n", "\r\n", "Using 
/root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Detected CUDA files, patching ldflags\r\n", "Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/fused_adam/build.ninja...\r\n", "Building extension module fused_adam...\r\n", "Allowing ninja to set a default number of workers... ninja: no work to do.\r\n", "Loading extension module fused_adam...\r\n", "Time to load fused_adam op: 0.07281303405761719 seconds\r\n", "Loading extension module fused_adam...\r\n", "Time to load fused_adam op: 0.10150337219238281 seconds\r\n", "Loading `train_dataloader` to estimate number of stepping batches.\r\n" ninja: no work to do.\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.07712340354919434 seconds\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.10222005844116211 seconds seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 6 partition count [8, 8] and sizes[(215097344, False), (192, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 4 partition count [8, 8] and sizes[(215097344, False), (192, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 2 partition count [8, 8] and sizes[(215097344, False), (192, False)] \r\n" ] }, { "name": "stdout", "output_type": 
"stream", "text": [ "Rank: 5 partition count [8, 8] and sizes[(215097344, False), (192, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 3 partition count [8, 8] and sizes[(215097344, False), (192, False)] \r\n", "Rank: 7 partition count [8, 8] and sizes[(215097344, False), (192, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 0 partition count [8, 8] and sizes[(215097344, False), (192, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 1 partition count [8, 8] and sizes[(215097344, False), (192, False)] \r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0006635189056396484 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0006933212280273438 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0005958080291748047 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0005662441253662109 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0005939006805419922 
seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0006010532379150391 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0006580352783203125 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.00098419189453125 seconds\r\n", "\r\n", " | Name | Type | Params\r\n", "--------------------------------------\r\n", "0 | emb | Embedding | 205 M \r\n", "1 | blocks | ModuleList | 1.3 B \r\n", "2 | ln_out | LayerNorm | 8.2 K \r\n", "3 | head | Linear | 205 M \r\n", "--------------------------------------\r\n", "1.7 B Trainable params\r\n", "0 Non-trainable params\r\n", "1.7 B Total params\r\n", "6,883.121 Total estimated model params size (MB)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "Training: 0it [00:00, ?it/s]\r", "Training: 0%| | 0/10186 [00:00\r\n", " asyncio.run(main_function())\r\n", " File \"/usr/lib/python3.11/asyncio/runners.py\", line 190, in run\r\n", " return runner.run(main)\r\n", " ^^^^^^^^^^^^^^^^\r\n", " File \"/usr/lib/python3.11/asyncio/runners.py\", line 118, in run\r\n", " return self._loop.run_until_complete(task)\r\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", " File \"/usr/lib/python3.11/asyncio/base_events.py\", line 653, in run_until_complete\r\n", " return future.result()\r\n", " ^^^^^^^^^^^^^^^\r\n", " File 
\"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-headsize32/../memory_script/eval_v5_memory_guided.py\", line 58, in main_function\r\n", " model = SimpleRWKV(model_path, device=\"cuda\")\r\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 1378, in __init__\r\n", " self.model = RWKV(**model_config)\r\n", " ^^^^^^^^^^^^^^^^^^^^\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 650, in __init__\r\n", " self.load_state_dict(model_weights)\r\n", " File \"/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py\", line 2041, in load_state_dict\r\n", " raise RuntimeError('Error(s) in loading state_dict for {}:\\n\\t{}'.format(\r\n", "RuntimeError: Error(s) in loading state_dict for RWKV:\r\n", "\tsize mismatch for blocks.0.att.time_decay: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.0.att.time_faaaa: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.1.att.time_decay: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.1.att.time_faaaa: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.2.att.time_decay: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.2.att.time_faaaa: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.3.att.time_decay: copying a param with shape torch.Size([128]) from checkpoint, the 
shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.3.att.time_faaaa: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.4.att.time_decay: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.4.att.time_faaaa: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.5.att.time_decay: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.5.att.time_faaaa: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).\r\n" ] } ], "source": [ "# Let's do a quick memory test\n", "# FIXME: the recorded run of this cell failed with a state_dict size mismatch. Per the traceback,\n", "# eval_v5_memory_guided.py built the model from RWKV-v5/src/model.py (att.time_decay / att.time_faaaa of size 64),\n", "# while this checkpoint stores size 128. The eval presumably needs the RWKV-v5headsize32 model code - TODO confirm.\n", "!export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", " python3 ../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-enwiki-4k.pth\"" ] }, { "attachments": {}, "cell_type": "markdown", "id": "56b0cbbd", "metadata": { "papermill": { "duration": 0.559214, "end_time": "2023-08-26T18:35:12.774022", "exception": false, "start_time": "2023-08-26T18:35:12.214808", "status": "completed" }, "tags": [] }, "source": [ "# Enwiki Stage 2 : Basic Instruct Tuning" ] }, { "cell_type": "code", "execution_count": 10, "id": "866eecec", "metadata": { "execution": { "iopub.execute_input": "2023-08-26T18:35:13.981059Z", "iopub.status.busy": "2023-08-26T18:35:13.980635Z", "iopub.status.idle": "2023-08-26T18:35:20.814986Z", "shell.execute_reply": "2023-08-26T18:35:20.814067Z" }, "papermill": { "duration": 7.452047, "end_time": "2023-08-26T18:35:20.816515", "exception": false, "start_time": "2023-08-26T18:35:13.364468", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Found cached dataset parquet
(/actions-runner/.cache/huggingface/datasets/c-s-ale___parquet/c-s-ale--dolly-15k-instruction-alpaca-format-9dfbb23260d63d9d/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)\r\n", "\r", " 0%| | 0/1 [00:00=12.1), as this is known to have freeze issues\r\n", "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\r\n", "# - When resuming from checkpoint, the estimated time is inaccurate\r\n", "#LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\r\n", "\r\n", "\r\n", "[RWKV.model] Configuring optimizer with\r\n", " - lr_init: 4.000e-04 (0.0004)\r\n", " - lr_final: 3.000e-04 (0.0003)\r\n", "\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Detected CUDA files, patching ldflags\r\n", "Emitting ninja Detected CUDA files, patching ldflags\r\n", "Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/fused_adam/build.ninja...\r\n", "Building extension module fused_adam...\r\n", "Allowing ninja to set a default number of workers... 
ninja: no work to do.\r\n", "Loading extension module fused_adam...\r\n", "Time to load fused_adam op: 0.06924247741699219 seconds\r\n", "Loading extension module fused_adam...\r\n", "Time to load fused_adam op: 0.10156655311584473 seconds\r\n", "Loading `train_dataloader` to estimate number of stepping batches.\r\n" ninja: no work to do.\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.07047772407531738 seconds\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.10281085968017578 seconds