diff --git "a/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/experiment/rwkv-x-exp/v5-headsize2x/v5-L6-D4096-E1e-1-ctx4k-part1.ipynb" "b/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/experiment/rwkv-x-exp/v5-headsize2x/v5-L6-D4096-E1e-1-ctx4k-part1.ipynb" deleted file mode 100644--- "a/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/experiment/rwkv-x-exp/v5-headsize2x/v5-L6-D4096-E1e-1-ctx4k-part1.ipynb" +++ /dev/null @@ -1,116224 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "id": "3cb7ba82", - "metadata": { - "papermill": { - "duration": 0.003253, - "end_time": "2023-08-28T18:45:25.049803", - "exception": false, - "start_time": "2023-08-28T18:45:25.046550", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "# RWKV v5-headsize2x / embedding init-range 1e-01 / 4k\n", - "\n", - "- 6 layers\n", - "- 4096 embedding size\n", - "\n", - "Going through the modified memory training for v5 models, across various initial embedding model weights\n", - "\n", - "**Note:** This project assumes you have the rwkv-infctx conda env setup" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "4aba7083", - "metadata": { - "papermill": { - "duration": 0.002594, - "end_time": "2023-08-28T18:45:25.054857", - "exception": false, - "start_time": "2023-08-28T18:45:25.052263", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "# Basic Setup" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "b74e122f", - "metadata": { - "execution": { - "iopub.execute_input": "2023-08-28T18:45:25.060868Z", - "iopub.status.busy": "2023-08-28T18:45:25.060669Z", - "iopub.status.idle": "2023-08-28T18:45:25.776404Z", - "shell.execute_reply": "2023-08-28T18:45:25.775244Z" - }, - "papermill": { - "duration": 0.720741, - "end_time": "2023-08-28T18:45:25.777978", - "exception": false, - "start_time": "2023-08-28T18:45:25.057237", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# First lets setup the various directories, and init the model\n", - "!mkdir -p ../../../../model/\n", - "!mkdir -p ../../../../datapath/\n", - "!mkdir -p ../../../../checkpoint/" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "a8b3d9f6", - "metadata": { - "execution": { - "iopub.execute_input": "2023-08-28T18:45:25.784847Z", - "iopub.status.busy": "2023-08-28T18:45:25.784172Z", - "iopub.status.idle": "2023-08-28T18:45:28.697467Z", - "shell.execute_reply": "2023-08-28T18:45:28.696485Z" - }, - "papermill": { - "duration": 2.918657, - "end_time": "2023-08-28T18:45:28.699319", - "exception": false, - "start_time": "2023-08-28T18:45:25.780662", - "status": "completed" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\r\n", - "\u001b[0m" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\r\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.1.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\r\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\r\n" - ] - } - ], - "source": [ - "# Additional dependencies for eval stuff\n", - "!pip install -q aiocsv aiofiles" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "68bfff15", - "metadata": { - "execution": { - "iopub.execute_input": "2023-08-28T18:45:28.706431Z", - "iopub.status.busy": "2023-08-28T18:45:28.706239Z", - "iopub.status.idle": "2023-08-28T18:45:28.712644Z", - "shell.execute_reply": "2023-08-28T18:45:28.711968Z" - }, - "papermill": { - "duration": 0.011435, - "end_time": "2023-08-28T18:45:28.713756", - "exception": false, - "start_time": "2023-08-28T18:45:28.702321", - "status": "completed" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "DEEPSPEED_STRAT: deepspeed_stage_1\n", - "ENABLE_WANDB: True\n", - "GPU_DEVICES: auto\n", - "NOTEBOOK_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-headsize2x\n", - "INFERENCE_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5headsize2x\n", - "TRAINER_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5headsize2x\n", - "PROJECT_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer\n" - ] - } - ], - "source": [ - "DEEPSPEED_STRAT=\"deepspeed_stage_1\"\n", - "GPU_DEVICES=\"auto\"\n", - "ENABLE_WANDB=True\n", - "\n", - "RWKV_WAVENET_LAYERS=1\n", - "\n", - "EMBED_SCALE=0.1\n", - "EMBED_SCALE_LABEL=str(EMBED_SCALE).replace(\".\", \"_\")\n", - "\n", - "LAYER_COUNT=6\n", - "EMBED_DIM=4096\n", - "\n", - "WANDB_PREFIX=f\"v5-hs2x-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE}\"\n", - "FILENAME_PREFIX=f\"v5-hs2x-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE_LABEL}\"\n", - "\n", - "print(\"DEEPSPEED_STRAT:\", DEEPSPEED_STRAT)\n", - "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n", - "print(\"GPU_DEVICES:\", GPU_DEVICES)\n", - "\n", - "if ENABLE_WANDB:\n", - " WANDB_MODE=\"online\"\n", - "else:\n", - " WANDB_MODE=\"disabled\"\n", - "\n", - "# Computing the notebook, and various paths\n", - "import os\n", - "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n", - "PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"../../../../\"))\n", - "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5headsize2x/\"))\n", - "INFERENCE_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5headsize2x/\"))\n", - "\n", - "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n", - "print(\"INFERENCE_DIR:\", INFERENCE_DIR)\n", - "print(\"TRAINER_DIR:\", TRAINER_DIR)\n", - "print(\"PROJECT_DIR:\", PROJECT_DIR)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "d6d231c3", - "metadata": { - "execution": { - "iopub.execute_input": "2023-08-28T18:45:28.719844Z", - "iopub.status.busy": "2023-08-28T18:45:28.719677Z", - "iopub.status.idle": "2023-08-28T18:46:57.146232Z", - "shell.execute_reply": "2023-08-28T18:46:57.145151Z" - }, - "papermill": { - "duration": 88.432008, - "end_time": "2023-08-28T18:46:57.148448", - "exception": false, - "start_time": "2023-08-28T18:45:28.716440", - "status": "completed" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Setting ds_accelerator to cuda (auto detect)\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n", - "---- Initializing model ----\r\n", - "No of layers: 6\r\n", - "Embedding size: 4096\r\n", - "Output model path: ../model/L6-D4096-E0_1-neox-v5base-init.pth\r\n", - "Vocab size: 50277\r\n", - "Emb scale: 0.1\r\n", - "Note: this process takes a significant time (and ram) for large models\r\n", - "---- ----- ----\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "50277 4096 -0.1 emb.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 4096 1.0 blocks.0.att.receptance.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 4096 1.0 blocks.0.att.key.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 4096 1.0 blocks.0.att.value.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 4096 0 blocks.0.att.output.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "16384 4096 1.0 blocks.0.ffn.key.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 4096 0 blocks.0.ffn.receptance.weight\r\n", - "4096 16384 0 blocks.0.ffn.value.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 4096 1.0 blocks.1.att.receptance.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 4096 1.0 blocks.1.att.key.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 4096 1.0 blocks.1.att.value.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 4096 0 blocks.1.att.output.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "16384 4096 1.0 blocks.1.ffn.key.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 4096 0 blocks.1.ffn.receptance.weight\r\n", - "4096 16384 0 blocks.1.ffn.value.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 4096 1.0 blocks.2.att.receptance.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 4096 1.0 blocks.2.att.key.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 4096 1.0 blocks.2.att.value.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 4096 0 blocks.2.att.output.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "16384 4096 1.0 blocks.2.ffn.key.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 4096 0 blocks.2.ffn.receptance.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 16384 0 blocks.2.ffn.value.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 4096 1.0 blocks.3.att.receptance.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 4096 1.0 blocks.3.att.key.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 4096 1.0 blocks.3.att.value.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 4096 0 blocks.3.att.output.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "16384 4096 1.0 blocks.3.ffn.key.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 4096 0 blocks.3.ffn.receptance.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 16384 0 blocks.3.ffn.value.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 4096 1.0 blocks.4.att.receptance.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 4096 1.0 blocks.4.att.key.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 4096 1.0 blocks.4.att.value.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 4096 0 blocks.4.att.output.weight\r\n", - "16384 4096 1.0 blocks.4.ffn.key.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 4096 0 blocks.4.ffn.receptance.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 16384 0 blocks.4.ffn.value.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 4096 1.0 blocks.5.att.receptance.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 4096 1.0 blocks.5.att.key.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 4096 1.0 blocks.5.att.value.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 4096 0 blocks.5.att.output.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "16384 4096 1.0 blocks.5.ffn.key.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 4096 0 blocks.5.ffn.receptance.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4096 16384 0 blocks.5.ffn.value.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "50277 4096 0.5 head.weight\r\n" - ] - } - ], - "source": [ - "# Init the model\n", - "!cd \"{TRAINER_DIR}\" && \\\n", - " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", - " python3 ./init_model.py \\\n", - " --n_layer \"{LAYER_COUNT}\" --n_embd \"{EMBED_DIM}\" \\\n", - " --emb-scale \"{EMBED_SCALE}\" \\\n", - " --vocab_size neox --skip-if-exists \\\n", - " \"../model/L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE_LABEL}-neox-v5base-init.pth\"" - ] - }, - { - "cell_type": "markdown", - "id": "b95d87fb", - "metadata": { - "papermill": { - "duration": 0.004479, - "end_time": "2023-08-28T18:46:57.159346", - "exception": false, - "start_time": "2023-08-28T18:46:57.154867", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Enwiki Stage 1 : Foundation 4k model training" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "fec23852", - "metadata": { - "execution": { - "iopub.execute_input": "2023-08-28T18:46:57.169965Z", - "iopub.status.busy": "2023-08-28T18:46:57.169753Z", - "iopub.status.idle": "2023-08-28T18:47:08.337785Z", - "shell.execute_reply": "2023-08-28T18:47:08.336964Z" - }, - "papermill": { - "duration": 11.175653, - "end_time": "2023-08-28T18:47:08.339555", - "exception": false, - "start_time": "2023-08-28T18:46:57.163902", - "status": "completed" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Found cached dataset parquet (/actions-runner/.cache/huggingface/datasets/teven___parquet/teven--enwiki_100k-1359e81b212c2dd6/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)\r\n", - "\r", - " 0%| | 0/1 [00:00=12.1), as this is known to have freeze issues\r\n", - "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\r\n", - "# - When resuming from checkpoint, the estimated time is inaccurate\r\n", - "#\r\n", - "LOCAL_RANK: 3 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\r\n", - "\r\n", - "[RWKV.model] Configuring optimizer with\r\n", - " - lr_init: 6.000e-04 (0.0006)\r\n", - " - lr_final: 4.000e-04 (0.0004)\r\n", - "\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Detected CUDA files, patching ldflags\r\n", - "Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/fused_adam/build.ninja...\r\n", - "Building extension module fused_adam...\r\n", - "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\r\n", - "ninja: no work to do.\r\n", - "Loading extension module fused_adam...\r\n", - "Time to load fused_adam op: 0.07090473175048828 seconds\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loading extension module fused_adam...\r\n", - "Loading extension module fused_adam...\r\n", - "Loading extension module fused_adam...\r\n", - "Loading extension module fused_adam...\r\n", - "Loading extension module fused_adam...\r\n", - "Time to load fused_adam op: 0.10126924514770508 seconds\r\n", - "Time to load fused_adam op: 0.10122466087341309 seconds\r\n", - "Loading extension module fused_adam...\r\n", - "Loading extension module fused_adam...\r\n", - "Time to load fused_adam op: 0.10141754150390625 seconds\r\n", - "Time to load fused_adam op: 0.1015462875366211 seconds\r\n", - "Time to load fused_adam op: 0.10146403312683105 seconds\r\n", - "Time to load fused_adam op: 0.10181498527526855 seconds\r\n", - "Time to load fused_adam op: 0.10191535949707031 seconds\r\n", - "Loading `train_dataloader` to estimate number of stepping batches.\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/utils/build.ninja...\r\n", - "Building extension module utils...\r\n", - "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ninja: no work to do.\r\n", - "Loading extension module utils...\r\n", - "Time to load utils op: 0.0714881420135498 seconds\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loading extension module utils...\r\n", - "Loading extension module utils...\r\n", - "Time to load utils op: 0.10209321975708008 seconds\r\n", - "Time to load utils op: 0.10218286514282227 seconds\r\n", - "Loading extension module utils...\r\n", - "Loading extension module utils...\r\n", - "Loading extension module utils...\r\n", - "Loading extension module utils...\r\n", - "Loading extension module utils...\r\n", - "Time to load utils op: 0.1021718978881836 seconds\r\n", - "Time to load utils op: 0.10290384292602539 seconds\r\n", - "Time to load utils op: 0.10239219665527344 seconds\r\n", - "Time to load utils op: 0.10289859771728516 seconds\r\n", - "Time to load utils op: 0.10258913040161133 seconds\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rank: 2 partition count [8, 8] and sizes[(215097344, False), (48, False)] \r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rank: 1 partition count [8, 8] and sizes[(215097344, False), (48, False)] \r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rank: 7 partition count [8, 8] and sizes[(215097344, False), (48, False)] \r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rank: 3 partition count [8, 8] and sizes[(215097344, False), (48, False)] \r\n", - "Rank: 5 partition count [8, 8] and sizes[(215097344, False), (48, False)] \r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rank: 4 partition count [8, 8] and sizes[(215097344, False), (48, False)] \r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rank: 6 partition count [8, 8] and sizes[(215097344, False), (48, False)] \r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rank: 0 partition count [8, 8] and sizes[(215097344, False), (48, False)] \r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", - "Loading extension module utils...\r\n", - "Time to load utils op: 0.0005891323089599609 seconds\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", - "Loading extension module utils...\r\n", - "Time to load utils op: 0.00074005126953125 seconds\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", - "Loading extension module utils...\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "Time to load utils op: 0.0006580352783203125 seconds\r\n", - "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", - "Loading extension module utils...\r\n", - "Time to load utils op: 0.0008976459503173828 seconds\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", - "Loading extension module utils...\r\n", - "Time to load utils op: 0.0006477832794189453 seconds\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", - "Loading extension module utils...\r\n", - "Time to load utils op: 0.0007245540618896484 seconds\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", - "Loading extension module utils...\r\n", - "Time to load utils op: 0.0014486312866210938 seconds\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", - "Loading extension module utils...\r\n", - "Time to load utils op: 0.0009706020355224609 seconds\r\n", - "\r\n", - " | Name | Type | Params\r\n", - "--------------------------------------\r\n", - "0 | emb | Embedding | 205 M \r\n", - "1 | blocks | ModuleList | 1.3 B \r\n", - "2 | ln_out | LayerNorm | 8.2 K \r\n", - "3 | head | Linear | 205 M \r\n", - "--------------------------------------\r\n", - "1.7 B Trainable params\r\n", - "0 Non-trainable params\r\n", - "1.7 B Total params\r\n", - "6,883.117 Total estimated model params size (MB)\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\r", - "Training: 0it [00:00, ?it/s]\r", - "Training: 0%| | 0/10186 [00:00\r\n", - " asyncio.run(main_function())\r\n", - " File \"/usr/lib/python3.11/asyncio/runners.py\", line 190, in run\r\n", - " return runner.run(main)\r\n", - " ^^^^^^^^^^^^^^^^\r\n", - " File \"/usr/lib/python3.11/asyncio/runners.py\", line 118, in run\r\n", - " return self._loop.run_until_complete(task)\r\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", - " File \"/usr/lib/python3.11/asyncio/base_events.py\", line 653, in run_until_complete\r\n", - " return future.result()\r\n", - " ^^^^^^^^^^^^^^^\r\n", - " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-headsize2x/../memory_script/eval_v5_memory_guided.py\", line 58, in main_function\r\n", - " model = SimpleRWKV(model_path, device=\"cuda\")\r\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", - " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 1378, in __init__\r\n", - " self.model = RWKV(**model_config)\r\n", - " ^^^^^^^^^^^^^^^^^^^^\r\n", - " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 650, in __init__\r\n", - " self.load_state_dict(model_weights)\r\n", - " File \"/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py\", line 2041, in load_state_dict\r\n", - " raise RuntimeError('Error(s) in loading state_dict for {}:\\n\\t{}'.format(\r\n", - "RuntimeError: Error(s) in loading state_dict for RWKV:\r\n", - "\tsize mismatch for blocks.0.att.time_decay: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", - "\tsize mismatch for blocks.0.att.time_faaaa: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", - "\tsize mismatch for blocks.1.att.time_decay: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", - "\tsize mismatch for blocks.1.att.time_faaaa: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", - "\tsize mismatch for blocks.2.att.time_decay: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", - "\tsize mismatch for blocks.2.att.time_faaaa: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", - "\tsize mismatch for blocks.3.att.time_decay: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", - "\tsize mismatch for blocks.3.att.time_faaaa: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", - "\tsize mismatch for blocks.4.att.time_decay: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", - "\tsize mismatch for blocks.4.att.time_faaaa: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", - "\tsize mismatch for blocks.5.att.time_decay: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", - "\tsize mismatch for blocks.5.att.time_faaaa: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n" - ] - } - ], - "source": [ - "# Lets do a quick memory test\n", - "!export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", - " python3 ../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-enwiki-4k.pth\"" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "1e197c3b", - "metadata": { - "papermill": { - "duration": 0.584635, - "end_time": "2023-08-29T01:43:14.328059", - "exception": false, - "start_time": "2023-08-29T01:43:13.743424", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "# Enwiki Stage 2 : Basic Instruct Tuning" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "9f645ad6", - "metadata": { - "execution": { - "iopub.execute_input": "2023-08-29T01:43:15.622056Z", - "iopub.status.busy": "2023-08-29T01:43:15.621812Z", - "iopub.status.idle": "2023-08-29T01:43:22.600320Z", - "shell.execute_reply": "2023-08-29T01:43:22.599545Z" - }, - "papermill": { - "duration": 7.694583, - "end_time": "2023-08-29T01:43:22.602460", - "exception": false, - "start_time": "2023-08-29T01:43:14.907877", - "status": "completed" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Found cached dataset parquet (/actions-runner/.cache/huggingface/datasets/c-s-ale___parquet/c-s-ale--dolly-15k-instruction-alpaca-format-9dfbb23260d63d9d/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)\r\n", - "\r", - " 0%| | 0/1 [00:00=12.1), as this is known to have freeze issues\r\n", - "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\r\n", - "# - When resuming from checkpoint, the estimated time is inaccurate\r\n", - "#\r\n", - "\r\n", - "[RWKV.model] Configuring optimizer with\r\n", - " - lr_init: 4.000e-04 (0.0004)\r\n", - " - lr_final: 3.000e-04 (0.0003)\r\n", - "\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Detected CUDA files, patching ldflags\r\n", - "Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/fused_adam/build.ninja...\r\n", - "Building extension module fused_adam...\r\n", - "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\r\n", - "ninja: no work to do.\r\n", - "Loading extension module fused_adam...\r\n", - "Time to load fused_adam op: 0.07155156135559082 seconds\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loading extension module fused_adam...\r\n", - "Loading extension module fused_adam...\r\n", - "Loading extension module fused_adam...\r\n", - "Loading extension module fused_adam...\r\n", - "Loading extension module fused_adam...\r\n", - "Loading extension module fused_adam...\r\n", - "Loading extension module fused_adam...\r\n", - "Time to load fused_adam op: 0.10176849365234375 seconds\r\n", - "Time to load fused_adam op: 0.10148835182189941 seconds\r\n", - "Time to load fused_adam op: 0.1015634536743164 seconds\r\n", - "Time to load fused_adam op: 0.10209345817565918 seconds\r\n", - "Time to load fused_adam op: 0.10170292854309082 seconds\r\n", - "Time to load fused_adam op: 0.10150504112243652 seconds\r\n", - "Time to load fused_adam op: 0.10160398483276367 seconds\r\n", - "Loading `train_dataloader` to estimate number of stepping batches.\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/utils/build.ninja...\r\n", - "Building extension module utils...\r\n", - "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\r\n", - "ninja: no work to do.\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loading extension module utils...\r\n", - "Time to load utils op: 0.07130551338195801 seconds\r\n", - "Loading extension module utils...\r\n", - "Loading extension module utils...\r\n", - "Time to load utils op: 0.10222482681274414 seconds\r\n", - "Time to load utils op: 0.10239815711975098 seconds\r\n", - "Loading extension module utils...\r\n", - "Loading extension module utils...\r\n", - "Time to load utils op: 0.10255551338195801 seconds\r\n", - "Loading extension module utils...\r\n", - "Time to load utils op: 0.1025240421295166 seconds\r\n", - "Loading extension module utils...\r\n", - "Time to load utils op: 0.10241293907165527 seconds\r\n", - "Loading extension module utils...\r\n", - "Time to load utils op: 0.10340571403503418 seconds\r\n", - "Time to load utils op: 0.1028285026550293 seconds\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rank: 6 partition count [8, 8] and sizes[(215097344, False), (48, False)] \r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rank: 4 partition count [8, 8] and sizes[(215097344, False), (48, False)] \r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rank: 1 partition count [8, 8] and sizes[(215097344, False), (48, False)] \r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rank: 3 partition count [8, 8] and sizes[(215097344, False), (48, False)] \r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rank: 0 partition count [8, 8] and sizes[(215097344, False), (48, False)] \r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rank: 7 partition count [8, 8] and sizes[(215097344, False), (48, False)] \r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rank: 5 partition count [8, 8] and sizes[(215097344, False), (48, False)] \r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rank: 2 partition count [8, 8] and sizes[(215097344, False), (48, False)] \r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", - "Loading extension module utils...\r\n", - "Time to load utils op: 0.0006301403045654297 seconds\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", - "Loading extension module utils...\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "Time to load utils op: 0.0006008148193359375 seconds\r\n", - "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", - "Loading extension module utils...\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "Time to load utils op: 0.0006322860717773438 seconds\r\n", - "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", - "Loading extension module utils...\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "Time to load utils op: 0.0006577968597412109 seconds\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", - "Loading extension module utils...\r\n", - "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", - "Loading extension module utils...\r\n", - "Time to load utils op: 0.0010302066802978516 seconds\r\n", - "Time to load utils op: 0.0007059574127197266 seconds\r\n", - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", - "Loading extension module utils...\r\n", - "Time to load utils op: 0.001132965087890625 seconds\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", - "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", - "Loading extension module utils...\r\n", - "Time to load utils op: 0.0008440017700195312 seconds\r\n", - "\r\n", - " | Name | Type | Params\r\n", - "--------------------------------------\r\n", - "0 | emb | Embedding | 205 M \r\n", - "1 | blocks | ModuleList | 1.3 B \r\n", - "2 | ln_out | LayerNorm | 8.2 K \r\n", - "3 | head | Linear | 205 M \r\n", - "--------------------------------------\r\n", - "1.7 B Trainable params\r\n", - "0 Non-trainable params\r\n", - "1.7 B Total params\r\n", - "6,883.117 Total estimated model params size (MB)\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\r", - "Training: 0it [00:00, ?it/s]\r", - "Training: 0%| | 0/1867 [00:00\r\n", - " asyncio.run(main_function())\r\n", - " File \"/usr/lib/python3.11/asyncio/runners.py\", line 190, in run\r\n", - " return runner.run(main)\r\n", - " ^^^^^^^^^^^^^^^^\r\n", - " File \"/usr/lib/python3.11/asyncio/runners.py\", line 118, in run\r\n", - " return self._loop.run_until_complete(task)\r\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", - " File \"/usr/lib/python3.11/asyncio/base_events.py\", line 653, in run_until_complete\r\n", - " return future.result()\r\n", - " ^^^^^^^^^^^^^^^\r\n", - " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-headsize2x/../memory_script/eval_v5_memory_guided.py\", line 58, in main_function\r\n", - " model = SimpleRWKV(model_path, device=\"cuda\")\r\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", - " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 1378, in __init__\r\n", - " self.model = RWKV(**model_config)\r\n", - " ^^^^^^^^^^^^^^^^^^^^\r\n", - " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 650, in __init__\r\n", - " self.load_state_dict(model_weights)\r\n", - " File \"/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py\", line 2041, in load_state_dict\r\n", - " raise RuntimeError('Error(s) in loading state_dict for {}:\\n\\t{}'.format(\r\n", - "RuntimeError: Error(s) in loading state_dict for RWKV:\r\n", - "\tsize mismatch for blocks.0.att.time_decay: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", - "\tsize mismatch for blocks.0.att.time_faaaa: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", - "\tsize mismatch for blocks.1.att.time_decay: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", - "\tsize mismatch for blocks.1.att.time_faaaa: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", - "\tsize mismatch for blocks.2.att.time_decay: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", - "\tsize mismatch for blocks.2.att.time_faaaa: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", - "\tsize mismatch for blocks.3.att.time_decay: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", - "\tsize mismatch for blocks.3.att.time_faaaa: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", - "\tsize mismatch for blocks.4.att.time_decay: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", - "\tsize mismatch for blocks.4.att.time_faaaa: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", - "\tsize mismatch for blocks.5.att.time_decay: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", - "\tsize mismatch for blocks.5.att.time_faaaa: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n" - ] - } - ], - "source": [ - "# Lets do a quick memory test\n", - "!export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", - " python3 ../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-enwiki-instruct.pth\"" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.4" - }, - "papermill": { - "default_parameters": {}, - "duration": 28184.999752, - "end_time": "2023-08-29T02:35:09.072726", - "environment_variables": {}, - "exception": null, - "input_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-headsize2x/v5-L6-D4096-E1e-1-ctx4k-part1.ipynb", - "output_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/experiment/rwkv-x-exp/v5-headsize2x/v5-L6-D4096-E1e-1-ctx4k-part1.ipynb", - "parameters": {}, - "start_time": "2023-08-28T18:45:24.072974", - "version": "2.4.0" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file