{ "cells": [ { "attachments": {}, "cell_type": "markdown", "id": "55f3ffab", "metadata": { "papermill": { "duration": 0.003272, "end_time": "2023-08-23T11:26:20.527556", "exception": false, "start_time": "2023-08-23T11:26:20.524284", "status": "completed" }, "tags": [] }, "source": [ "# RWKV v5-headsize2x / embedding init-range 1e-01 / 4k\n", "\n", "- 6 layers\n", "- 4096 embedding size\n", "\n", "Going through the modified memory training for v5 models, across various initial embedding model weights\n", "\n", "**Note:** This project assumes you have the rwkv-infctx conda env setup" ] }, { "attachments": {}, "cell_type": "markdown", "id": "c6113b1c", "metadata": { "papermill": { "duration": 0.002255, "end_time": "2023-08-23T11:26:20.532263", "exception": false, "start_time": "2023-08-23T11:26:20.530008", "status": "completed" }, "tags": [] }, "source": [ "# Basic Setup" ] }, { "cell_type": "code", "execution_count": 1, "id": "f344ac2b", "metadata": { "execution": { "iopub.execute_input": "2023-08-23T11:26:20.538364Z", "iopub.status.busy": "2023-08-23T11:26:20.537725Z", "iopub.status.idle": "2023-08-23T11:26:21.258042Z", "shell.execute_reply": "2023-08-23T11:26:21.257050Z" }, "papermill": { "duration": 0.725211, "end_time": "2023-08-23T11:26:21.259859", "exception": false, "start_time": "2023-08-23T11:26:20.534648", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# First lets setup the various directories, and init the model\n", "!mkdir -p ../../../../model/\n", "!mkdir -p ../../../../datapath/\n", "!mkdir -p ../../../../checkpoint/" ] }, { "cell_type": "code", "execution_count": 2, "id": "51cbdb70", "metadata": { "execution": { "iopub.execute_input": "2023-08-23T11:26:21.266726Z", "iopub.status.busy": "2023-08-23T11:26:21.265933Z", "iopub.status.idle": "2023-08-23T11:26:24.123123Z", "shell.execute_reply": "2023-08-23T11:26:24.122216Z" }, "papermill": { "duration": 2.862476, "end_time": "2023-08-23T11:26:24.124834", "exception": false, 
"start_time": "2023-08-23T11:26:21.262358", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\r\n", "\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.1.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\r\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\r\n" ] } ], "source": [ "# Additional dependencies for eval stuff\n", "!pip install -q aiocsv aiofiles" ] }, { "cell_type": "code", "execution_count": 3, "id": "ecfd37da", "metadata": { "execution": { "iopub.execute_input": "2023-08-23T11:26:24.131992Z", "iopub.status.busy": "2023-08-23T11:26:24.131798Z", "iopub.status.idle": "2023-08-23T11:26:24.138184Z", "shell.execute_reply": "2023-08-23T11:26:24.137623Z" }, "papermill": { "duration": 0.011361, "end_time": "2023-08-23T11:26:24.139306", "exception": false, "start_time": "2023-08-23T11:26:24.127945", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DEEPSPEED_STRAT: deepspeed_stage_2_offload\n", "ENABLE_WANDB: True\n", "GPU_DEVICES: auto\n", "NOTEBOOK_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-headsize2x\n", "INFERENCE_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5headsize2x\n", "TRAINER_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5headsize2x\n", "PROJECT_DIR: 
/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer\n" ] } ], "source": [ "DEEPSPEED_STRAT=\"deepspeed_stage_2_offload\"\n", "GPU_DEVICES=\"auto\"\n", "ENABLE_WANDB=True\n", "\n", "RWKV_WAVENET_LAYERS=1\n", "\n", "EMBED_SCALE=0.1\n", "EMBED_SCALE_LABEL=str(EMBED_SCALE).replace(\".\", \"_\")\n", "\n", "LAYER_COUNT=6\n", "EMBED_DIM=4096\n", "\n", "WANDB_PREFIX=f\"v5-hs2x-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE}\"\n", "FILENAME_PREFIX=f\"v5-hs2x-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE_LABEL}\"\n", "\n", "print(\"DEEPSPEED_STRAT:\", DEEPSPEED_STRAT)\n", "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n", "print(\"GPU_DEVICES:\", GPU_DEVICES)\n", "\n", "if ENABLE_WANDB:\n", " WANDB_MODE=\"online\"\n", "else:\n", " WANDB_MODE=\"disabled\"\n", "\n", "# Computing the notebook, and various paths\n", "import os\n", "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n", "PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"../../../../\"))\n", "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5headsize2x/\"))\n", "INFERENCE_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5headsize2x/\"))\n", "\n", "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n", "print(\"INFERENCE_DIR:\", INFERENCE_DIR)\n", "print(\"TRAINER_DIR:\", TRAINER_DIR)\n", "print(\"PROJECT_DIR:\", PROJECT_DIR)" ] }, { "cell_type": "code", "execution_count": 4, "id": "f22bb0a5", "metadata": { "execution": { "iopub.execute_input": "2023-08-23T11:26:24.145509Z", "iopub.status.busy": "2023-08-23T11:26:24.145097Z", "iopub.status.idle": "2023-08-23T11:27:52.704301Z", "shell.execute_reply": "2023-08-23T11:27:52.703060Z" }, "papermill": { "duration": 88.564748, "end_time": "2023-08-23T11:27:52.706654", "exception": false, "start_time": "2023-08-23T11:26:24.141906", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Setting ds_accelerator to cuda (auto detect)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ 
"[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n", "---- Initializing model ----\r\n", "No of layers: 6\r\n", "Embedding size: 4096\r\n", "Output model path: ../model/L6-D4096-E0_1-neox-v5base-init.pth\r\n", "Vocab size: 50277\r\n", "Emb scale: 0.1\r\n", "Note: this process takes a significant time (and ram) for large models\r\n", "---- ----- ----\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "50277 4096 -0.1 emb.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.0.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.0.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.0.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.0.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "16384 4096 1.0 blocks.0.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.0.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 16384 0 blocks.0.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.1.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.1.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.1.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.1.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "16384 4096 1.0 blocks.1.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.1.ffn.receptance.weight\r\n", "4096 16384 0 blocks.1.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 
blocks.2.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.2.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.2.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.2.att.output.weight\r\n", "16384 4096 1.0 blocks.2.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.2.ffn.receptance.weight\r\n", "4096 16384 0 blocks.2.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.3.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.3.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.3.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.3.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "16384 4096 1.0 blocks.3.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.3.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 16384 0 blocks.3.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.4.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.4.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.4.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.4.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "16384 4096 1.0 blocks.4.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.4.ffn.receptance.weight\r\n", "4096 16384 0 blocks.4.ffn.value.weight\r\n" ] }, { "name": 
"stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.5.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.5.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 1.0 blocks.5.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.5.att.output.weight\r\n", "16384 4096 1.0 blocks.5.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 4096 0 blocks.5.ffn.receptance.weight\r\n", "4096 16384 0 blocks.5.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "50277 4096 0.5 head.weight\r\n" ] } ], "source": [ "# Init the model\n", "!cd \"{TRAINER_DIR}\" && \\\n", " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", " python3 ./init_model.py \\\n", " --n_layer 6 --n_embd 4096 \\\n", " --emb-scale \"{EMBED_SCALE}\" \\\n", " --vocab_size neox --skip-if-exists \\\n", " \"../model/L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE_LABEL}-neox-v5base-init.pth\"" ] }, { "cell_type": "markdown", "id": "f5c0a0c0", "metadata": { "papermill": { "duration": 0.004372, "end_time": "2023-08-23T11:27:52.717486", "exception": false, "start_time": "2023-08-23T11:27:52.713114", "status": "completed" }, "tags": [] }, "source": [ "## Enwiki Stage 1 : Foundation 4k model training" ] }, { "cell_type": "code", "execution_count": 5, "id": "60fd4eca", "metadata": { "execution": { "iopub.execute_input": "2023-08-23T11:27:52.728245Z", "iopub.status.busy": "2023-08-23T11:27:52.727830Z", "iopub.status.idle": "2023-08-23T11:29:37.858683Z", "shell.execute_reply": "2023-08-23T11:29:37.857848Z" }, "papermill": { "duration": 105.138816, "end_time": "2023-08-23T11:29:37.860763", "exception": false, "start_time": "2023-08-23T11:27:52.721947", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Traceback (most recent call last):\r\n", " 
File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5headsize2x/preload_datapath.py\", line 37, in \r\n", " dataMod.prepare_data()\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5headsize2x/src/data.py\", line 465, in prepare_data\r\n", " prepare_data_static(**self._init_locals)\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5headsize2x/src/data.py\", line 94, in prepare_data_static\r\n", " src_dataset = load_dataset(**load_dataset_params)\r\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", " File \"/usr/local/lib/python3.11/dist-packages/datasets/load.py\", line 1785, in load_dataset\r\n", " builder_instance = load_dataset_builder(\r\n", " ^^^^^^^^^^^^^^^^^^^^^\r\n", " File \"/usr/local/lib/python3.11/dist-packages/datasets/load.py\", line 1514, in load_dataset_builder\r\n", " dataset_module = dataset_module_factory(\r\n", " ^^^^^^^^^^^^^^^^^^^^^^^\r\n", " File \"/usr/local/lib/python3.11/dist-packages/datasets/load.py\", line 1231, in dataset_module_factory\r\n", " raise e1 from None\r\n", " File \"/usr/local/lib/python3.11/dist-packages/datasets/load.py\", line 1198, in dataset_module_factory\r\n", " raise e\r\n", " File \"/usr/local/lib/python3.11/dist-packages/datasets/load.py\", line 1172, in dataset_module_factory\r\n", " dataset_info = hf_api.dataset_info(\r\n", " ^^^^^^^^^^^^^^^^^^^^\r\n", " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_validators.py\", line 118, in _inner_fn\r\n", " return fn(*args, **kwargs)\r\n", " ^^^^^^^^^^^^^^^^^^^\r\n", " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 1738, in dataset_info\r\n", " r = get_session().get(path, headers=headers, timeout=timeout, params=params)\r\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", " File \"/usr/local/lib/python3.11/dist-packages/requests/sessions.py\", line 602, in get\r\n", " return self.request(\"GET\", 
url, **kwargs)\r\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", " File \"/usr/local/lib/python3.11/dist-packages/requests/sessions.py\", line 589, in request\r\n", " resp = self.send(prep, **send_kwargs)\r\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", " File \"/usr/local/lib/python3.11/dist-packages/requests/sessions.py\", line 703, in send\r\n", " r = adapter.send(request, **kwargs)\r\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", " File \"/usr/local/lib/python3.11/dist-packages/requests/adapters.py\", line 532, in send\r\n", " raise ReadTimeout(e, request=request)\r\n", "requests.exceptions.ReadTimeout: HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=100.0)\r\n" ] } ], "source": [ "# Let's preload the required dataset \n", "!cd \"{TRAINER_DIR}\" && \\\n", " python3 preload_datapath.py \"{NOTEBOOK_DIR}/v5base-enwiki-4k.yaml\"" ] }, { "cell_type": "code", "execution_count": 6, "id": "717fc32b", "metadata": { "execution": { "iopub.execute_input": "2023-08-23T11:29:37.871484Z", "iopub.status.busy": "2023-08-23T11:29:37.871296Z", "iopub.status.idle": "2023-08-23T22:00:28.794016Z", "shell.execute_reply": "2023-08-23T22:00:28.793092Z" }, "papermill": { "duration": 37850.930689, "end_time": "2023-08-23T22:00:28.796260", "exception": false, "start_time": "2023-08-23T11:29:37.865571", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Setting ds_accelerator to cuda (auto detect)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "/usr/local/lib/python3.11/dist-packages/lightning/pytorch/cli.py:484: UserWarning: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. 
To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-headsize2x/v5base-enwiki-4k.yaml', '--trainer.logger.init_args.name=v5-hs2x-L6-D4096-E0.1 - Enwiki-4k Foundation (train-ctx=4k, deepspeed_stage_2_offload)', '--trainer.strategy=deepspeed_stage_2_offload', '--trainer.devices=auto', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-hs2x-L6-D4096-E0_1-enwiki-4k/', '--model.load_model=../model/L6-D4096-E0_1-neox-v5base-init.pth', '--model.ctx_len=4096', '--model.bptt_learning_range=1'], args=['fit', '-c', '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-headsize2x/v5base-enwiki-4k.yaml', '--trainer.logger.init_args.name=v5-hs2x-L6-D4096-E0.1 - Enwiki-4k Foundation (train-ctx=4k, deepspeed_stage_2_offload)', '--trainer.strategy=deepspeed_stage_2_offload', '--trainer.devices=auto', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-hs2x-L6-D4096-E0_1-enwiki-4k/', '--model.load_model=../model/L6-D4096-E0_1-neox-v5base-init.pth', '--model.ctx_len=4096', '--model.bptt_learning_range=1'].\r\n", " rank_zero_warn(\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "/usr/local/lib/python3.11/dist-packages/lightning/fabric/utilities/seed.py:39: UserWarning: No seed found, seed set to 2838735928\r\n", " rank_zero_warn(f\"No seed found, seed set to {seed}\")\r\n", "Global seed set to 2838735928\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mpicocreator\u001b[0m (\u001b[33mrwkv-x-dev\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.15.8 is available! 
To upgrade, please run:\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: $ pip install wandb --upgrade\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.15.4\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m./wandb/run-20230823_112943-wdxosswf\u001b[0m\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33mv5-hs2x-L6-D4096-E0.1 - Enwiki-4k Foundation (train-ctx=4k, deepspeed_stage_2_offload)\u001b[0m\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: ⭐️ View project at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-X-Experiments\u001b[0m\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-X-Experiments/runs/wdxosswf\u001b[0m\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "GPU available: True (cuda), used: True\r\n", "TPU available: False, using: 0 TPU cores\r\n", "IPU available: False, using: 0 IPUs\r\n", "HPU available: False, using: 0 HPUs\r\n", "\r\n", "\r\n", "[RWKV.Trainer] Applying 'target_batch_size' with the following:\r\n", " - target_batch_size: 32\r\n", " - num_nodes: 1\r\n", " - num_devices: 8\r\n", " - accumulate_grad_batches: 4\r\n", " - effective_batch_size: 32\r\n", "\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Setting ds_accelerator to cuda (auto detect)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Setting ds_accelerator to cuda (auto detect)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Setting ds_accelerator to cuda (auto detect)\r\n", "Setting ds_accelerator to cuda (auto detect)\r\n", "Setting ds_accelerator to cuda (auto detect)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Setting ds_accelerator to cuda (auto detect)\r\n", "Setting ds_accelerator to cuda (auto detect)\r\n" ] }, { "name": "stdout", "output_type": "stream", 
"text": [ "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n", "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n", "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n", "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[rank: 5] Global seed set to 2838735928\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[rank: 3] Global seed set to 2838735928\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[rank: 7] Global seed set to 2838735928\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[rank: 6] Global seed set to 2838735928\r\n", "[rank: 1] Global seed set to 2838735928\r\n", "[rank: 4] Global seed set to 2838735928\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[rank: 2] Global seed set to 2838735928\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[rank: 5] Global seed set to 2838735928\r\n", "initializing deepspeed distributed: GLOBAL_RANK: 5, MEMBER: 6/8\r\n", "[2023-08-23 11:30:35,617] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[rank: 4] Global seed set to 2838735928\r\n", "initializing deepspeed distributed: GLOBAL_RANK: 4, MEMBER: 5/8\r\n", "[2023-08-23 11:30:37,763] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented\r\n" ] }, { 
"name": "stdout", "output_type": "stream", "text": [ "[rank: 2] Global seed set to 2838735928\r\n", "initializing deepspeed distributed: GLOBAL_RANK: 2, MEMBER: 3/8\r\n", "[2023-08-23 11:30:38,123] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[rank: 6] Global seed set to 2838735928\r\n", "initializing deepspeed distributed: GLOBAL_RANK: 6, MEMBER: 7/8\r\n", "[2023-08-23 11:30:38,160] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[rank: 3] Global seed set to 2838735928\r\n", "initializing deepspeed distributed: GLOBAL_RANK: 3, MEMBER: 4/8\r\n", "[2023-08-23 11:30:38,283] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[rank: 1] Global seed set to 2838735928\r\n", "initializing deepspeed distributed: GLOBAL_RANK: 1, MEMBER: 2/8\r\n", "[2023-08-23 11:30:39,211] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[rank: 7] Global seed set to 2838735928\r\n", "initializing deepspeed distributed: GLOBAL_RANK: 7, MEMBER: 8/8\r\n", "[2023-08-23 11:30:39,233] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "Downloading readme: 0%| | 0.00/433 [00:00=12.1), as this is known to have freeze issues\r\n", "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\r\n", "# - When resuming from checkpoint, the estimated time is inaccurate\r\n", "#\r\n", "\r\n", "[RWKV.model] Configuring optimizer with\r\n", " - lr_init: 6.000e-04 (0.0006)\r\n", " - lr_final: 4.000e-04 
(0.0004)\r\n", "\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Detected CUDA files, patching ldflags\r\n", "Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/cpu_adam/build.ninja...\r\n", "Building extension module cpu_adam...\r\n", "Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "ninja: no work to do.\r\n", "Loading extension module cpu_adam...\r\n", "Time to load cpu_adam op: 2.3307251930236816 seconds\r\n", "Loading extension module cpu_adam...\r\n", "Time to load cpu_adam op: 2.3550989627838135 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Loading extension module cpu_adam...\r\n", "Loading extension module cpu_adam...\r\n", "Loading extension module cpu_adam...\r\n", "Time to load cpu_adam op: 2.3721420764923096 seconds\r\n", "Time to load cpu_adam op: 2.369891881942749 seconds\r\n", "Time to load cpu_adam op: 2.3694915771484375 seconds\r\n", "Loading extension module cpu_adam...\r\n", "Loading extension module cpu_adam...\r\n", "Loading extension module cpu_adam...\r\n", "Time to load cpu_adam op: 2.3722288608551025 seconds\r\n", "Time to load cpu_adam op: 2.37237811088562 seconds\r\n", "Time to load cpu_adam op: 2.3751280307769775 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Loading `train_dataloader` to estimate number of stepping batches.\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Emitting ninja build 
file /root/.cache/torch_extensions/py311_cu118/utils/build.ninja...\r\n", "Building extension module utils...\r\n", "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\r\n", "ninja: no work to do.\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.07468938827514648 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Loading extension module utils...\r\n", "Loading extension module utils...\r\n", "Loading extension module utils...\r\n", "Loading extension module utils...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.10226869583129883 seconds\r\n", "Time to load utils op: 0.1022031307220459 seconds\r\n", "Time to load utils op: 0.10259532928466797 seconds\r\n", "Time to load utils op: 0.10230875015258789 seconds\r\n", "Time to load utils op: 0.10206985473632812 seconds\r\n", "Loading extension module utils...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.10176777839660645 seconds\r\n", "Time to load utils op: 0.10233497619628906 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 1 partition count [8, 8] and sizes[(215097344, False), (48, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 2 partition count [8, 8] and sizes[(215097344, False), (48, False)] \r\n", "Rank: 0 partition count [8, 8] and sizes[(215097344, False), (48, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 5 partition count [8, 8] and sizes[(215097344, False), (48, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 3 partition count [8, 8] and sizes[(215097344, False), (48, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 4 partition count [8, 8] and sizes[(215097344, False), (48, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 6 partition count [8, 8] and 
sizes[(215097344, False), (48, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 7 partition count [8, 8] and sizes[(215097344, False), (48, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0008993148803710938 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0008089542388916016 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.01240086555480957 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0007753372192382812 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.003259420394897461 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch 
extensions root...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.000774383544921875 seconds\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.02939915657043457 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0010874271392822266 seconds\r\n", "\r\n", " | Name | Type | Params\r\n", "--------------------------------------\r\n", "0 | emb | Embedding | 205 M \r\n", "1 | blocks | ModuleList | 1.3 B \r\n", "2 | ln_out | LayerNorm | 8.2 K \r\n", "3 | head | Linear | 205 M \r\n", "--------------------------------------\r\n", "1.7 B Trainable params\r\n", "0 Non-trainable params\r\n", "1.7 B Total params\r\n", "6,883.117 Total estimated model params size (MB)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "Training: 0it [00:00, ?it/s]\r", "Training: 0%| | 0/10186 [00:00\r\n", " asyncio.run(main_function())\r\n", " File \"/usr/lib/python3.11/asyncio/runners.py\", line 190, in run\r\n", " return runner.run(main)\r\n", " ^^^^^^^^^^^^^^^^\r\n", " File \"/usr/lib/python3.11/asyncio/runners.py\", line 118, in run\r\n", " return self._loop.run_until_complete(task)\r\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", " File \"/usr/lib/python3.11/asyncio/base_events.py\", line 653, in run_until_complete\r\n", " return future.result()\r\n", " ^^^^^^^^^^^^^^^\r\n", " File 
\"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-headsize2x/../memory_script/eval_v5_memory_guided.py\", line 58, in main_function\r\n", " model = SimpleRWKV(model_path, device=\"cuda\")\r\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 1378, in __init__\r\n", " self.model = RWKV(**model_config)\r\n", " ^^^^^^^^^^^^^^^^^^^^\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 650, in __init__\r\n", " self.load_state_dict(model_weights)\r\n", " File \"/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py\", line 2041, in load_state_dict\r\n", " raise RuntimeError('Error(s) in loading state_dict for {}:\\n\\t{}'.format(\r\n", "RuntimeError: Error(s) in loading state_dict for RWKV:\r\n", "\tsize mismatch for blocks.0.att.time_decay: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.0.att.time_faaaa: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.1.att.time_decay: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.1.att.time_faaaa: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.2.att.time_decay: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.2.att.time_faaaa: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.3.att.time_decay: copying a param with shape torch.Size([32]) from checkpoint, the shape in 
current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.3.att.time_faaaa: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.4.att.time_decay: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.4.att.time_faaaa: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.5.att.time_decay: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.5.att.time_faaaa: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n" ] } ], "source": [ "# Lets do a quick memory test\n", "!export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", " python3 ../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-enwiki-4k.pth\"" ] }, { "attachments": {}, "cell_type": "markdown", "id": "d27a513a", "metadata": { "papermill": { "duration": 0.61749, "end_time": "2023-08-23T22:02:49.429688", "exception": false, "start_time": "2023-08-23T22:02:48.812198", "status": "completed" }, "tags": [] }, "source": [ "# Enwiki Stage 2 : Basic Instruct Tuning" ] }, { "cell_type": "code", "execution_count": 10, "id": "fcc36b80", "metadata": { "execution": { "iopub.execute_input": "2023-08-23T22:02:50.824810Z", "iopub.status.busy": "2023-08-23T22:02:50.824162Z", "iopub.status.idle": "2023-08-23T22:03:05.219243Z", "shell.execute_reply": "2023-08-23T22:03:05.218037Z" }, "papermill": { "duration": 15.01633, "end_time": "2023-08-23T22:03:05.221363", "exception": false, "start_time": "2023-08-23T22:02:50.205033", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\r", "Downloading readme: 0%| | 0.00/7.79k 
[00:00=12.1), as this is known to have freeze issues\r\n", "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\r\n", "# - When resuming from checkpoint, the estimated time is inaccurate\r\n", "#\r\n", "\r\n", "[RWKV.model] Configuring optimizer with\r\n", " - lr_init: 4.000e-04 (0.0004)\r\n", " - lr_final: 3.000e-04 (0.0003)\r\n", "\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Detected CUDA files, patching ldflags\r\n", "Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/cpu_adam/build.ninja...\r\n", "Building extension module cpu_adam...\r\n", "Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "ninja: no work to do.\r\n", "Loading extension module cpu_adam...\r\n", "Time to load cpu_adam op: 2.323086738586426 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Loading extension module cpu_adam...\r\n", "Time to load cpu_adam op: 2.3568384647369385 seconds\r\n", "Loading extension module cpu_adam...\r\n", "Time to load cpu_adam op: 2.363503932952881 seconds\r\n", "Loading extension module cpu_adam...\r\n", "Time to load cpu_adam op: 2.3699283599853516 seconds\r\n", "Loading extension module cpu_adam...\r\n", "Time to load cpu_adam op: 2.37654972076416 seconds\r\n", "Loading extension module cpu_adam...\r\n", "Time to load cpu_adam op: 2.372668504714966 seconds\r\n", "Loading extension module cpu_adam...\r\n", "Loading extension module cpu_adam...\r\n", "Time to load cpu_adam op: 2.3785459995269775 seconds\r\n", "Time to load cpu_adam op: 2.3747076988220215 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Loading `train_dataloader` to estimate number of stepping batches.\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Emitting ninja build file 
/root/.cache/torch_extensions/py311_cu118/utils/build.ninja...\r\n", "Building extension module utils...\r\n", "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "ninja: no work to do.\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.06939339637756348 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Loading extension module utils...\r\n", "Loading extension module utils...\r\n", "Loading extension module utils...\r\n", "Loading extension module utils...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.1026608943939209 seconds\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.10243606567382812 seconds\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.10294842720031738 seconds\r\n", "Time to load utils op: 0.10233807563781738 seconds\r\n", "Time to load utils op: 0.10210633277893066 seconds\r\n", "Time to load utils op: 0.10252761840820312 seconds\r\n", "Time to load utils op: 0.10194945335388184 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 3 partition count [8, 8] and sizes[(215097344, False), (48, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 7 partition count [8, 8] and sizes[(215097344, False), (48, False)] \r\n", "Rank: 6 partition count [8, 8] and sizes[(215097344, False), (48, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 2 partition count [8, 8] and sizes[(215097344, False), (48, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 5 partition count [8, 8] and sizes[(215097344, False), (48, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 0 partition count [8, 8] and sizes[(215097344, False), (48, False)] \r\n" ] }, { "name": "stdout", "output_type": 
"stream", "text": [ "Rank: 1 partition count [8, 8] and sizes[(215097344, False), (48, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 4 partition count [8, 8] and sizes[(215097344, False), (48, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.023431777954101562 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.005431175231933594 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root..." 
] }, { "name": "stdout", "output_type": "stream", "text": [ "\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.025610923767089844 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0012531280517578125 seconds" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r\n", "Time to load utils op: 0.01864337921142578 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.00513148307800293 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.000774383544921875 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0008399486541748047 seconds\r\n", "\r\n", " | Name | Type | Params\r\n", "--------------------------------------\r\n", "0 | emb | Embedding | 205 M \r\n", "1 | blocks | ModuleList | 1.3 B \r\n", 
"2 | ln_out | LayerNorm | 8.2 K \r\n", "3 | head | Linear | 205 M \r\n", "--------------------------------------\r\n", "1.7 B Trainable params\r\n", "0 Non-trainable params\r\n", "1.7 B Total params\r\n", "6,883.117 Total estimated model params size (MB)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "Training: 0it [00:00, ?it/s]\r", "Training: 0%| | 0/1867 [00:00\r\n", " asyncio.run(main_function())\r\n", " File \"/usr/lib/python3.11/asyncio/runners.py\", line 190, in run\r\n", " return runner.run(main)\r\n", " ^^^^^^^^^^^^^^^^\r\n", " File \"/usr/lib/python3.11/asyncio/runners.py\", line 118, in run\r\n", " return self._loop.run_until_complete(task)\r\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", " File \"/usr/lib/python3.11/asyncio/base_events.py\", line 653, in run_until_complete\r\n", " return future.result()\r\n", " ^^^^^^^^^^^^^^^\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-headsize2x/../memory_script/eval_v5_memory_guided.py\", line 58, in main_function\r\n", " model = SimpleRWKV(model_path, device=\"cuda\")\r\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 1378, in __init__\r\n", " self.model = RWKV(**model_config)\r\n", " ^^^^^^^^^^^^^^^^^^^^\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 650, in __init__\r\n", " self.load_state_dict(model_weights)\r\n", " File \"/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py\", line 2041, in load_state_dict\r\n", " raise RuntimeError('Error(s) in loading state_dict for {}:\\n\\t{}'.format(\r\n", "RuntimeError: Error(s) in loading state_dict for RWKV:\r\n", "\tsize mismatch for blocks.0.att.time_decay: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for 
blocks.0.att.time_faaaa: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.1.att.time_decay: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.1.att.time_faaaa: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.2.att.time_decay: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.2.att.time_faaaa: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.3.att.time_decay: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.3.att.time_faaaa: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.4.att.time_decay: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.4.att.time_faaaa: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.5.att.time_decay: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n", "\tsize mismatch for blocks.5.att.time_faaaa: copying a param with shape torch.Size([32]) from checkpoint, the shape in current model is torch.Size([64]).\r\n" ] } ], "source": [ "# Lets do a quick memory test\n", "!export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", " python3 ../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-enwiki-instruct.pth\"" ] } ], 
"metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" }, "papermill": { "default_parameters": {}, "duration": 44434.312064, "end_time": "2023-08-23T23:46:53.889371", "environment_variables": {}, "exception": null, "input_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-headsize2x/v5-L6-D4096-E1e-1-ctx4k-part1.ipynb", "output_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/experiment/rwkv-x-exp/v5-headsize2x/v5-L6-D4096-E1e-1-ctx4k-part1.ipynb", "parameters": {}, "start_time": "2023-08-23T11:26:19.577307", "version": "2.4.0" } }, "nbformat": 4, "nbformat_minor": 5 }