{ "cells": [ { "attachments": {}, "cell_type": "markdown", "id": "3a71b6b3", "metadata": { "papermill": { "duration": 0.003347, "end_time": "2023-09-29T09:57:11.488052", "exception": false, "start_time": "2023-09-29T09:57:11.484705", "status": "completed" }, "tags": [] }, "source": [ "# RWKV v5 multi-size training experiment\n", "\n", "**Note:** This project assumes you have the rwkv-infctx conda env setup" ] }, { "attachments": {}, "cell_type": "markdown", "id": "73dce349", "metadata": { "papermill": { "duration": 0.002599, "end_time": "2023-09-29T09:57:11.495409", "exception": false, "start_time": "2023-09-29T09:57:11.492810", "status": "completed" }, "tags": [] }, "source": [ "# Basic Setup" ] }, { "cell_type": "code", "execution_count": 1, "id": "2fa01ec7", "metadata": { "execution": { "iopub.execute_input": "2023-09-29T09:57:11.502573Z", "iopub.status.busy": "2023-09-29T09:57:11.502067Z", "iopub.status.idle": "2023-09-29T09:57:12.255533Z", "shell.execute_reply": "2023-09-29T09:57:12.254559Z" }, "papermill": { "duration": 0.759642, "end_time": "2023-09-29T09:57:12.257872", "exception": false, "start_time": "2023-09-29T09:57:11.498230", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# First lets setup the various directories, and init the model\n", "!mkdir -p ../../../../model/\n", "!mkdir -p ../../../../datapath/\n", "!mkdir -p ../../../../checkpoint/" ] }, { "cell_type": "code", "execution_count": 2, "id": "39dd6623", "metadata": { "execution": { "iopub.execute_input": "2023-09-29T09:57:12.265541Z", "iopub.status.busy": "2023-09-29T09:57:12.264984Z", "iopub.status.idle": "2023-09-29T09:57:12.272639Z", "shell.execute_reply": "2023-09-29T09:57:12.271887Z" }, "papermill": { "duration": 0.013629, "end_time": "2023-09-29T09:57:12.274503", "exception": false, "start_time": "2023-09-29T09:57:12.260874", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DEEPSPEED_STRAT: deepspeed_stage_1\n", "ENABLE_WANDB: True\n", "GPU_DEVICES: auto\n", "NOTEBOOK_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train\n", "INFERENCE_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5\n", "TRAINER_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5\n", "PROJECT_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer\n" ] } ], "source": [ "DEEPSPEED_STRAT=\"deepspeed_stage_1\"\n", "GPU_DEVICES=\"auto\"\n", "ENABLE_WANDB=True\n", "\n", "EMBED_SCALE=0.01\n", "EMBED_SCALE_LABEL=str(EMBED_SCALE).replace(\".\", \"_\")\n", "\n", "LAYER_COUNT=12\n", "EMBED_SIZE=2048\n", "\n", "WANDB_PREFIX=f\"[Multi-size] v5-L{LAYER_COUNT}-D{EMBED_SIZE}-E{EMBED_SCALE}\"\n", "FILENAME_PREFIX=f\"v5-L{LAYER_COUNT}-D{EMBED_SIZE}-E{EMBED_SCALE_LABEL}\"\n", "\n", "print(\"DEEPSPEED_STRAT:\", DEEPSPEED_STRAT)\n", "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n", "print(\"GPU_DEVICES:\", GPU_DEVICES)\n", "\n", "if ENABLE_WANDB:\n", " WANDB_MODE=\"online\"\n", "else:\n", " WANDB_MODE=\"disabled\"\n", "\n", "# Computing the notebook, and various paths\n", "import os\n", "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n", "PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"../../../../\"))\n", "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", "INFERENCE_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", "\n", "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n", "print(\"INFERENCE_DIR:\", INFERENCE_DIR)\n", "print(\"TRAINER_DIR:\", TRAINER_DIR)\n", "print(\"PROJECT_DIR:\", PROJECT_DIR)" ] }, { "cell_type": "code", "execution_count": 3, "id": "cf99b23f", "metadata": { "execution": { "iopub.execute_input": "2023-09-29T09:57:12.282369Z", "iopub.status.busy": "2023-09-29T09:57:12.281873Z", "iopub.status.idle": "2023-09-29T09:57:56.980304Z", "shell.execute_reply": "2023-09-29T09:57:56.979053Z" }, "papermill": { "duration": 44.705166, "end_time": "2023-09-29T09:57:56.982856", "exception": false, "start_time": "2023-09-29T09:57:12.277690", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2023-09-29 09:57:16,435] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n", "---- Initializing model ----\r\n", "No of layers: 12\r\n", "Embedding size: 2048\r\n", "Output model path: ../model/v5-L12-D2048-E0_01-neox-v5base-init.pth\r\n", "Vocab size: 50277\r\n", "Emb scale: 0.01\r\n", "Note: this process takes a significant time (and ram) for large models\r\n", "---- ----- ----\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "50277 2048 -0.01 emb.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.0.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.0.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.0.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.0.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.0.att.output.weight\r\n", "7168 2048 1.0 blocks.0.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.0.ffn.receptance.weight\r\n", "2048 7168 0 blocks.0.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.1.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.1.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.1.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.1.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.1.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "7168 2048 1.0 blocks.1.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.1.ffn.receptance.weight\r\n", "2048 7168 0 blocks.1.ffn.value.weight\r\n", "2048 2048 1.0 blocks.2.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.2.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.2.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.2.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.2.att.output.weight\r\n", "7168 2048 1.0 blocks.2.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.2.ffn.receptance.weight\r\n", "2048 7168 0 blocks.2.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.3.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.3.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.3.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.3.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.3.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "7168 2048 1.0 blocks.3.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.3.ffn.receptance.weight\r\n", "2048 7168 0 blocks.3.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.4.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.4.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.4.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.4.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.4.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "7168 2048 1.0 blocks.4.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.4.ffn.receptance.weight\r\n", "2048 7168 0 blocks.4.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.5.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.5.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.5.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.5.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.5.att.output.weight\r\n", "7168 2048 1.0 blocks.5.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.5.ffn.receptance.weight\r\n", "2048 7168 0 blocks.5.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.6.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.6.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.6.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.6.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.6.att.output.weight\r\n", "7168 2048 1.0 blocks.6.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.6.ffn.receptance.weight\r\n", "2048 7168 0 blocks.6.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.7.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.7.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.7.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.7.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.7.att.output.weight\r\n", "7168 2048 1.0 blocks.7.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.7.ffn.receptance.weight\r\n", "2048 7168 0 blocks.7.ffn.value.weight\r\n", "2048 2048 1.0 blocks.8.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.8.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.8.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.8.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.8.att.output.weight\r\n", "7168 2048 1.0 blocks.8.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.8.ffn.receptance.weight\r\n", "2048 7168 0 blocks.8.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.9.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.9.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.9.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.9.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.9.att.output.weight\r\n", "7168 2048 1.0 blocks.9.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.9.ffn.receptance.weight\r\n", "2048 7168 0 blocks.9.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.10.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.10.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.10.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.10.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.10.att.output.weight\r\n", "7168 2048 1.0 blocks.10.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.10.ffn.receptance.weight\r\n", "2048 7168 0 blocks.10.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.11.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.11.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.11.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.11.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.11.att.output.weight\r\n", "7168 2048 1.0 blocks.11.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.11.ffn.receptance.weight\r\n", "2048 7168 0 blocks.11.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "50277 2048 0.5 head.weight\r\n" ] } ], "source": [ "# Init the model\n", "!cd \"{TRAINER_DIR}\" && \\\n", " python3 ./init_model.py \\\n", " --n_layer {LAYER_COUNT} --n_embd {EMBED_SIZE} \\\n", " --emb-scale \"{EMBED_SCALE}\" \\\n", " --vocab_size neox --skip-if-exists \\\n", " \"../model/{FILENAME_PREFIX}-neox-v5base-init.pth\"" ] }, { "cell_type": "markdown", "id": "0c176d9f", "metadata": { "papermill": { "duration": 0.008403, "end_time": "2023-09-29T09:57:57.000220", "exception": false, "start_time": "2023-09-29T09:57:56.991817", "status": "completed" }, "tags": [] }, "source": [ "## Enwiki Stage 1 : Foundation 4k model training" ] }, { "cell_type": "code", "execution_count": 4, "id": "bd55a062", "metadata": { "execution": { "iopub.execute_input": "2023-09-29T09:57:57.020044Z", "iopub.status.busy": "2023-09-29T09:57:57.019632Z", "iopub.status.idle": "2023-09-29T09:58:08.660786Z", "shell.execute_reply": "2023-09-29T09:58:08.660057Z" }, "papermill": { "duration": 11.65489, "end_time": "2023-09-29T09:58:08.664002", "exception": false, "start_time": "2023-09-29T09:57:57.009112", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\r", "Saving the dataset (0/3 shards): 0%| | 0/54401 [00:00=12.1), as this is known to have freeze issues\r\n", "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\r\n", "# - When resuming from checkpoint, the estimated time is inaccurate\r\n", "#\r\n", "\r\n", "[RWKV.model] Configuring optimizer with\r\n", " - lr_init: 6.000e-04 (0.0006)\r\n", " - lr_final: 5.000e-04 (0.0005)\r\n", "\r\n", "Using /root/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Detected CUDA files, patching ldflags\r\n", "Emitting ninja build file /root/.cache/torch_extensions/py310_cu118/fused_adam/build.ninja...\r\n", "Building extension module fused_adam...\r\n", "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "ninja: no work to do.\r\n", "Loading extension module fused_adam...\r\n", "Time to load fused_adam op: 0.07915163040161133 seconds\r\n", "Loading `train_dataloader` to estimate number of stepping batches.\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 0 partition count [1, 1] and sizes[(860549120, False), (768, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r\n", " | Name | Type | Params\r\n", "--------------------------------------\r\n", "0 | emb | Embedding | 102 M \r\n", "1 | blocks | ModuleList | 654 M \r\n", "2 | ln_out | LayerNorm | 4.1 K \r\n", "3 | head | Linear | 102 M \r\n", "--------------------------------------\r\n", "860 M Trainable params\r\n", "0 Non-trainable params\r\n", "860 M Total params\r\n", "3,442.200 Total estimated model params size (MB)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "Training: 0it [00:00, ?it/s]\r", "Training: 0%| | 0/54401 [00:00\r\n", " cli_main()\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 253, in cli_main\r\n", " LightningCLI(\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 353, in __init__\r\n", " self._run_subcommand(self.subcommand)\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 642, in _run_subcommand\r\n", " fn(**fn_kwargs)\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py\", line 529, in fit\r\n", " call._call_and_handle_interrupt(\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/call.py\", line 41, in _call_and_handle_interrupt\r\n", " return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/strategies/launchers/subprocess_script.py\", line 91, in launch\r\n", " return function(*args, **kwargs)\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py\", line 568, in _fit_impl\r\n", " self._run(model, ckpt_path=ckpt_path)\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py\", line 973, in _run\r\n", " results = self._run_stage()\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py\", line 1016, in _run_stage\r\n", " self.fit_loop.run()\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/fit_loop.py\", line 201, in run\r\n", " self.advance()\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/fit_loop.py\", line 354, in advance\r\n", " self.epoch_loop.run(self._data_fetcher)\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/training_epoch_loop.py\", line 133, in run\r\n", " self.advance(data_fetcher)\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/training_epoch_loop.py\", line 218, in advance\r\n", " batch_output = self.automatic_optimization.run(trainer.optimizers[0], kwargs)\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/optimization/automatic.py\", line 185, in run\r\n", " self._optimizer_step(kwargs.get(\"batch_idx\", 0), closure)\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/optimization/automatic.py\", line 260, in _optimizer_step\r\n", " call._call_lightning_module_hook(\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/call.py\", line 144, in _call_lightning_module_hook\r\n", " output = fn(*args, **kwargs)\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/core/module.py\", line 1256, in optimizer_step\r\n", " optimizer.step(closure=optimizer_closure)\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/core/optimizer.py\", line 155, in step\r\n", " step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/strategies/ddp.py\", line 256, in optimizer_step\r\n", " optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs)\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/strategies/strategy.py\", line 225, in optimizer_step\r\n", " return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)\r\n", " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/plugins/precision/deepspeed.py\", line 102, in optimizer_step\r\n", " return deepspeed_engine.step(**kwargs)\r\n", " File \"/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py\", line 2087, in step\r\n", " self._take_model_step(lr_kwargs)\r\n", " File \"/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py\", line 1994, in _take_model_step\r\n", " self.optimizer.step()\r\n", " File \"/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage_1_and_2.py\", line 1715, in step\r\n", " int(self.partition_size[i])).to(self.single_partition_of_fp32_groups[i].dtype)\r\n", "torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.21 GiB (GPU 0; 22.19 GiB total capacity; 14.81 GiB already allocated; 2.39 GiB free; 18.54 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Waiting for W&B process to finish... \u001b[31m(failed 1).\u001b[0m Press Control-C to abort syncing.\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[34m\u001b[1mwandb\u001b[0m: - 0.005 MB of 0.005 MB uploaded (0.000 MB deduped)\r" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[34m\u001b[1mwandb\u001b[0m: \\ 0.005 MB of 0.005 MB uploaded (0.000 MB deduped)\r" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[34m\u001b[1mwandb\u001b[0m: | 0.005 MB of 0.005 MB uploaded (0.000 MB deduped)\r" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[34m\u001b[1mwandb\u001b[0m: / 0.005 MB of 0.005 MB uploaded (0.000 MB deduped)\r" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[34m\u001b[1mwandb\u001b[0m: \r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Run history:\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: batchidx ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: global_rank ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: perf/tokens_per_sec.gpu.0 ▁▁▃▃▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇███████████████\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: perf/tokens_total.gpu.0 ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: real_ctx_len ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: substep ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: train/loss ████████████████████▂▁▃▂▂▂▃▂▂▂▂▂▃▂▂▂▃▂▃▃\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: trainer/global_step ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁████████████████████\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: trainer/learning_rate ████████████████████▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: \r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Run summary:\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: batchidx 63\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: global_rank 0\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: perf/tokens_per_sec.gpu.0 3052.94065\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: perf/tokens_total.gpu.0 262080\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: real_ctx_len 4095\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: substep 63\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: train/loss 9.625\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: trainer/global_step 1\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: trainer/learning_rate 0.0006\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: \r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run \u001b[33m[Multi-size] v5-L12-D2048-E0.01 - Enwiki-4k Part 1 (train-ctx=4k, deepspeed_stage_1)\u001b[0m at: \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/runs/3rwyj6ei\u001b[0m\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: ️⚡ View job at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjk0OTk4MDcy/version_details/v4\u001b[0m\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Synced 5 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)\r\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Find logs at: \u001b[35m\u001b[1m./wandb/run-20230929_095815-3rwyj6ei/logs\u001b[0m\r\n" ] } ], "source": [ "# Start the foundation model training\n", "!cd \"{TRAINER_DIR}\" && \\\n", " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", " python3 lightning_trainer.py fit \\\n", " -c \"{NOTEBOOK_DIR}/enwiki-4k-part1.yaml\" \\\n", " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Enwiki-4k Part 1 (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n", " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", " --trainer.devices=\"{GPU_DEVICES}\" \\\n", " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-enwiki-4k-p1/\" \\\n", " --model.load_model=\"../model/{FILENAME_PREFIX}-neox-v5base-init.pth\" \\\n", " --model.ctx_len=4096 \\\n", " --model.bptt_learning_range=1" ] }, { "cell_type": "code", "execution_count": 6, "id": "73f2dbdc", "metadata": { "execution": { "iopub.execute_input": "2023-09-29T10:00:21.556396Z", "iopub.status.busy": "2023-09-29T10:00:21.555187Z", "iopub.status.idle": "2023-09-29T10:00:25.260439Z", "shell.execute_reply": "2023-09-29T10:00:25.259001Z" }, "papermill": { "duration": 3.72759, "end_time": "2023-09-29T10:00:25.263203", "exception": false, "start_time": "2023-09-29T10:00:21.535613", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2023-09-29 10:00:23,854] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Traceback (most recent call last):\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 651, in \r\n", " convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, output_file, save_dtype=args.dtype)\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 542, in convert_zero_checkpoint_to_fp32_state_dict\r\n", " state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 516, in get_fp32_state_dict_from_zero_checkpoint\r\n", " raise ValueError(f\"Unable to find 'latest' file at {latest_path}\")\r\n", "ValueError: Unable to find 'latest' file at ../checkpoint/v5-L12-D2048-E0_01-enwiki-4k-p1/last.ckpt/latest\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "ls: cannot access '../model/v5-L12-D2048-E0_01-enwiki-4k-p1.pth': No such file or directory\r\n" ] } ], "source": [ "# Lets export the model from the checkpoint\n", "!cd \"{TRAINER_DIR}\" && \\\n", " python3 export_checkpoint.py \"../checkpoint/{FILENAME_PREFIX}-enwiki-4k-p1/last.ckpt\" \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\" \"bf16\"\n", "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\"" ] }, { "cell_type": "code", "execution_count": 7, "id": "9b1932b1", "metadata": { "execution": { "iopub.execute_input": "2023-09-29T10:00:25.302083Z", "iopub.status.busy": "2023-09-29T10:00:25.300897Z", "iopub.status.idle": "2023-09-29T10:00:31.273775Z", "shell.execute_reply": "2023-09-29T10:00:31.272586Z" }, "papermill": { "duration": 5.996558, "end_time": "2023-09-29T10:00:31.277049", "exception": false, "start_time": "2023-09-29T10:00:25.280491", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2023-09-29 10:00:29,417] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n", "Traceback (most recent call last):\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/dragon_test.py\", line 52, in \r\n", " model = SimpleRWKV(MODEL_PATH, device=DEVICE)\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 1420, in __init__\r\n", " self.model = RWKV(**model_config)\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 566, in __init__\r\n", " raise ValueError(f\"load_model file '{load_model}' does not exist\")\r\n", "ValueError: load_model file '../model/v5-L12-D2048-E0_01-enwiki-4k-p1.pth' does not exist\r\n" ] } ], "source": [ "# # Lets do a quick dragon prompt validation\n", "!cd \"{INFERENCE_DIR}\" && \\\n", " python3 dragon_test.py \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\" \"cuda fp32\"" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" }, "papermill": { "default_parameters": {}, "duration": 201.43354, "end_time": "2023-09-29T10:00:31.714898", "environment_variables": {}, "exception": null, "input_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-part1.ipynb", "output_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-part1.ipynb", "parameters": {}, "start_time": "2023-09-29T09:57:10.281358", "version": "2.4.0" } }, "nbformat": 4, "nbformat_minor": 5 }