{ "cells": [ { "attachments": {}, "cell_type": "markdown", "id": "f5ebb021", "metadata": { "papermill": { "duration": 0.002819, "end_time": "2023-09-29T04:50:11.669839", "exception": false, "start_time": "2023-09-29T04:50:11.667020", "status": "completed" }, "tags": [] }, "source": [ "# RWKV v5 multi-size training experiment\n", "\n", "**Note:** This project assumes you have the rwkv-infctx conda env setup" ] }, { "attachments": {}, "cell_type": "markdown", "id": "6e6abc3f", "metadata": { "papermill": { "duration": 0.00214, "end_time": "2023-09-29T04:50:11.676239", "exception": false, "start_time": "2023-09-29T04:50:11.674099", "status": "completed" }, "tags": [] }, "source": [ "# Basic Setup" ] }, { "cell_type": "code", "execution_count": 1, "id": "3d2405bd", "metadata": { "execution": { "iopub.execute_input": "2023-09-29T04:50:11.682830Z", "iopub.status.busy": "2023-09-29T04:50:11.682139Z", "iopub.status.idle": "2023-09-29T04:50:12.432460Z", "shell.execute_reply": "2023-09-29T04:50:12.431486Z" }, "papermill": { "duration": 0.756299, "end_time": "2023-09-29T04:50:12.434815", "exception": false, "start_time": "2023-09-29T04:50:11.678516", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# First lets setup the various directories, and init the model\n", "!mkdir -p ../../../../model/\n", "!mkdir -p ../../../../datapath/\n", "!mkdir -p ../../../../checkpoint/" ] }, { "cell_type": "code", "execution_count": 2, "id": "66fd5201", "metadata": { "execution": { "iopub.execute_input": "2023-09-29T04:50:12.446546Z", "iopub.status.busy": "2023-09-29T04:50:12.446098Z", "iopub.status.idle": "2023-09-29T04:50:12.454394Z", "shell.execute_reply": "2023-09-29T04:50:12.453644Z" }, "papermill": { "duration": 0.018125, "end_time": "2023-09-29T04:50:12.456177", "exception": false, "start_time": "2023-09-29T04:50:12.438052", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DEEPSPEED_STRAT: deepspeed_stage_1\n", 
"ENABLE_WANDB: True\n", "GPU_DEVICES: auto\n", "NOTEBOOK_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train\n", "INFERENCE_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5\n", "TRAINER_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5\n", "PROJECT_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer\n" ] } ], "source": [ "DEEPSPEED_STRAT=\"deepspeed_stage_1\"\n", "GPU_DEVICES=\"auto\"\n", "ENABLE_WANDB=True\n", "\n", "EMBED_SCALE=0.01\n", "EMBED_SCALE_LABEL=str(EMBED_SCALE).replace(\".\", \"_\")\n", "\n", "LAYER_COUNT=6\n", "EMBED_SIZE=2048\n", "\n", "WANDB_PREFIX=f\"[Multi-size] v5-L{LAYER_COUNT}-D{EMBED_SIZE}-E{EMBED_SCALE}\"\n", "FILENAME_PREFIX=f\"v5-L{LAYER_COUNT}-D{EMBED_SIZE}-E{EMBED_SCALE_LABEL}\"\n", "\n", "print(\"DEEPSPEED_STRAT:\", DEEPSPEED_STRAT)\n", "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n", "print(\"GPU_DEVICES:\", GPU_DEVICES)\n", "\n", "if ENABLE_WANDB:\n", " WANDB_MODE=\"online\"\n", "else:\n", " WANDB_MODE=\"disabled\"\n", "\n", "# Computing the notebook, and various paths\n", "import os\n", "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n", "PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"../../../../\"))\n", "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", "INFERENCE_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", "\n", "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n", "print(\"INFERENCE_DIR:\", INFERENCE_DIR)\n", "print(\"TRAINER_DIR:\", TRAINER_DIR)\n", "print(\"PROJECT_DIR:\", PROJECT_DIR)" ] }, { "cell_type": "code", "execution_count": 3, "id": "e0b56789", "metadata": { "execution": { "iopub.execute_input": "2023-09-29T04:50:12.464627Z", "iopub.status.busy": "2023-09-29T04:50:12.464037Z", "iopub.status.idle": "2023-09-29T04:50:42.488005Z", "shell.execute_reply": "2023-09-29T04:50:42.486665Z" }, "papermill": { "duration": 30.031629, "end_time": 
"2023-09-29T04:50:42.490859", "exception": false, "start_time": "2023-09-29T04:50:12.459230", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2023-09-29 04:50:16,856] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n", "---- Initializing model ----\r\n", "No of layers: 6\r\n", "Embedding size: 2048\r\n", "Output model path: ../model/v5-L6-D2048-E0_01-neox-v5base-init.pth\r\n", "Vocab size: 50277\r\n", "Emb scale: 0.01\r\n", "Note: this process takes a significant time (and ram) for large models\r\n", "---- ----- ----\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "50277 2048 -0.01 emb.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.0.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.0.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.0.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.0.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.0.att.output.weight\r\n", "7168 2048 1.0 blocks.0.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.0.ffn.receptance.weight\r\n", "2048 7168 0 blocks.0.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.1.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.1.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.1.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 
blocks.1.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.1.att.output.weight\r\n", "7168 2048 1.0 blocks.1.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.1.ffn.receptance.weight\r\n", "2048 7168 0 blocks.1.ffn.value.weight\r\n", "2048 2048 1.0 blocks.2.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.2.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.2.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.2.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.2.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "7168 2048 1.0 blocks.2.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.2.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 7168 0 blocks.2.ffn.value.weight\r\n", "2048 2048 1.0 blocks.3.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.3.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.3.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.3.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.3.att.output.weight\r\n", "7168 2048 1.0 blocks.3.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.3.ffn.receptance.weight\r\n", "2048 7168 0 blocks.3.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.4.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 
blocks.4.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.4.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.4.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.4.att.output.weight\r\n", "7168 2048 1.0 blocks.4.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.4.ffn.receptance.weight\r\n", "2048 7168 0 blocks.4.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.5.att.gate.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.5.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.5.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 1.0 blocks.5.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.5.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "7168 2048 1.0 blocks.5.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2048 2048 0 blocks.5.ffn.receptance.weight\r\n", "2048 7168 0 blocks.5.ffn.value.weight\r\n", "50277 2048 0.5 head.weight\r\n" ] } ], "source": [ "# Init the model\n", "!cd \"{TRAINER_DIR}\" && \\\n", " python3 ./init_model.py \\\n", " --n_layer {LAYER_COUNT} --n_embd {EMBED_SIZE} \\\n", " --emb-scale \"{EMBED_SCALE}\" \\\n", " --vocab_size neox --skip-if-exists \\\n", " \"../model/{FILENAME_PREFIX}-neox-v5base-init.pth\"" ] }, { "cell_type": "markdown", "id": "e3057f8b", "metadata": { "papermill": { "duration": 0.006306, "end_time": "2023-09-29T04:50:42.503924", "exception": false, "start_time": "2023-09-29T04:50:42.497618", "status": "completed" }, "tags": [] }, "source": [ "## Enwiki Stage 1 : Foundation 4k model training" ] }, { 
"cell_type": "code", "execution_count": 4, "id": "c06c6ad2", "metadata": { "execution": { "iopub.execute_input": "2023-09-29T04:50:42.519776Z", "iopub.status.busy": "2023-09-29T04:50:42.518734Z", "iopub.status.idle": "2023-09-29T04:50:47.537394Z", "shell.execute_reply": "2023-09-29T04:50:47.535673Z" }, "papermill": { "duration": 5.029265, "end_time": "2023-09-29T04:50:47.539698", "exception": false, "start_time": "2023-09-29T04:50:42.510433", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Traceback (most recent call last):\r\n", " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/preload_datapath.py\", line 20, in \r\n", " assert os.path.exists(config_file), \"Config file does not exist\"\r\n", "AssertionError: Config file does not exist\r\n" ] } ], "source": [ "# Lets preload the requried dataset \n", "!cd \"{TRAINER_DIR}\" && \\\n", " python3 preload_datapath.py \"{NOTEBOOK_DIR}/v5base-enwiki-4k-part1.yaml\"" ] }, { "cell_type": "code", "execution_count": 5, "id": "4cc7e34f", "metadata": { "execution": { "iopub.execute_input": "2023-09-29T04:50:47.552392Z", "iopub.status.busy": "2023-09-29T04:50:47.551853Z", "iopub.status.idle": "2023-09-29T04:50:47.806392Z", "shell.execute_reply": "2023-09-29T04:50:47.805379Z" }, "papermill": { "duration": 0.264553, "end_time": "2023-09-29T04:50:47.809133", "exception": false, "start_time": "2023-09-29T04:50:47.544580", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/usr/bin/sh: 1: cd: can't cd to {TRAINER_DIR}\r\n" ] } ], "source": [ "# Start the foundation model training\n", "!cd \"{TRAINER_DIR}\" && \\\n", " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", " python lightning_trainer.py fit \\\n", " -c \"{NOTEBOOK_DIR}/v5base-enwiki-4k.yaml\" \\\n", " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Enwiki-4k Part 1 
(train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n", " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", " --trainer.devices=\"{GPU_DEVICES}\" \\\n", " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-enwiki-4k-p1/\" \\\n", " --model.load_model=\"../model/{FILENAME_PREFIX}-neox-v5base-init.pth\" \\\n", " --model.ctx_len=4096 \\\n", " --model.bptt_learning_range=1" ] }, { "cell_type": "code", "execution_count": 6, "id": "0b3b8134", "metadata": { "execution": { "iopub.execute_input": "2023-09-29T04:50:47.825099Z", "iopub.status.busy": "2023-09-29T04:50:47.824495Z", "iopub.status.idle": "2023-09-29T04:50:48.327589Z", "shell.execute_reply": "2023-09-29T04:50:48.326466Z" }, "papermill": { "duration": 0.514109, "end_time": "2023-09-29T04:50:48.330177", "exception": false, "start_time": "2023-09-29T04:50:47.816068", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/usr/bin/sh: 1: python: not found\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "ls: cannot access '../model/v5-L6-D2048-E0_01-enwiki-4k-p1.pth': No such file or directory\r\n" ] } ], "source": [ "# Let's export the model from the checkpoint\n", "!cd \"{TRAINER_DIR}\" && \\\n", " python3 export_checkpoint.py \"../checkpoint/{FILENAME_PREFIX}-enwiki-4k-p1/last.ckpt\" \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\" \"bf16\"\n", "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\"" ] }, { "cell_type": "code", "execution_count": 7, "id": "92869fb9", "metadata": { "execution": { "iopub.execute_input": "2023-09-29T04:50:48.346924Z", "iopub.status.busy": "2023-09-29T04:50:48.346311Z", "iopub.status.idle": "2023-09-29T04:50:48.600443Z", "shell.execute_reply": "2023-09-29T04:50:48.599423Z" }, "papermill": { "duration": 0.26565, "end_time": "2023-09-29T04:50:48.603118", "exception": false, "start_time": "2023-09-29T04:50:48.337468", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", 
"output_type": "stream", "text": [ "/usr/bin/sh: 1: cd: can't cd to {INFERENCE_DIR}\r\n" ] } ], "source": [ "# # Lets do a quick dragon prompt validation\n", "!cd \"{INFERENCE_DIR}\" && \\\n", " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", " python3 dragon_test.py \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\" \"cuda fp32\"" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" }, "papermill": { "default_parameters": {}, "duration": 38.643439, "end_time": "2023-09-29T04:50:49.032467", "environment_variables": {}, "exception": null, "input_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-part1.ipynb", "output_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-part1.ipynb", "parameters": {}, "start_time": "2023-09-29T04:50:10.389028", "version": "2.4.0" } }, "nbformat": 4, "nbformat_minor": 5 }