picocreator committed
Commit: f4feb3c
Parent: 82bda32

3f44fb0588a4b67ebbe52911d4f27fef311310d036a5530b0334f112e00e6b8e

.gitattributes CHANGED
@@ -99,3 +99,4 @@ experiment/rwkv-x-exp/v5-r3-memory/L6-D2560-E1e-1-ctx4k/stage5.ipynb filter=lfs
  experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/stage5.ipynb filter=lfs diff=lfs merge=lfs -text
  experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-part1.ipynb filter=lfs diff=lfs merge=lfs -text
  experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-baseline.ipynb filter=lfs diff=lfs merge=lfs -text
+ experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-part1.ipynb filter=lfs diff=lfs merge=lfs -text
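
The added .gitattributes rule is what routes the new notebook through Git LFS, which is also why the diff below shows the notebook's full JSON being removed: after this commit only an LFS pointer is kept in the regular git history. As a rough illustration (not part of the commit), here is a minimal Python sketch that checks whether a path is covered by a filter=lfs rule; the helper name lfs_tracked is hypothetical, and fnmatch only approximates real gitattributes pattern matching.

from fnmatch import fnmatch
from pathlib import Path

def lfs_tracked(path: str, gitattributes: str = ".gitattributes") -> bool:
    # Scan .gitattributes for lines whose attributes include filter=lfs and
    # whose pattern matches the given path. Real gitattributes matching follows
    # gitignore-style rules; fnmatch is only an approximation for this sketch.
    for line in Path(gitattributes).read_text().splitlines():
        parts = line.split()
        if len(parts) >= 2 and "filter=lfs" in parts[1:]:
            if path == parts[0] or fnmatch(path, parts[0]):
                return True
    return False

# The path added in this commit should now report as LFS-tracked.
print(lfs_tracked("experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-part1.ipynb"))
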
experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-part1.ipynb CHANGED
@@ -1,2222 +1,3 @@
1
- {
2
- "cells": [
3
- {
4
- "attachments": {},
5
- "cell_type": "markdown",
6
- "id": "3a71b6b3",
7
- "metadata": {
8
- "papermill": {
9
- "duration": 0.003347,
10
- "end_time": "2023-09-29T09:57:11.488052",
11
- "exception": false,
12
- "start_time": "2023-09-29T09:57:11.484705",
13
- "status": "completed"
14
- },
15
- "tags": []
16
- },
17
- "source": [
18
- "# RWKV v5 multi-size training experiment\n",
19
- "\n",
20
- "**Note:** This project assumes you have the rwkv-infctx conda env setup"
21
- ]
22
- },
23
- {
24
- "attachments": {},
25
- "cell_type": "markdown",
26
- "id": "73dce349",
27
- "metadata": {
28
- "papermill": {
29
- "duration": 0.002599,
30
- "end_time": "2023-09-29T09:57:11.495409",
31
- "exception": false,
32
- "start_time": "2023-09-29T09:57:11.492810",
33
- "status": "completed"
34
- },
35
- "tags": []
36
- },
37
- "source": [
38
- "# Basic Setup"
39
- ]
40
- },
41
- {
42
- "cell_type": "code",
43
- "execution_count": 1,
44
- "id": "2fa01ec7",
45
- "metadata": {
46
- "execution": {
47
- "iopub.execute_input": "2023-09-29T09:57:11.502573Z",
48
- "iopub.status.busy": "2023-09-29T09:57:11.502067Z",
49
- "iopub.status.idle": "2023-09-29T09:57:12.255533Z",
50
- "shell.execute_reply": "2023-09-29T09:57:12.254559Z"
51
- },
52
- "papermill": {
53
- "duration": 0.759642,
54
- "end_time": "2023-09-29T09:57:12.257872",
55
- "exception": false,
56
- "start_time": "2023-09-29T09:57:11.498230",
57
- "status": "completed"
58
- },
59
- "tags": []
60
- },
61
- "outputs": [],
62
- "source": [
63
- "# First lets setup the various directories, and init the model\n",
64
- "!mkdir -p ../../../../model/\n",
65
- "!mkdir -p ../../../../datapath/\n",
66
- "!mkdir -p ../../../../checkpoint/"
67
- ]
68
- },
69
- {
70
- "cell_type": "code",
71
- "execution_count": 2,
72
- "id": "39dd6623",
73
- "metadata": {
74
- "execution": {
75
- "iopub.execute_input": "2023-09-29T09:57:12.265541Z",
76
- "iopub.status.busy": "2023-09-29T09:57:12.264984Z",
77
- "iopub.status.idle": "2023-09-29T09:57:12.272639Z",
78
- "shell.execute_reply": "2023-09-29T09:57:12.271887Z"
79
- },
80
- "papermill": {
81
- "duration": 0.013629,
82
- "end_time": "2023-09-29T09:57:12.274503",
83
- "exception": false,
84
- "start_time": "2023-09-29T09:57:12.260874",
85
- "status": "completed"
86
- },
87
- "tags": []
88
- },
89
- "outputs": [
90
- {
91
- "name": "stdout",
92
- "output_type": "stream",
93
- "text": [
94
- "DEEPSPEED_STRAT: deepspeed_stage_1\n",
95
- "ENABLE_WANDB: True\n",
96
- "GPU_DEVICES: auto\n",
97
- "NOTEBOOK_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train\n",
98
- "INFERENCE_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5\n",
99
- "TRAINER_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5\n",
100
- "PROJECT_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer\n"
101
- ]
102
- }
103
- ],
104
- "source": [
105
- "DEEPSPEED_STRAT=\"deepspeed_stage_1\"\n",
106
- "GPU_DEVICES=\"auto\"\n",
107
- "ENABLE_WANDB=True\n",
108
- "\n",
109
- "EMBED_SCALE=0.01\n",
110
- "EMBED_SCALE_LABEL=str(EMBED_SCALE).replace(\".\", \"_\")\n",
111
- "\n",
112
- "LAYER_COUNT=12\n",
113
- "EMBED_SIZE=2048\n",
114
- "\n",
115
- "WANDB_PREFIX=f\"[Multi-size] v5-L{LAYER_COUNT}-D{EMBED_SIZE}-E{EMBED_SCALE}\"\n",
116
- "FILENAME_PREFIX=f\"v5-L{LAYER_COUNT}-D{EMBED_SIZE}-E{EMBED_SCALE_LABEL}\"\n",
117
- "\n",
118
- "print(\"DEEPSPEED_STRAT:\", DEEPSPEED_STRAT)\n",
119
- "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n",
120
- "print(\"GPU_DEVICES:\", GPU_DEVICES)\n",
121
- "\n",
122
- "if ENABLE_WANDB:\n",
123
- " WANDB_MODE=\"online\"\n",
124
- "else:\n",
125
- " WANDB_MODE=\"disabled\"\n",
126
- "\n",
127
- "# Computing the notebook, and various paths\n",
128
- "import os\n",
129
- "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n",
130
- "PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"../../../../\"))\n",
131
- "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n",
132
- "INFERENCE_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n",
133
- "\n",
134
- "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n",
135
- "print(\"INFERENCE_DIR:\", INFERENCE_DIR)\n",
136
- "print(\"TRAINER_DIR:\", TRAINER_DIR)\n",
137
- "print(\"PROJECT_DIR:\", PROJECT_DIR)"
138
- ]
139
- },
140
- {
141
- "cell_type": "code",
142
- "execution_count": 3,
143
- "id": "cf99b23f",
144
- "metadata": {
145
- "execution": {
146
- "iopub.execute_input": "2023-09-29T09:57:12.282369Z",
147
- "iopub.status.busy": "2023-09-29T09:57:12.281873Z",
148
- "iopub.status.idle": "2023-09-29T09:57:56.980304Z",
149
- "shell.execute_reply": "2023-09-29T09:57:56.979053Z"
150
- },
151
- "papermill": {
152
- "duration": 44.705166,
153
- "end_time": "2023-09-29T09:57:56.982856",
154
- "exception": false,
155
- "start_time": "2023-09-29T09:57:12.277690",
156
- "status": "completed"
157
- },
158
- "tags": []
159
- },
160
- "outputs": [
161
- {
162
- "name": "stdout",
163
- "output_type": "stream",
164
- "text": [
165
- "[2023-09-29 09:57:16,435] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n"
166
- ]
167
- },
168
- {
169
- "name": "stdout",
170
- "output_type": "stream",
171
- "text": [
172
- "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n",
173
- "---- Initializing model ----\r\n",
174
- "No of layers: 12\r\n",
175
- "Embedding size: 2048\r\n",
176
- "Output model path: ../model/v5-L12-D2048-E0_01-neox-v5base-init.pth\r\n",
177
- "Vocab size: 50277\r\n",
178
- "Emb scale: 0.01\r\n",
179
- "Note: this process takes a significant time (and ram) for large models\r\n",
180
- "---- ----- ----\r\n"
181
- ]
182
- },
183
- {
184
- "name": "stdout",
185
- "output_type": "stream",
186
- "text": [
187
- "50277 2048 -0.01 emb.weight\r\n"
188
- ]
189
- },
190
- {
191
- "name": "stdout",
192
- "output_type": "stream",
193
- "text": [
194
- "2048 2048 1.0 blocks.0.att.gate.weight\r\n"
195
- ]
196
- },
197
- {
198
- "name": "stdout",
199
- "output_type": "stream",
200
- "text": [
201
- "2048 2048 1.0 blocks.0.att.receptance.weight\r\n"
202
- ]
203
- },
204
- {
205
- "name": "stdout",
206
- "output_type": "stream",
207
- "text": [
208
- "2048 2048 1.0 blocks.0.att.key.weight\r\n"
209
- ]
210
- },
211
- {
212
- "name": "stdout",
213
- "output_type": "stream",
214
- "text": [
215
- "2048 2048 1.0 blocks.0.att.value.weight\r\n"
216
- ]
217
- },
218
- {
219
- "name": "stdout",
220
- "output_type": "stream",
221
- "text": [
222
- "2048 2048 0 blocks.0.att.output.weight\r\n",
223
- "7168 2048 1.0 blocks.0.ffn.key.weight\r\n"
224
- ]
225
- },
226
- {
227
- "name": "stdout",
228
- "output_type": "stream",
229
- "text": [
230
- "2048 2048 0 blocks.0.ffn.receptance.weight\r\n",
231
- "2048 7168 0 blocks.0.ffn.value.weight\r\n"
232
- ]
233
- },
234
- {
235
- "name": "stdout",
236
- "output_type": "stream",
237
- "text": [
238
- "2048 2048 1.0 blocks.1.att.gate.weight\r\n"
239
- ]
240
- },
241
- {
242
- "name": "stdout",
243
- "output_type": "stream",
244
- "text": [
245
- "2048 2048 1.0 blocks.1.att.receptance.weight\r\n"
246
- ]
247
- },
248
- {
249
- "name": "stdout",
250
- "output_type": "stream",
251
- "text": [
252
- "2048 2048 1.0 blocks.1.att.key.weight\r\n"
253
- ]
254
- },
255
- {
256
- "name": "stdout",
257
- "output_type": "stream",
258
- "text": [
259
- "2048 2048 1.0 blocks.1.att.value.weight\r\n"
260
- ]
261
- },
262
- {
263
- "name": "stdout",
264
- "output_type": "stream",
265
- "text": [
266
- "2048 2048 0 blocks.1.att.output.weight\r\n"
267
- ]
268
- },
269
- {
270
- "name": "stdout",
271
- "output_type": "stream",
272
- "text": [
273
- "7168 2048 1.0 blocks.1.ffn.key.weight\r\n"
274
- ]
275
- },
276
- {
277
- "name": "stdout",
278
- "output_type": "stream",
279
- "text": [
280
- "2048 2048 0 blocks.1.ffn.receptance.weight\r\n",
281
- "2048 7168 0 blocks.1.ffn.value.weight\r\n",
282
- "2048 2048 1.0 blocks.2.att.gate.weight\r\n"
283
- ]
284
- },
285
- {
286
- "name": "stdout",
287
- "output_type": "stream",
288
- "text": [
289
- "2048 2048 1.0 blocks.2.att.receptance.weight\r\n"
290
- ]
291
- },
292
- {
293
- "name": "stdout",
294
- "output_type": "stream",
295
- "text": [
296
- "2048 2048 1.0 blocks.2.att.key.weight\r\n"
297
- ]
298
- },
299
- {
300
- "name": "stdout",
301
- "output_type": "stream",
302
- "text": [
303
- "2048 2048 1.0 blocks.2.att.value.weight\r\n"
304
- ]
305
- },
306
- {
307
- "name": "stdout",
308
- "output_type": "stream",
309
- "text": [
310
- "2048 2048 0 blocks.2.att.output.weight\r\n",
311
- "7168 2048 1.0 blocks.2.ffn.key.weight\r\n"
312
- ]
313
- },
314
- {
315
- "name": "stdout",
316
- "output_type": "stream",
317
- "text": [
318
- "2048 2048 0 blocks.2.ffn.receptance.weight\r\n",
319
- "2048 7168 0 blocks.2.ffn.value.weight\r\n"
320
- ]
321
- },
322
- {
323
- "name": "stdout",
324
- "output_type": "stream",
325
- "text": [
326
- "2048 2048 1.0 blocks.3.att.gate.weight\r\n"
327
- ]
328
- },
329
- {
330
- "name": "stdout",
331
- "output_type": "stream",
332
- "text": [
333
- "2048 2048 1.0 blocks.3.att.receptance.weight\r\n"
334
- ]
335
- },
336
- {
337
- "name": "stdout",
338
- "output_type": "stream",
339
- "text": [
340
- "2048 2048 1.0 blocks.3.att.key.weight\r\n"
341
- ]
342
- },
343
- {
344
- "name": "stdout",
345
- "output_type": "stream",
346
- "text": [
347
- "2048 2048 1.0 blocks.3.att.value.weight\r\n"
348
- ]
349
- },
350
- {
351
- "name": "stdout",
352
- "output_type": "stream",
353
- "text": [
354
- "2048 2048 0 blocks.3.att.output.weight\r\n"
355
- ]
356
- },
357
- {
358
- "name": "stdout",
359
- "output_type": "stream",
360
- "text": [
361
- "7168 2048 1.0 blocks.3.ffn.key.weight\r\n"
362
- ]
363
- },
364
- {
365
- "name": "stdout",
366
- "output_type": "stream",
367
- "text": [
368
- "2048 2048 0 blocks.3.ffn.receptance.weight\r\n",
369
- "2048 7168 0 blocks.3.ffn.value.weight\r\n"
370
- ]
371
- },
372
- {
373
- "name": "stdout",
374
- "output_type": "stream",
375
- "text": [
376
- "2048 2048 1.0 blocks.4.att.gate.weight\r\n"
377
- ]
378
- },
379
- {
380
- "name": "stdout",
381
- "output_type": "stream",
382
- "text": [
383
- "2048 2048 1.0 blocks.4.att.receptance.weight\r\n"
384
- ]
385
- },
386
- {
387
- "name": "stdout",
388
- "output_type": "stream",
389
- "text": [
390
- "2048 2048 1.0 blocks.4.att.key.weight\r\n"
391
- ]
392
- },
393
- {
394
- "name": "stdout",
395
- "output_type": "stream",
396
- "text": [
397
- "2048 2048 1.0 blocks.4.att.value.weight\r\n"
398
- ]
399
- },
400
- {
401
- "name": "stdout",
402
- "output_type": "stream",
403
- "text": [
404
- "2048 2048 0 blocks.4.att.output.weight\r\n"
405
- ]
406
- },
407
- {
408
- "name": "stdout",
409
- "output_type": "stream",
410
- "text": [
411
- "7168 2048 1.0 blocks.4.ffn.key.weight\r\n"
412
- ]
413
- },
414
- {
415
- "name": "stdout",
416
- "output_type": "stream",
417
- "text": [
418
- "2048 2048 0 blocks.4.ffn.receptance.weight\r\n",
419
- "2048 7168 0 blocks.4.ffn.value.weight\r\n"
420
- ]
421
- },
422
- {
423
- "name": "stdout",
424
- "output_type": "stream",
425
- "text": [
426
- "2048 2048 1.0 blocks.5.att.gate.weight\r\n"
427
- ]
428
- },
429
- {
430
- "name": "stdout",
431
- "output_type": "stream",
432
- "text": [
433
- "2048 2048 1.0 blocks.5.att.receptance.weight\r\n"
434
- ]
435
- },
436
- {
437
- "name": "stdout",
438
- "output_type": "stream",
439
- "text": [
440
- "2048 2048 1.0 blocks.5.att.key.weight\r\n"
441
- ]
442
- },
443
- {
444
- "name": "stdout",
445
- "output_type": "stream",
446
- "text": [
447
- "2048 2048 1.0 blocks.5.att.value.weight\r\n"
448
- ]
449
- },
450
- {
451
- "name": "stdout",
452
- "output_type": "stream",
453
- "text": [
454
- "2048 2048 0 blocks.5.att.output.weight\r\n",
455
- "7168 2048 1.0 blocks.5.ffn.key.weight\r\n"
456
- ]
457
- },
458
- {
459
- "name": "stdout",
460
- "output_type": "stream",
461
- "text": [
462
- "2048 2048 0 blocks.5.ffn.receptance.weight\r\n",
463
- "2048 7168 0 blocks.5.ffn.value.weight\r\n"
464
- ]
465
- },
466
- {
467
- "name": "stdout",
468
- "output_type": "stream",
469
- "text": [
470
- "2048 2048 1.0 blocks.6.att.gate.weight\r\n"
471
- ]
472
- },
473
- {
474
- "name": "stdout",
475
- "output_type": "stream",
476
- "text": [
477
- "2048 2048 1.0 blocks.6.att.receptance.weight\r\n"
478
- ]
479
- },
480
- {
481
- "name": "stdout",
482
- "output_type": "stream",
483
- "text": [
484
- "2048 2048 1.0 blocks.6.att.key.weight\r\n"
485
- ]
486
- },
487
- {
488
- "name": "stdout",
489
- "output_type": "stream",
490
- "text": [
491
- "2048 2048 1.0 blocks.6.att.value.weight\r\n"
492
- ]
493
- },
494
- {
495
- "name": "stdout",
496
- "output_type": "stream",
497
- "text": [
498
- "2048 2048 0 blocks.6.att.output.weight\r\n",
499
- "7168 2048 1.0 blocks.6.ffn.key.weight\r\n"
500
- ]
501
- },
502
- {
503
- "name": "stdout",
504
- "output_type": "stream",
505
- "text": [
506
- "2048 2048 0 blocks.6.ffn.receptance.weight\r\n",
507
- "2048 7168 0 blocks.6.ffn.value.weight\r\n"
508
- ]
509
- },
510
- {
511
- "name": "stdout",
512
- "output_type": "stream",
513
- "text": [
514
- "2048 2048 1.0 blocks.7.att.gate.weight\r\n"
515
- ]
516
- },
517
- {
518
- "name": "stdout",
519
- "output_type": "stream",
520
- "text": [
521
- "2048 2048 1.0 blocks.7.att.receptance.weight\r\n"
522
- ]
523
- },
524
- {
525
- "name": "stdout",
526
- "output_type": "stream",
527
- "text": [
528
- "2048 2048 1.0 blocks.7.att.key.weight\r\n"
529
- ]
530
- },
531
- {
532
- "name": "stdout",
533
- "output_type": "stream",
534
- "text": [
535
- "2048 2048 1.0 blocks.7.att.value.weight\r\n"
536
- ]
537
- },
538
- {
539
- "name": "stdout",
540
- "output_type": "stream",
541
- "text": [
542
- "2048 2048 0 blocks.7.att.output.weight\r\n",
543
- "7168 2048 1.0 blocks.7.ffn.key.weight\r\n"
544
- ]
545
- },
546
- {
547
- "name": "stdout",
548
- "output_type": "stream",
549
- "text": [
550
- "2048 2048 0 blocks.7.ffn.receptance.weight\r\n",
551
- "2048 7168 0 blocks.7.ffn.value.weight\r\n",
552
- "2048 2048 1.0 blocks.8.att.gate.weight\r\n"
553
- ]
554
- },
555
- {
556
- "name": "stdout",
557
- "output_type": "stream",
558
- "text": [
559
- "2048 2048 1.0 blocks.8.att.receptance.weight\r\n"
560
- ]
561
- },
562
- {
563
- "name": "stdout",
564
- "output_type": "stream",
565
- "text": [
566
- "2048 2048 1.0 blocks.8.att.key.weight\r\n"
567
- ]
568
- },
569
- {
570
- "name": "stdout",
571
- "output_type": "stream",
572
- "text": [
573
- "2048 2048 1.0 blocks.8.att.value.weight\r\n"
574
- ]
575
- },
576
- {
577
- "name": "stdout",
578
- "output_type": "stream",
579
- "text": [
580
- "2048 2048 0 blocks.8.att.output.weight\r\n",
581
- "7168 2048 1.0 blocks.8.ffn.key.weight\r\n"
582
- ]
583
- },
584
- {
585
- "name": "stdout",
586
- "output_type": "stream",
587
- "text": [
588
- "2048 2048 0 blocks.8.ffn.receptance.weight\r\n",
589
- "2048 7168 0 blocks.8.ffn.value.weight\r\n"
590
- ]
591
- },
592
- {
593
- "name": "stdout",
594
- "output_type": "stream",
595
- "text": [
596
- "2048 2048 1.0 blocks.9.att.gate.weight\r\n"
597
- ]
598
- },
599
- {
600
- "name": "stdout",
601
- "output_type": "stream",
602
- "text": [
603
- "2048 2048 1.0 blocks.9.att.receptance.weight\r\n"
604
- ]
605
- },
606
- {
607
- "name": "stdout",
608
- "output_type": "stream",
609
- "text": [
610
- "2048 2048 1.0 blocks.9.att.key.weight\r\n"
611
- ]
612
- },
613
- {
614
- "name": "stdout",
615
- "output_type": "stream",
616
- "text": [
617
- "2048 2048 1.0 blocks.9.att.value.weight\r\n"
618
- ]
619
- },
620
- {
621
- "name": "stdout",
622
- "output_type": "stream",
623
- "text": [
624
- "2048 2048 0 blocks.9.att.output.weight\r\n",
625
- "7168 2048 1.0 blocks.9.ffn.key.weight\r\n"
626
- ]
627
- },
628
- {
629
- "name": "stdout",
630
- "output_type": "stream",
631
- "text": [
632
- "2048 2048 0 blocks.9.ffn.receptance.weight\r\n",
633
- "2048 7168 0 blocks.9.ffn.value.weight\r\n"
634
- ]
635
- },
636
- {
637
- "name": "stdout",
638
- "output_type": "stream",
639
- "text": [
640
- "2048 2048 1.0 blocks.10.att.gate.weight\r\n"
641
- ]
642
- },
643
- {
644
- "name": "stdout",
645
- "output_type": "stream",
646
- "text": [
647
- "2048 2048 1.0 blocks.10.att.receptance.weight\r\n"
648
- ]
649
- },
650
- {
651
- "name": "stdout",
652
- "output_type": "stream",
653
- "text": [
654
- "2048 2048 1.0 blocks.10.att.key.weight\r\n"
655
- ]
656
- },
657
- {
658
- "name": "stdout",
659
- "output_type": "stream",
660
- "text": [
661
- "2048 2048 1.0 blocks.10.att.value.weight\r\n"
662
- ]
663
- },
664
- {
665
- "name": "stdout",
666
- "output_type": "stream",
667
- "text": [
668
- "2048 2048 0 blocks.10.att.output.weight\r\n",
669
- "7168 2048 1.0 blocks.10.ffn.key.weight\r\n"
670
- ]
671
- },
672
- {
673
- "name": "stdout",
674
- "output_type": "stream",
675
- "text": [
676
- "2048 2048 0 blocks.10.ffn.receptance.weight\r\n",
677
- "2048 7168 0 blocks.10.ffn.value.weight\r\n"
678
- ]
679
- },
680
- {
681
- "name": "stdout",
682
- "output_type": "stream",
683
- "text": [
684
- "2048 2048 1.0 blocks.11.att.gate.weight\r\n"
685
- ]
686
- },
687
- {
688
- "name": "stdout",
689
- "output_type": "stream",
690
- "text": [
691
- "2048 2048 1.0 blocks.11.att.receptance.weight\r\n"
692
- ]
693
- },
694
- {
695
- "name": "stdout",
696
- "output_type": "stream",
697
- "text": [
698
- "2048 2048 1.0 blocks.11.att.key.weight\r\n"
699
- ]
700
- },
701
- {
702
- "name": "stdout",
703
- "output_type": "stream",
704
- "text": [
705
- "2048 2048 1.0 blocks.11.att.value.weight\r\n"
706
- ]
707
- },
708
- {
709
- "name": "stdout",
710
- "output_type": "stream",
711
- "text": [
712
- "2048 2048 0 blocks.11.att.output.weight\r\n",
713
- "7168 2048 1.0 blocks.11.ffn.key.weight\r\n"
714
- ]
715
- },
716
- {
717
- "name": "stdout",
718
- "output_type": "stream",
719
- "text": [
720
- "2048 2048 0 blocks.11.ffn.receptance.weight\r\n",
721
- "2048 7168 0 blocks.11.ffn.value.weight\r\n"
722
- ]
723
- },
724
- {
725
- "name": "stdout",
726
- "output_type": "stream",
727
- "text": [
728
- "50277 2048 0.5 head.weight\r\n"
729
- ]
730
- }
731
- ],
732
- "source": [
733
- "# Init the model\n",
734
- "!cd \"{TRAINER_DIR}\" && \\\n",
735
- " python3 ./init_model.py \\\n",
736
- " --n_layer {LAYER_COUNT} --n_embd {EMBED_SIZE} \\\n",
737
- " --emb-scale \"{EMBED_SCALE}\" \\\n",
738
- " --vocab_size neox --skip-if-exists \\\n",
739
- " \"../model/{FILENAME_PREFIX}-neox-v5base-init.pth\""
740
- ]
741
- },
742
- {
743
- "cell_type": "markdown",
744
- "id": "0c176d9f",
745
- "metadata": {
746
- "papermill": {
747
- "duration": 0.008403,
748
- "end_time": "2023-09-29T09:57:57.000220",
749
- "exception": false,
750
- "start_time": "2023-09-29T09:57:56.991817",
751
- "status": "completed"
752
- },
753
- "tags": []
754
- },
755
- "source": [
756
- "## Enwiki Stage 1 : Foundation 4k model training"
757
- ]
758
- },
759
- {
760
- "cell_type": "code",
761
- "execution_count": 4,
762
- "id": "bd55a062",
763
- "metadata": {
764
- "execution": {
765
- "iopub.execute_input": "2023-09-29T09:57:57.020044Z",
766
- "iopub.status.busy": "2023-09-29T09:57:57.019632Z",
767
- "iopub.status.idle": "2023-09-29T09:58:08.660786Z",
768
- "shell.execute_reply": "2023-09-29T09:58:08.660057Z"
769
- },
770
- "papermill": {
771
- "duration": 11.65489,
772
- "end_time": "2023-09-29T09:58:08.664002",
773
- "exception": false,
774
- "start_time": "2023-09-29T09:57:57.009112",
775
- "status": "completed"
776
- },
777
- "tags": []
778
- },
779
- "outputs": [
780
- {
781
- "name": "stdout",
782
- "output_type": "stream",
783
- "text": [
784
- "\r",
785
- "Saving the dataset (0/3 shards): 0%| | 0/54401 [00:00<?, ? examples/s]"
786
- ]
787
- },
788
- {
789
- "name": "stdout",
790
- "output_type": "stream",
791
- "text": [
792
- "\r",
793
- "Saving the dataset (0/3 shards): 4%| | 2000/54401 [00:00<00:03, 15197.33 examp"
794
- ]
795
- },
796
- {
797
- "name": "stdout",
798
- "output_type": "stream",
799
- "text": [
800
- "\r",
801
- "Saving the dataset (0/3 shards): 7%| | 4000/54401 [00:00<00:03, 15929.46 examp"
802
- ]
803
- },
804
- {
805
- "name": "stdout",
806
- "output_type": "stream",
807
- "text": [
808
- "\r",
809
- "Saving the dataset (0/3 shards): 11%| | 6000/54401 [00:00<00:02, 16418.37 examp"
810
- ]
811
- },
812
- {
813
- "name": "stdout",
814
- "output_type": "stream",
815
- "text": [
816
- "\r",
817
- "Saving the dataset (0/3 shards): 15%|▏| 8000/54401 [00:00<00:02, 16923.89 examp"
818
- ]
819
- },
820
- {
821
- "name": "stdout",
822
- "output_type": "stream",
823
- "text": [
824
- "\r",
825
- "Saving the dataset (0/3 shards): 18%|▏| 10000/54401 [00:00<00:02, 17273.31 exam"
826
- ]
827
- },
828
- {
829
- "name": "stdout",
830
- "output_type": "stream",
831
- "text": [
832
- "\r",
833
- "Saving the dataset (0/3 shards): 22%|▏| 12000/54401 [00:00<00:02, 17662.61 exam"
834
- ]
835
- },
836
- {
837
- "name": "stdout",
838
- "output_type": "stream",
839
- "text": [
840
- "\r",
841
- "Saving the dataset (0/3 shards): 26%|▎| 14000/54401 [00:00<00:02, 17923.49 exam"
842
- ]
843
- },
844
- {
845
- "name": "stdout",
846
- "output_type": "stream",
847
- "text": [
848
- "\r",
849
- "Saving the dataset (0/3 shards): 29%|▎| 16000/54401 [00:00<00:02, 18184.27 exam"
850
- ]
851
- },
852
- {
853
- "name": "stdout",
854
- "output_type": "stream",
855
- "text": [
856
- "\r",
857
- "Saving the dataset (0/3 shards): 33%|▎| 18000/54401 [00:01<00:01, 18438.75 exam\r",
858
- "Saving the dataset (1/3 shards): 33%|▎| 18134/54401 [00:01<00:01, 18438.75 exam"
859
- ]
860
- },
861
- {
862
- "name": "stdout",
863
- "output_type": "stream",
864
- "text": [
865
- "\r",
866
- "Saving the dataset (1/3 shards): 37%|▎| 20134/54401 [00:01<00:01, 17356.03 exam"
867
- ]
868
- },
869
- {
870
- "name": "stdout",
871
- "output_type": "stream",
872
- "text": [
873
- "\r",
874
- "Saving the dataset (1/3 shards): 41%|▍| 22134/54401 [00:01<00:01, 17970.31 exam"
875
- ]
876
- },
877
- {
878
- "name": "stdout",
879
- "output_type": "stream",
880
- "text": [
881
- "\r",
882
- "Saving the dataset (1/3 shards): 44%|▍| 24134/54401 [00:01<00:01, 18401.36 exam"
883
- ]
884
- },
885
- {
886
- "name": "stdout",
887
- "output_type": "stream",
888
- "text": [
889
- "\r",
890
- "Saving the dataset (1/3 shards): 48%|▍| 26134/54401 [00:01<00:01, 18772.52 exam"
891
- ]
892
- },
893
- {
894
- "name": "stdout",
895
- "output_type": "stream",
896
- "text": [
897
- "\r",
898
- "Saving the dataset (1/3 shards): 52%|▌| 28134/54401 [00:01<00:01, 19015.25 exam"
899
- ]
900
- },
901
- {
902
- "name": "stdout",
903
- "output_type": "stream",
904
- "text": [
905
- "\r",
906
- "Saving the dataset (1/3 shards): 55%|▌| 30134/54401 [00:01<00:01, 19175.86 exam"
907
- ]
908
- },
909
- {
910
- "name": "stdout",
911
- "output_type": "stream",
912
- "text": [
913
- "\r",
914
- "Saving the dataset (1/3 shards): 59%|▌| 32134/54401 [00:01<00:01, 19340.44 exam"
915
- ]
916
- },
917
- {
918
- "name": "stdout",
919
- "output_type": "stream",
920
- "text": [
921
- "\r",
922
- "Saving the dataset (1/3 shards): 63%|▋| 34134/54401 [00:01<00:01, 19458.62 exam"
923
- ]
924
- },
925
- {
926
- "name": "stdout",
927
- "output_type": "stream",
928
- "text": [
929
- "\r",
930
- "Saving the dataset (1/3 shards): 67%|▋| 36268/54401 [00:01<00:00, 19480.21 exam\r",
931
- "Saving the dataset (2/3 shards): 67%|▋| 36268/54401 [00:01<00:00, 19480.21 exam"
932
- ]
933
- },
934
- {
935
- "name": "stdout",
936
- "output_type": "stream",
937
- "text": [
938
- "\r",
939
- "Saving the dataset (2/3 shards): 72%|▋| 39268/54401 [00:02<00:00, 19488.59 exam"
940
- ]
941
- },
942
- {
943
- "name": "stdout",
944
- "output_type": "stream",
945
- "text": [
946
- "\r",
947
- "Saving the dataset (2/3 shards): 80%|▊| 43268/54401 [00:02<00:00, 19830.22 exam"
948
- ]
949
- },
950
- {
951
- "name": "stdout",
952
- "output_type": "stream",
953
- "text": [
954
- "\r",
955
- "Saving the dataset (2/3 shards): 87%|▊| 47268/54401 [00:02<00:00, 20058.57 exam"
956
- ]
957
- },
958
- {
959
- "name": "stdout",
960
- "output_type": "stream",
961
- "text": [
962
- "\r",
963
- "Saving the dataset (2/3 shards): 94%|▉| 51268/54401 [00:02<00:00, 20178.13 exam"
964
- ]
965
- },
966
- {
967
- "name": "stdout",
968
- "output_type": "stream",
969
- "text": [
970
- "\r",
971
- "Saving the dataset (2/3 shards): 100%|█| 54401/54401 [00:02<00:00, 20197.74 exam\r",
972
- "Saving the dataset (3/3 shards): 100%|█| 54401/54401 [00:02<00:00, 20197.74 exam\r",
973
- "Saving the dataset (3/3 shards): 100%|█| 54401/54401 [00:02<00:00, 18877.90 exam\r\n",
974
- "\r",
975
- "Saving the dataset (0/1 shards): 0%| | 0/109 [00:00<?, ? examples/s]"
976
- ]
977
- },
978
- {
979
- "name": "stdout",
980
- "output_type": "stream",
981
- "text": [
982
- "\r",
983
- "Saving the dataset (1/1 shards): 100%|█| 109/109 [00:00<00:00, 7330.11 examples/\r",
984
- "Saving the dataset (1/1 shards): 100%|█| 109/109 [00:00<00:00, 7058.50 examples/\r\n"
985
- ]
986
- }
987
- ],
988
- "source": [
989
- "# Lets preload the requried dataset \n",
990
- "!cd \"{TRAINER_DIR}\" && \\\n",
991
- " python3 preload_datapath.py \"{NOTEBOOK_DIR}/enwiki-4k-part1.yaml\""
992
- ]
993
- },
994
- {
995
- "cell_type": "code",
996
- "execution_count": 5,
997
- "id": "5e1ede96",
998
- "metadata": {
999
- "execution": {
1000
- "iopub.execute_input": "2023-09-29T09:58:08.702501Z",
1001
- "iopub.status.busy": "2023-09-29T09:58:08.701875Z",
1002
- "iopub.status.idle": "2023-09-29T10:00:21.515956Z",
1003
- "shell.execute_reply": "2023-09-29T10:00:21.514599Z"
1004
- },
1005
- "papermill": {
1006
- "duration": 132.843495,
1007
- "end_time": "2023-09-29T10:00:21.518814",
1008
- "exception": false,
1009
- "start_time": "2023-09-29T09:58:08.675319",
1010
- "status": "completed"
1011
- },
1012
- "tags": []
1013
- },
1014
- "outputs": [
1015
- {
1016
- "name": "stdout",
1017
- "output_type": "stream",
1018
- "text": [
1019
- "[2023-09-29 09:58:12,868] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n"
1020
- ]
1021
- },
1022
- {
1023
- "name": "stdout",
1024
- "output_type": "stream",
1025
- "text": [
1026
- "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n"
1027
- ]
1028
- },
1029
- {
1030
- "name": "stdout",
1031
- "output_type": "stream",
1032
- "text": [
1033
- "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py:484: UserWarning: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/enwiki-4k-part1.yaml', '--trainer.logger.init_args.name=[Multi-size] v5-L12-D2048-E0.01 - Enwiki-4k Part 1 (train-ctx=4k, deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.devices=auto', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-L12-D2048-E0_01-enwiki-4k-p1/', '--model.load_model=../model/v5-L12-D2048-E0_01-neox-v5base-init.pth', '--model.ctx_len=4096', '--model.bptt_learning_range=1'], args=['fit', '-c', '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/enwiki-4k-part1.yaml', '--trainer.logger.init_args.name=[Multi-size] v5-L12-D2048-E0.01 - Enwiki-4k Part 1 (train-ctx=4k, deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.devices=auto', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-L12-D2048-E0_01-enwiki-4k-p1/', '--model.load_model=../model/v5-L12-D2048-E0_01-neox-v5base-init.pth', '--model.ctx_len=4096', '--model.bptt_learning_range=1'].\r\n",
1034
- " rank_zero_warn(\r\n"
1035
- ]
1036
- },
1037
- {
1038
- "name": "stdout",
1039
- "output_type": "stream",
1040
- "text": [
1041
- "/usr/local/lib/python3.10/dist-packages/lightning/fabric/utilities/seed.py:39: UserWarning: No seed found, seed set to 207026176\r\n",
1042
- " rank_zero_warn(f\"No seed found, seed set to {seed}\")\r\n",
1043
- "Global seed set to 207026176\r\n"
1044
- ]
1045
- },
1046
- {
1047
- "name": "stdout",
1048
- "output_type": "stream",
1049
- "text": [
1050
- "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mpicocreator\u001b[0m (\u001b[33mrwkv-x-dev\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\r\n"
1051
- ]
1052
- },
1053
- {
1054
- "name": "stdout",
1055
- "output_type": "stream",
1056
- "text": [
1057
- "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.15.11\r\n",
1058
- "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m./wandb/run-20230929_095815-3rwyj6ei\u001b[0m\r\n",
1059
- "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\r\n",
1060
- "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33m[Multi-size] v5-L12-D2048-E0.01 - Enwiki-4k Part 1 (train-ctx=4k, deepspeed_stage_1)\u001b[0m\r\n",
1061
- "\u001b[34m\u001b[1mwandb\u001b[0m: ⭐️ View project at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments\u001b[0m\r\n",
1062
- "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/runs/3rwyj6ei\u001b[0m\r\n"
1063
- ]
1064
- },
1065
- {
1066
- "name": "stdout",
1067
- "output_type": "stream",
1068
- "text": [
1069
- "GPU available: True (cuda), used: True\r\n",
1070
- "TPU available: False, using: 0 TPU cores\r\n",
1071
- "IPU available: False, using: 0 IPUs\r\n",
1072
- "HPU available: False, using: 0 HPUs\r\n",
1073
- "\r\n",
1074
- "\r\n",
1075
- "[RWKV.Trainer] Applying 'target_batch_size' with the following:\r\n",
1076
- " - target_batch_size: 32\r\n",
1077
- " - num_nodes: 1\r\n",
1078
- " - num_devices: 1\r\n",
1079
- " - accumulate_grad_batches: 32\r\n",
1080
- " - effective_batch_size: 32\r\n",
1081
- "\r\n"
1082
- ]
1083
- },
1084
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- ... (tqdm frames: "Saving the dataset" 0/3 -> 3/3 shards, 54401/54401 examples, then 1/1 shards, 109/109 examples) ...
- ]
- },
1272
- {
1273
- "name": "stdout",
1274
- "output_type": "stream",
1275
- "text": [
1276
- "[rank: 0] Global seed set to 207026176\r\n",
1277
- "initializing deepspeed distributed: GLOBAL_RANK: 0, MEMBER: 1/1\r\n",
1278
- "[2023-09-29 09:58:33,172] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented\r\n"
1279
- ]
1280
- },
1281
- {
1282
- "name": "stdout",
1283
- "output_type": "stream",
1284
- "text": [
1285
- "Enabling DeepSpeed BF16.\r\n"
1286
- ]
1287
- },
1288
- {
1289
- "name": "stdout",
1290
- "output_type": "stream",
1291
- "text": [
1292
- "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\r\n",
1293
- "#\r\n",
1294
- "# RWKV lighting_trainer.py important notes \r\n",
1295
- "# https://github.com/RWKV/RWKV-infctx-trainer \r\n",
1296
- "#\r\n",
1297
- "# - Ensure your host is not running cuda 12.0 (use either 11.8, or >=12.1), as this is known to have freeze issues\r\n",
1298
- "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\r\n",
1299
- "# - When resuming from checkpoint, the estimated time is inaccurate\r\n",
1300
- "#\r\n",
1301
- "\r\n",
1302
- "[RWKV.model] Configuring optimizer with\r\n",
1303
- " - lr_init: 6.000e-04 (0.0006)\r\n",
1304
- " - lr_final: 5.000e-04 (0.0005)\r\n",
1305
- "\r\n",
1306
- "Using /root/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...\r\n"
1307
- ]
1308
- },
1309
- {
1310
- "name": "stdout",
1311
- "output_type": "stream",
1312
- "text": [
1313
- "Detected CUDA files, patching ldflags\r\n",
1314
- "Emitting ninja build file /root/.cache/torch_extensions/py310_cu118/fused_adam/build.ninja...\r\n",
1315
- "Building extension module fused_adam...\r\n",
1316
- "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\r\n"
1317
- ]
1318
- },
1319
- {
1320
- "name": "stdout",
1321
- "output_type": "stream",
1322
- "text": [
1323
- "ninja: no work to do.\r\n",
1324
- "Loading extension module fused_adam...\r\n",
1325
- "Time to load fused_adam op: 0.07915163040161133 seconds\r\n",
1326
- "Loading `train_dataloader` to estimate number of stepping batches.\r\n"
1327
- ]
1328
- },
1329
- {
1330
- "name": "stdout",
1331
- "output_type": "stream",
1332
- "text": [
1333
- "Rank: 0 partition count [1, 1] and sizes[(860549120, False), (768, False)] \r\n"
1334
- ]
1335
- },
1336
- {
1337
- "name": "stdout",
1338
- "output_type": "stream",
1339
- "text": [
1340
- "\r\n",
1341
- " | Name | Type | Params\r\n",
1342
- "--------------------------------------\r\n",
1343
- "0 | emb | Embedding | 102 M \r\n",
1344
- "1 | blocks | ModuleList | 654 M \r\n",
1345
- "2 | ln_out | LayerNorm | 4.1 K \r\n",
1346
- "3 | head | Linear | 102 M \r\n",
1347
- "--------------------------------------\r\n",
1348
- "860 M Trainable params\r\n",
1349
- "0 Non-trainable params\r\n",
1350
- "860 M Total params\r\n",
1351
- "3,442.200 Total estimated model params size (MB)\r\n"
1352
- ]
1353
- },
1354
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Training: 0it [00:00, ?it/s]\r",
- ... (Epoch 0 progress-bar frames, steps 1-63 of 54401, settling to ~1.4 s/it, train/loss falling from ~11.0 to ~9.4) ...
- ]
- },
1945
- {
1946
- "name": "stdout",
1947
- "output_type": "stream",
1948
- "text": [
1949
- "Traceback (most recent call last):\r\n",
1950
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 278, in <module>\r\n",
1951
- " cli_main()\r\n",
1952
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 253, in cli_main\r\n",
1953
- " LightningCLI(\r\n",
1954
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 353, in __init__\r\n",
1955
- " self._run_subcommand(self.subcommand)\r\n",
1956
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 642, in _run_subcommand\r\n",
1957
- " fn(**fn_kwargs)\r\n",
1958
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py\", line 529, in fit\r\n",
1959
- " call._call_and_handle_interrupt(\r\n",
1960
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/call.py\", line 41, in _call_and_handle_interrupt\r\n",
1961
- " return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)\r\n",
1962
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/strategies/launchers/subprocess_script.py\", line 91, in launch\r\n",
1963
- " return function(*args, **kwargs)\r\n",
1964
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py\", line 568, in _fit_impl\r\n",
1965
- " self._run(model, ckpt_path=ckpt_path)\r\n",
1966
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py\", line 973, in _run\r\n",
1967
- " results = self._run_stage()\r\n",
1968
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py\", line 1016, in _run_stage\r\n",
1969
- " self.fit_loop.run()\r\n",
1970
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/fit_loop.py\", line 201, in run\r\n",
1971
- " self.advance()\r\n",
1972
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/fit_loop.py\", line 354, in advance\r\n",
1973
- " self.epoch_loop.run(self._data_fetcher)\r\n",
1974
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/training_epoch_loop.py\", line 133, in run\r\n",
1975
- " self.advance(data_fetcher)\r\n",
1976
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/training_epoch_loop.py\", line 218, in advance\r\n",
1977
- " batch_output = self.automatic_optimization.run(trainer.optimizers[0], kwargs)\r\n",
1978
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/optimization/automatic.py\", line 185, in run\r\n",
1979
- " self._optimizer_step(kwargs.get(\"batch_idx\", 0), closure)\r\n",
1980
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/optimization/automatic.py\", line 260, in _optimizer_step\r\n",
1981
- " call._call_lightning_module_hook(\r\n",
1982
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/call.py\", line 144, in _call_lightning_module_hook\r\n",
1983
- " output = fn(*args, **kwargs)\r\n",
1984
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/core/module.py\", line 1256, in optimizer_step\r\n",
1985
- " optimizer.step(closure=optimizer_closure)\r\n",
1986
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/core/optimizer.py\", line 155, in step\r\n",
1987
- " step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)\r\n",
1988
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/strategies/ddp.py\", line 256, in optimizer_step\r\n",
1989
- " optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs)\r\n",
1990
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/strategies/strategy.py\", line 225, in optimizer_step\r\n",
1991
- " return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)\r\n",
1992
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/plugins/precision/deepspeed.py\", line 102, in optimizer_step\r\n",
1993
- " return deepspeed_engine.step(**kwargs)\r\n",
1994
- " File \"/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py\", line 2087, in step\r\n",
1995
- " self._take_model_step(lr_kwargs)\r\n",
1996
- " File \"/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py\", line 1994, in _take_model_step\r\n",
1997
- " self.optimizer.step()\r\n",
1998
- " File \"/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage_1_and_2.py\", line 1715, in step\r\n",
1999
- " int(self.partition_size[i])).to(self.single_partition_of_fp32_groups[i].dtype)\r\n",
2000
- "torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.21 GiB (GPU 0; 22.19 GiB total capacity; 14.81 GiB already allocated; 2.39 GiB free; 18.54 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF\r\n",
2001
- "\u001b[34m\u001b[1mwandb\u001b[0m: Waiting for W&B process to finish... \u001b[31m(failed 1).\u001b[0m Press Control-C to abort syncing.\r\n"
2002
- ]
2003
- },
2004
- {
2005
- "name": "stdout",
2006
- "output_type": "stream",
2007
- "text": [
2008
- "\u001b[34m\u001b[1mwandb\u001b[0m: - 0.005 MB of 0.005 MB uploaded (0.000 MB deduped)\r"
2009
- ]
2010
- },
2011
- {
2012
- "name": "stdout",
2013
- "output_type": "stream",
2014
- "text": [
2015
- "\u001b[34m\u001b[1mwandb\u001b[0m: \\ 0.005 MB of 0.005 MB uploaded (0.000 MB deduped)\r"
2016
- ]
2017
- },
2018
- {
2019
- "name": "stdout",
2020
- "output_type": "stream",
2021
- "text": [
2022
- "\u001b[34m\u001b[1mwandb\u001b[0m: | 0.005 MB of 0.005 MB uploaded (0.000 MB deduped)\r"
2023
- ]
2024
- },
2025
- {
2026
- "name": "stdout",
2027
- "output_type": "stream",
2028
- "text": [
2029
- "\u001b[34m\u001b[1mwandb\u001b[0m: / 0.005 MB of 0.005 MB uploaded (0.000 MB deduped)\r"
2030
- ]
2031
- },
2032
- {
2033
- "name": "stdout",
2034
- "output_type": "stream",
2035
- "text": [
2036
- "\u001b[34m\u001b[1mwandb\u001b[0m: \r\n",
2037
- "\u001b[34m\u001b[1mwandb\u001b[0m: Run history:\r\n",
2038
- "\u001b[34m\u001b[1mwandb\u001b[0m: batchidx ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███\r\n",
2039
- "\u001b[34m\u001b[1mwandb\u001b[0m: global_rank ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁\r\n",
2040
- "\u001b[34m\u001b[1mwandb\u001b[0m: perf/tokens_per_sec.gpu.0 ▁▁▃▃▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇███████████████\r\n",
2041
- "\u001b[34m\u001b[1mwandb\u001b[0m: perf/tokens_total.gpu.0 ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███\r\n",
2042
- "\u001b[34m\u001b[1mwandb\u001b[0m: real_ctx_len ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁\r\n",
2043
- "\u001b[34m\u001b[1mwandb\u001b[0m: substep ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███\r\n",
2044
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/loss ████████████████████▂▁▃▂▂▂▃▂▂▂▂▂▃▂▂▂▃▂▃▃\r\n",
2045
- "\u001b[34m\u001b[1mwandb\u001b[0m: trainer/global_step ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁████████████████████\r\n",
2046
- "\u001b[34m\u001b[1mwandb\u001b[0m: trainer/learning_rate ████████████████████▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁\r\n",
2047
- "\u001b[34m\u001b[1mwandb\u001b[0m: \r\n",
2048
- "\u001b[34m\u001b[1mwandb\u001b[0m: Run summary:\r\n",
2049
- "\u001b[34m\u001b[1mwandb\u001b[0m: batchidx 63\r\n",
2050
- "\u001b[34m\u001b[1mwandb\u001b[0m: global_rank 0\r\n",
2051
- "\u001b[34m\u001b[1mwandb\u001b[0m: perf/tokens_per_sec.gpu.0 3052.94065\r\n",
2052
- "\u001b[34m\u001b[1mwandb\u001b[0m: perf/tokens_total.gpu.0 262080\r\n",
2053
- "\u001b[34m\u001b[1mwandb\u001b[0m: real_ctx_len 4095\r\n",
2054
- "\u001b[34m\u001b[1mwandb\u001b[0m: substep 63\r\n",
2055
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/loss 9.625\r\n",
2056
- "\u001b[34m\u001b[1mwandb\u001b[0m: trainer/global_step 1\r\n",
2057
- "\u001b[34m\u001b[1mwandb\u001b[0m: trainer/learning_rate 0.0006\r\n",
2058
- "\u001b[34m\u001b[1mwandb\u001b[0m: \r\n",
2059
- "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run \u001b[33m[Multi-size] v5-L12-D2048-E0.01 - Enwiki-4k Part 1 (train-ctx=4k, deepspeed_stage_1)\u001b[0m at: \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/runs/3rwyj6ei\u001b[0m\r\n",
2060
- "\u001b[34m\u001b[1mwandb\u001b[0m: ️⚡ View job at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjk0OTk4MDcy/version_details/v4\u001b[0m\r\n",
2061
- "\u001b[34m\u001b[1mwandb\u001b[0m: Synced 5 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)\r\n",
2062
- "\u001b[34m\u001b[1mwandb\u001b[0m: Find logs at: \u001b[35m\u001b[1m./wandb/run-20230929_095815-3rwyj6ei/logs\u001b[0m\r\n"
2063
- ]
2064
- }
2065
- ],
2066
- "source": [
2067
- "# Start the foundation model training\n",
2068
- "!cd \"{TRAINER_DIR}\" && \\\n",
2069
- " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n",
2070
- " python3 lightning_trainer.py fit \\\n",
2071
- " -c \"{NOTEBOOK_DIR}/enwiki-4k-part1.yaml\" \\\n",
2072
- " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Enwiki-4k Part 1 (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n",
2073
- " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n",
2074
- " --trainer.devices=\"{GPU_DEVICES}\" \\\n",
2075
- " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-enwiki-4k-p1/\" \\\n",
2076
- " --model.load_model=\"../model/{FILENAME_PREFIX}-neox-v5base-init.pth\" \\\n",
2077
- " --model.ctx_len=4096 \\\n",
2078
- " --model.bptt_learning_range=1"
2079
- ]
2080
- },
2081
- {
2082
- "cell_type": "code",
2083
- "execution_count": 6,
2084
- "id": "73f2dbdc",
2085
- "metadata": {
2086
- "execution": {
2087
- "iopub.execute_input": "2023-09-29T10:00:21.556396Z",
2088
- "iopub.status.busy": "2023-09-29T10:00:21.555187Z",
2089
- "iopub.status.idle": "2023-09-29T10:00:25.260439Z",
2090
- "shell.execute_reply": "2023-09-29T10:00:25.259001Z"
2091
- },
2092
- "papermill": {
2093
- "duration": 3.72759,
2094
- "end_time": "2023-09-29T10:00:25.263203",
2095
- "exception": false,
2096
- "start_time": "2023-09-29T10:00:21.535613",
2097
- "status": "completed"
2098
- },
2099
- "tags": []
2100
- },
2101
- "outputs": [
2102
- {
2103
- "name": "stdout",
2104
- "output_type": "stream",
2105
- "text": [
2106
- "[2023-09-29 10:00:23,854] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n"
2107
- ]
2108
- },
2109
- {
2110
- "name": "stdout",
2111
- "output_type": "stream",
2112
- "text": [
2113
- "Traceback (most recent call last):\r\n",
2114
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 651, in <module>\r\n",
2115
- " convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, output_file, save_dtype=args.dtype)\r\n",
2116
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 542, in convert_zero_checkpoint_to_fp32_state_dict\r\n",
2117
- " state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)\r\n",
2118
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 516, in get_fp32_state_dict_from_zero_checkpoint\r\n",
2119
- " raise ValueError(f\"Unable to find 'latest' file at {latest_path}\")\r\n",
2120
- "ValueError: Unable to find 'latest' file at ../checkpoint/v5-L12-D2048-E0_01-enwiki-4k-p1/last.ckpt/latest\r\n"
2121
- ]
2122
- },
2123
- {
2124
- "name": "stdout",
2125
- "output_type": "stream",
2126
- "text": [
2127
- "ls: cannot access '../model/v5-L12-D2048-E0_01-enwiki-4k-p1.pth': No such file or directory\r\n"
2128
- ]
2129
- }
2130
- ],
2131
- "source": [
2132
-    "# Let's export the model from the checkpoint\n",
2133
- "!cd \"{TRAINER_DIR}\" && \\\n",
2134
- " python3 export_checkpoint.py \"../checkpoint/{FILENAME_PREFIX}-enwiki-4k-p1/last.ckpt\" \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\" \"bf16\"\n",
2135
- "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\""
2136
- ]
2137
- },
2138
- {
2139
- "cell_type": "code",
2140
- "execution_count": 7,
2141
- "id": "9b1932b1",
2142
- "metadata": {
2143
- "execution": {
2144
- "iopub.execute_input": "2023-09-29T10:00:25.302083Z",
2145
- "iopub.status.busy": "2023-09-29T10:00:25.300897Z",
2146
- "iopub.status.idle": "2023-09-29T10:00:31.273775Z",
2147
- "shell.execute_reply": "2023-09-29T10:00:31.272586Z"
2148
- },
2149
- "papermill": {
2150
- "duration": 5.996558,
2151
- "end_time": "2023-09-29T10:00:31.277049",
2152
- "exception": false,
2153
- "start_time": "2023-09-29T10:00:25.280491",
2154
- "status": "completed"
2155
- },
2156
- "tags": []
2157
- },
2158
- "outputs": [
2159
- {
2160
- "name": "stdout",
2161
- "output_type": "stream",
2162
- "text": [
2163
- "[2023-09-29 10:00:29,417] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n"
2164
- ]
2165
- },
2166
- {
2167
- "name": "stdout",
2168
- "output_type": "stream",
2169
- "text": [
2170
- "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n",
2171
- "Traceback (most recent call last):\r\n",
2172
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/dragon_test.py\", line 52, in <module>\r\n",
2173
- " model = SimpleRWKV(MODEL_PATH, device=DEVICE)\r\n",
2174
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 1420, in __init__\r\n",
2175
- " self.model = RWKV(**model_config)\r\n",
2176
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 566, in __init__\r\n",
2177
- " raise ValueError(f\"load_model file '{load_model}' does not exist\")\r\n",
2178
- "ValueError: load_model file '../model/v5-L12-D2048-E0_01-enwiki-4k-p1.pth' does not exist\r\n"
2179
- ]
2180
- }
2181
- ],
2182
- "source": [
2183
-    "# Let's do a quick dragon prompt validation\n",
2184
- "!cd \"{INFERENCE_DIR}\" && \\\n",
2185
- " python3 dragon_test.py \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\" \"cuda fp32\""
2186
- ]
2187
- }
2188
- ],
2189
- "metadata": {
2190
- "kernelspec": {
2191
- "display_name": "Python 3 (ipykernel)",
2192
- "language": "python",
2193
- "name": "python3"
2194
- },
2195
- "language_info": {
2196
- "codemirror_mode": {
2197
- "name": "ipython",
2198
- "version": 3
2199
- },
2200
- "file_extension": ".py",
2201
- "mimetype": "text/x-python",
2202
- "name": "python",
2203
- "nbconvert_exporter": "python",
2204
- "pygments_lexer": "ipython3",
2205
- "version": "3.10.12"
2206
- },
2207
- "papermill": {
2208
- "default_parameters": {},
2209
- "duration": 201.43354,
2210
- "end_time": "2023-09-29T10:00:31.714898",
2211
- "environment_variables": {},
2212
- "exception": null,
2213
- "input_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-part1.ipynb",
2214
- "output_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-part1.ipynb",
2215
- "parameters": {},
2216
- "start_time": "2023-09-29T09:57:10.281358",
2217
- "version": "2.4.0"
2218
- }
2219
- },
2220
- "nbformat": 4,
2221
- "nbformat_minor": 5
2222
- }
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b53c27ed2c20b9f1f690647a83c0fbe2ce09594518b9ec557f515a4f8b548f2b
3
+ size 15941299
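
Note on the added lines: the three "+" lines above are a Git LFS pointer, which replaces the notebook's raw JSON (the removed hunk above, ending in the truncated training logs, CUDA out-of-memory traceback, and failed checkpoint-export output) with a reference to an object identified by its SHA-256 digest and byte size. As an illustrative sketch only (the helper name and local path are assumptions, not part of this commit), the pointer format can be read in Python like this:

    # Minimal sketch: parse a Git LFS pointer file of the form
    #   version <spec-url>
    #   oid sha256:<hex digest>
    #   size <bytes>
    from pathlib import Path

    def read_lfs_pointer(path: str) -> dict:
        # Each non-empty line is "key value"; split on the first space only.
        fields = dict(line.split(" ", 1) for line in Path(path).read_text().splitlines() if line)
        return {
            "version": fields["version"],
            "oid": fields["oid"].removeprefix("sha256:"),
            "size_bytes": int(fields["size"]),
        }

    # Hypothetical usage against the file added by this commit:
    print(read_lfs_pointer("experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-part1.ipynb"))
    # e.g. {'version': 'https://git-lfs.github.com/spec/v1', 'oid': 'b53c27ed...', 'size_bytes': 15941299}

The notebook content itself is no longer stored as plain text in the repository; it would typically be fetched through git-lfs (for example, "git lfs pull --include=<path>", assuming git-lfs is installed).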