{ "cells": [ { "attachments": {}, "cell_type": "markdown", "id": "430dbd1b", "metadata": { "papermill": { "duration": 0.005154, "end_time": "2023-08-29T10:52:03.691555", "exception": false, "start_time": "2023-08-29T10:52:03.686401", "status": "completed" }, "tags": [] }, "source": [ "# RWKV v5 / embedding init-range 1e-01 / 4k\n", "\n", "- 96 layers\n", "- 1024 embedding size\n", "\n", "Going through the modified memory training for v5 models, across various initial embedding model weights\n", "\n", "**Note:** This project assumes you have the rwkv-infctx conda env set up" ] }, { "attachments": {}, "cell_type": "markdown", "id": "9d410278", "metadata": { "papermill": { "duration": 0.003177, "end_time": "2023-08-29T10:52:03.698289", "exception": false, "start_time": "2023-08-29T10:52:03.695112", "status": "completed" }, "tags": [] }, "source": [ "# Basic Setup" ] }, { "cell_type": "code", "execution_count": 1, "id": "ee9adae2", "metadata": { "execution": { "iopub.execute_input": "2023-08-29T10:52:03.706507Z", "iopub.status.busy": "2023-08-29T10:52:03.706179Z", "iopub.status.idle": "2023-08-29T10:52:04.431883Z", "shell.execute_reply": "2023-08-29T10:52:04.430945Z" }, "papermill": { "duration": 0.732611, "end_time": "2023-08-29T10:52:04.434319", "exception": false, "start_time": "2023-08-29T10:52:03.701708", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# First, let's set up the various directories, and init the model\n", "!mkdir -p ../../../../model/\n", "!mkdir -p ../../../../datapath/\n", "!mkdir -p ../../../../checkpoint/" ] }, { "cell_type": "code", "execution_count": 2, "id": "fed25227", "metadata": { "execution": { "iopub.execute_input": "2023-08-29T10:52:04.443095Z", "iopub.status.busy": "2023-08-29T10:52:04.442902Z", "iopub.status.idle": "2023-08-29T10:52:07.311465Z", "shell.execute_reply": "2023-08-29T10:52:07.310597Z" }, "papermill": { "duration": 2.874873, "end_time": "2023-08-29T10:52:07.313158", "exception": false, "start_time": 
"2023-08-29T10:52:04.438285", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\r\n", "\u001b[0m" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.1.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\r\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\r\n" ] } ], "source": [ "# Additional dependencies for eval stuff\n", "!pip install -q aiocsv aiofiles" ] }, { "cell_type": "code", "execution_count": 3, "id": "e6f3b0a3", "metadata": { "execution": { "iopub.execute_input": "2023-08-29T10:52:07.321866Z", "iopub.status.busy": "2023-08-29T10:52:07.321676Z", "iopub.status.idle": "2023-08-29T10:52:07.328125Z", "shell.execute_reply": "2023-08-29T10:52:07.327553Z" }, "papermill": { "duration": 0.012062, "end_time": "2023-08-29T10:52:07.329205", "exception": false, "start_time": "2023-08-29T10:52:07.317143", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DEEPSPEED_STRAT: deepspeed_stage_1\n", "ENABLE_WANDB: True\n", "GPU_DEVICES: auto\n", "NOTEBOOK_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-memory\n", "INFERENCE_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5\n", "TRAINER_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5\n", "PROJECT_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer\n" 
] } ], "source": [ "DEEPSPEED_STRAT=\"deepspeed_stage_1\"\n", "GPU_DEVICES=\"auto\"\n", "ENABLE_WANDB=True\n", "\n", "# Layer count and embed dim to start with\n", "LAYER_COUNT=96\n", "EMBED_DIM=1024\n", "\n", "# WaveNet compatibility?\n", "RWKV_WAVENET_LAYERS=0\n", "\n", "EMBED_SCALE=0.1\n", "EMBED_SCALE_LABEL=str(EMBED_SCALE).replace(\".\", \"_\")\n", "\n", "WANDB_PREFIX=f\"v5-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE}\"\n", "FILENAME_PREFIX=f\"v5-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE_LABEL}\"\n", "\n", "print(\"DEEPSPEED_STRAT:\", DEEPSPEED_STRAT)\n", "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n", "print(\"GPU_DEVICES:\", GPU_DEVICES)\n", "\n", "if ENABLE_WANDB:\n", " WANDB_MODE=\"online\"\n", "else:\n", " WANDB_MODE=\"disabled\"\n", "\n", "# Computing the notebook directory and various paths\n", "import os\n", "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n", "PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"../../../../\"))\n", "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", "INFERENCE_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", "\n", "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n", "print(\"INFERENCE_DIR:\", INFERENCE_DIR)\n", "print(\"TRAINER_DIR:\", TRAINER_DIR)\n", "print(\"PROJECT_DIR:\", PROJECT_DIR)" ] }, { "cell_type": "code", "execution_count": 4, "id": "f8f06ddc", "metadata": { "execution": { "iopub.execute_input": "2023-08-29T10:52:07.337224Z", "iopub.status.busy": "2023-08-29T10:52:07.337057Z", "iopub.status.idle": "2023-08-29T10:54:16.966582Z", "shell.execute_reply": "2023-08-29T10:54:16.965488Z" }, "papermill": { "duration": 129.636235, "end_time": "2023-08-29T10:54:16.969097", "exception": false, "start_time": "2023-08-29T10:52:07.332862", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Setting ds_accelerator to cuda (auto detect)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[RWKV.model] Running RWKV model 
using 'torch-jit' with torch '2.0.1+cu118'\r\n", "---- Initializing model ----\r\n", "No of layers: 96\r\n", "Embedding size: 1024\r\n", "Output model path: ../model/v5-L96-D1024-E0_1-neox-init.pth\r\n", "Vocab size: 50277\r\n", "Emb scale: 0.1\r\n", "Note: this process takes a significant time (and ram) for large models\r\n", "---- ----- ----\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "50277 1024 -0.1 emb.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.0.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.0.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.0.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.0.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.0.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.0.ffn.receptance.weight\r\n", "1024 4096 0 blocks.0.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.1.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.1.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.1.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.1.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.1.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.1.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.1.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.2.att.receptance.weight\r\n" ] }, { "name": 
"stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.2.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.2.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.2.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.2.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.2.ffn.receptance.weight\r\n", "1024 4096 0 blocks.2.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.3.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.3.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.3.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.3.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.3.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.3.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.3.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.4.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.4.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.4.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.4.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.4.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.4.ffn.receptance.weight\r\n", "1024 4096 0 blocks.4.ffn.value.weight\r\n" ] }, { "name": 
"stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.5.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.5.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.5.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.5.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.5.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.5.ffn.receptance.weight\r\n", "1024 4096 0 blocks.5.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.6.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.6.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.6.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.6.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.6.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.6.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.6.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.7.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.7.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.7.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.7.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.7.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": 
[ "1024 1024 0 blocks.7.ffn.receptance.weight\r\n", "1024 4096 0 blocks.7.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.8.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.8.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.8.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.8.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.8.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.8.ffn.receptance.weight\r\n", "1024 4096 0 blocks.8.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.9.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.9.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.9.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.9.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.9.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.9.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.9.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.10.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.10.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.10.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.10.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": 
"stream", "text": [ "4096 1024 1.0 blocks.10.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.10.ffn.receptance.weight\r\n", "1024 4096 0 blocks.10.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.11.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.11.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.11.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.11.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.11.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.11.ffn.receptance.weight\r\n", "1024 4096 0 blocks.11.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.12.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.12.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.12.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.12.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.12.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.12.ffn.receptance.weight\r\n", "1024 4096 0 blocks.12.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.13.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.13.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.13.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 
blocks.13.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.13.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.13.ffn.receptance.weight\r\n", "1024 4096 0 blocks.13.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.14.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.14.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.14.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.14.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.14.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.14.ffn.receptance.weight\r\n", "1024 4096 0 blocks.14.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.15.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.15.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.15.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.15.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.15.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.15.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.15.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.16.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.16.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": 
"stream", "text": [ "1024 1024 1.0 blocks.16.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.16.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.16.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.16.ffn.receptance.weight\r\n", "1024 4096 0 blocks.16.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.17.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.17.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.17.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.17.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.17.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.17.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.17.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.18.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.18.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.18.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.18.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.18.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.18.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.18.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 
1024 1.0 blocks.19.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.19.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.19.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.19.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.19.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.19.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.19.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.20.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.20.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.20.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.20.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.20.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.20.ffn.receptance.weight\r\n", "1024 4096 0 blocks.20.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.21.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.21.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.21.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.21.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.21.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 
blocks.21.ffn.receptance.weight\r\n", "1024 4096 0 blocks.21.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.22.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.22.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.22.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.22.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.22.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.22.ffn.receptance.weight\r\n", "1024 4096 0 blocks.22.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.23.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.23.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.23.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.23.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.23.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.23.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.23.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.24.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.24.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.24.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.24.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": 
"stream", "text": [ "4096 1024 1.0 blocks.24.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.24.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.24.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.25.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.25.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.25.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.25.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.25.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.25.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.25.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.26.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.26.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.26.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.26.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.26.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.26.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.26.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.27.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.27.att.key.weight\r\n" ] 
}, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.27.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.27.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.27.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.27.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.27.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.28.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.28.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.28.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.28.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.28.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.28.ffn.receptance.weight\r\n", "1024 4096 0 blocks.28.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.29.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.29.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.29.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.29.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.29.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.29.ffn.receptance.weight\r\n", "1024 4096 0 blocks.29.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 
blocks.30.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.30.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.30.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.30.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.30.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.30.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.30.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.31.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.31.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.31.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.31.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.31.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.31.ffn.receptance.weight\r\n", "1024 4096 0 blocks.31.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.32.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.32.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.32.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.32.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.32.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 
blocks.32.ffn.receptance.weight\r\n", "1024 4096 0 blocks.32.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.33.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.33.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.33.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.33.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.33.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.33.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.33.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.34.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.34.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.34.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.34.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.34.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.34.ffn.receptance.weight\r\n", "1024 4096 0 blocks.34.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.35.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.35.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.35.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.35.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": 
"stream", "text": [ "4096 1024 1.0 blocks.35.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.35.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.35.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.36.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.36.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.36.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.36.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.36.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.36.ffn.receptance.weight\r\n", "1024 4096 0 blocks.36.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.37.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.37.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.37.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.37.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.37.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.37.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.37.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.38.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.38.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ 
"1024 1024 1.0 blocks.38.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.38.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.38.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.38.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.38.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.39.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.39.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.39.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.39.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.39.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.39.ffn.receptance.weight\r\n", "1024 4096 0 blocks.39.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.40.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.40.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.40.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.40.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.40.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.40.ffn.receptance.weight\r\n", "1024 4096 0 blocks.40.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.41.att.receptance.weight\r\n" ] }, { "name": "stdout", 
"output_type": "stream", "text": [ "1024 1024 1.0 blocks.41.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.41.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.41.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.41.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.41.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.41.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.42.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.42.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.42.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.42.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.42.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.42.ffn.receptance.weight\r\n", "1024 4096 0 blocks.42.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.43.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.43.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.43.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.43.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.43.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.43.ffn.receptance.weight\r\n", "1024 4096 0 blocks.43.ffn.value.weight\r\n" ] }, 
{ "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.44.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.44.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.44.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.44.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.44.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.44.ffn.receptance.weight\r\n", "1024 4096 0 blocks.44.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.45.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.45.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.45.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.45.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.45.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.45.ffn.receptance.weight\r\n", "1024 4096 0 blocks.45.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.46.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.46.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.46.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.46.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.46.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 
blocks.46.ffn.receptance.weight\r\n", "1024 4096 0 blocks.46.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.47.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.47.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.47.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.47.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.47.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.47.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.47.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.48.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.48.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.48.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.48.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.48.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.48.ffn.receptance.weight\r\n", "1024 4096 0 blocks.48.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.49.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.49.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.49.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.49.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": 
"stream", "text": [ "4096 1024 1.0 blocks.49.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.49.ffn.receptance.weight\r\n", "1024 4096 0 blocks.49.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.50.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.50.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.50.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.50.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.50.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.50.ffn.receptance.weight\r\n", "1024 4096 0 blocks.50.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.51.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.51.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.51.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.51.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.51.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.51.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.51.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.52.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.52.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.52.att.value.weight\r\n" ] }, { 
"name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.52.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.52.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.52.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.52.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.53.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.53.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.53.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.53.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.53.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.53.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.53.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.54.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.54.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.54.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.54.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.54.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.54.ffn.receptance.weight\r\n", "1024 4096 0 blocks.54.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.55.att.receptance.weight\r\n" ] }, { "name": "stdout", 
"output_type": "stream", "text": [ "1024 1024 1.0 blocks.55.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.55.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.55.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.55.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.55.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.55.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.56.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.56.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.56.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.56.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.56.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.56.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.56.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.57.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.57.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.57.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.57.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.57.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 
blocks.57.ffn.receptance.weight\r\n", "1024 4096 0 blocks.57.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.58.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.58.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.58.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.58.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.58.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.58.ffn.receptance.weight\r\n", "1024 4096 0 blocks.58.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.59.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.59.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.59.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.59.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.59.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.59.ffn.receptance.weight\r\n", "1024 4096 0 blocks.59.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.60.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.60.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.60.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.60.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.60.ffn.key.weight\r\n" 
] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.60.ffn.receptance.weight\r\n", "1024 4096 0 blocks.60.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.61.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.61.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.61.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.61.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.61.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.61.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.61.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.62.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.62.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.62.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.62.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.62.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.62.ffn.receptance.weight\r\n", "1024 4096 0 blocks.62.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.63.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.63.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.63.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 
blocks.63.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.63.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.63.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.63.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.64.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.64.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.64.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.64.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.64.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.64.ffn.receptance.weight\r\n", "1024 4096 0 blocks.64.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.65.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.65.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.65.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.65.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.65.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.65.ffn.receptance.weight\r\n", "1024 4096 0 blocks.65.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.66.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.66.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": 
"stream", "text": [ "1024 1024 1.0 blocks.66.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.66.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.66.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.66.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.66.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.67.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.67.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.67.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.67.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.67.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.67.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.67.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.68.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.68.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.68.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.68.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.68.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.68.ffn.receptance.weight\r\n", "1024 4096 0 blocks.68.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 
1024 1.0 blocks.69.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.69.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.69.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.69.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.69.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.69.ffn.receptance.weight\r\n", "1024 4096 0 blocks.69.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.70.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.70.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.70.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.70.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.70.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.70.ffn.receptance.weight\r\n", "1024 4096 0 blocks.70.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.71.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.71.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.71.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.71.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.71.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.71.ffn.receptance.weight\r\n" ] }, { "name": "stdout", 
"output_type": "stream", "text": [ "1024 4096 0 blocks.71.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.72.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.72.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.72.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.72.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.72.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.72.ffn.receptance.weight\r\n", "1024 4096 0 blocks.72.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.73.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.73.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.73.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.73.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.73.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.73.ffn.receptance.weight\r\n", "1024 4096 0 blocks.73.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.74.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.74.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.74.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.74.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.74.ffn.key.weight\r\n" ] 
}, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.74.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.74.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.75.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.75.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.75.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.75.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.75.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.75.ffn.receptance.weight\r\n", "1024 4096 0 blocks.75.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.76.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.76.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.76.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.76.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.76.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.76.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.76.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.77.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.77.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.77.att.value.weight\r\n" ] }, { "name": 
"stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.77.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.77.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.77.ffn.receptance.weight\r\n", "1024 4096 0 blocks.77.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.78.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.78.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.78.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.78.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.78.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.78.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.78.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.79.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.79.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.79.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.79.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.79.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.79.ffn.receptance.weight\r\n", "1024 4096 0 blocks.79.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.80.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 
blocks.80.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.80.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.80.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.80.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.80.ffn.receptance.weight\r\n", "1024 4096 0 blocks.80.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.81.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.81.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.81.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.81.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.81.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.81.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.81.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.82.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.82.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.82.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.82.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.82.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.82.ffn.receptance.weight\r\n", "1024 4096 0 blocks.82.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", 
"text": [ "1024 1024 1.0 blocks.83.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.83.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.83.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.83.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.83.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.83.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.83.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.84.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.84.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.84.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.84.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.84.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.84.ffn.receptance.weight\r\n", "1024 4096 0 blocks.84.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.85.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.85.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.85.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.85.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.85.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 
blocks.85.ffn.receptance.weight\r\n", "1024 4096 0 blocks.85.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.86.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.86.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.86.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.86.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.86.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.86.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.86.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.87.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.87.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.87.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.87.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.87.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.87.ffn.receptance.weight\r\n", "1024 4096 0 blocks.87.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.88.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.88.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.88.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.88.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": 
"stream", "text": [ "4096 1024 1.0 blocks.88.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.88.ffn.receptance.weight\r\n", "1024 4096 0 blocks.88.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.89.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.89.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.89.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.89.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.89.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.89.ffn.receptance.weight\r\n", "1024 4096 0 blocks.89.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.90.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.90.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.90.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.90.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.90.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.90.ffn.receptance.weight\r\n", "1024 4096 0 blocks.90.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.91.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.91.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.91.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 
blocks.91.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.91.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.91.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.91.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.92.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.92.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.92.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.92.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.92.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.92.ffn.receptance.weight\r\n", "1024 4096 0 blocks.92.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.93.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.93.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.93.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.93.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.93.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.93.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 4096 0 blocks.93.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.94.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 
blocks.94.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.94.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.94.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.94.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.94.ffn.receptance.weight\r\n", "1024 4096 0 blocks.94.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.95.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.95.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 1.0 blocks.95.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.95.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "4096 1024 1.0 blocks.95.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "1024 1024 0 blocks.95.ffn.receptance.weight\r\n", "1024 4096 0 blocks.95.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "50277 1024 0.5 head.weight\r\n" ] } ], "source": [ "# Init the model\n", "!cd \"{TRAINER_DIR}\" && \\\n", " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", " python3 ./init_model.py \\\n", " --n_layer \"{LAYER_COUNT}\" --n_embd \"{EMBED_DIM}\" \\\n", " --emb-scale \"{EMBED_SCALE}\" \\\n", " --vocab_size neox --skip-if-exists \\\n", " \"../model/{FILENAME_PREFIX}-neox-init.pth\"" ] }, { "cell_type": "markdown", "id": "369a71fe", "metadata": { "papermill": { "duration": 0.032056, "end_time": "2023-08-29T10:54:17.086208", "exception": false, "start_time": "2023-08-29T10:54:17.054152", "status": "completed" }, "tags": [] }, "source": [ "## Enwiki Stage 1 : Foundation 4k model training" ] }, { "cell_type": "code", 
"execution_count": 5, "id": "b15d63e6", "metadata": { "execution": { "iopub.execute_input": "2023-08-29T10:54:17.152458Z", "iopub.status.busy": "2023-08-29T10:54:17.152217Z", "iopub.status.idle": "2023-08-29T10:54:28.463275Z", "shell.execute_reply": "2023-08-29T10:54:28.462382Z" }, "papermill": { "duration": 11.34712, "end_time": "2023-08-29T10:54:28.465462", "exception": false, "start_time": "2023-08-29T10:54:17.118342", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Found cached dataset parquet (/actions-runner/.cache/huggingface/datasets/teven___parquet/teven--enwiki_100k-1359e81b212c2dd6/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)\r\n", "\r", " 0%| | 0/1 [00:00=12.1), as this is known to have freeze issues\r\n", "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\r\n", "# - When resuming from checkpoint, the estimated time is inaccurate\r\n", "#\r\n", "\r\n", "[RWKV.model] Configuring optimizer with\r\n", " - lr_init: 6.000e-04 (0.0006)\r\n", " - lr_final: 4.000e-04 (0.0004)\r\n", "\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Detected CUDA files, patching ldflags\r\n", "Emitting ninja build file 
/root/.cache/torch_extensions/py311_cu118/fused_adam/build.ninja...\r\n", "Building extension module fused_adam...\r\n", "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "ninja: no work to do.\r\n", "Loading extension module fused_adam...\r\n", "Time to load fused_adam op: 0.0686655044555664 seconds\r\n", "Loading extension module fused_adam...\r\n", "Loading extension module fused_adam...\r\n", "Loading extension module fused_adam...\r\n", "Time to load fused_adam op: 0.10175180435180664 seconds\r\n", "Loading extension module fused_adam...\r\n", "Loading extension module fused_adam...\r\n", "Loading extension module fused_adam...\r\n", "Time to load fused_adam op: 0.10137462615966797 seconds\r\n", "Time to load fused_adam op: 0.10145688056945801 seconds\r\n", "Time to load fused_adam op: 0.10144901275634766 seconds\r\n", "Time to load fused_adam op: 0.10149240493774414 seconds\r\n", "Time to load fused_adam op: 0.1016225814819336 seconds\r\n", "Loading `train_dataloader` to estimate number of stepping batches.\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Loading extension module fused_adam...\r\n", "Time to load fused_adam op: 0.10149359703063965 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using 
/root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/utils/build.ninja...\r\n", "Building extension module utils...\r\n", "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\r\n", "ninja: no work to do.\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Loading extension module utils...\r\n", "Time to load utils op: 0.07332181930541992 seconds\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.10331416130065918 seconds\r\n", "Loading extension module utils...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.10317444801330566 seconds\r\n", "Time to load utils op: 0.10267972946166992 seconds\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.1027224063873291 seconds\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.10244035720825195 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Loading extension module utils...\r\n", "Time to load utils op: 0.10284280776977539 seconds\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.1026923656463623 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 1 partition count [8, 8] and sizes[(176584448, False), (384, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 2 partition count [8, 8] and sizes[(176584448, False), (384, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 7 partition count [8, 8] and sizes[(176584448, False), (384, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 5 partition count [8, 8] and sizes[(176584448, False), (384, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 6 
partition count [8, 8] and sizes[(176584448, False), (384, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 3 partition count [8, 8] and sizes[(176584448, False), (384, False)] \r\n", "Rank: 4 partition count [8, 8] and sizes[(176584448, False), (384, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 0 partition count [8, 8] and sizes[(176584448, False), (384, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0007910728454589844 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0006494522094726562 seconds\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Time to load utils op: 0.0006210803985595703 seconds\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0007061958312988281 seconds\r\n", "Time to load utils op: 0.0006842613220214844 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No 
modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0006155967712402344 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.000637054443359375 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0009512901306152344 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r\n", " | Name | Type | Params\r\n", "--------------------------------------\r\n", "0 | emb | Embedding | 51.5 M\r\n", "1 | blocks | ModuleList | 1.3 B \r\n", "2 | ln_out | LayerNorm | 2.0 K \r\n", "3 | head | Linear | 51.5 M\r\n", "--------------------------------------\r\n", "1.4 B Trainable params\r\n", "0 Non-trainable params\r\n", "1.4 B Total params\r\n", "5,650.715 Total estimated model params size (MB)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "Training: 0it [00:00, ?it/s]\r", "Training: 0%| | 0/10186 [00:00=12.1), as this is known to have freeze issues\r\n", "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\r\n", "# - When resuming from checkpoint, the estimated time is inaccurate\r\n", "#\r\n", "\r\n", "[RWKV.model] Configuring optimizer with\r\n", " - lr_init: 4.000e-04 (0.0004)\r\n", " - lr_final: 3.000e-04 (0.0003)\r\n", "\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using 
/root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Detected CUDA files, patching ldflags\r\n", "Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/fused_adam/build.ninja...\r\n", "Building extension module fused_adam...\r\n", "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "ninja: no work to do.\r\n", "Loading extension module fused_adam...\r\n", "Time to load fused_adam op: 0.07061624526977539 seconds\r\n", "Loading extension module fused_adam...\r\n", "Loading extension module fused_adam...\r\n", "Loading extension module fused_adam...\r\n", "Time to load fused_adam op: 0.1013798713684082 seconds\r\n", "Time to load fused_adam op: 0.10146570205688477 seconds\r\n", "Time to load fused_adam op: 0.10148930549621582 seconds\r\n", "Loading extension module fused_adam...\r\n", "Loading extension module fused_adam...\r\n", "Loading extension module fused_adam...\r\n", "Time to load fused_adam op: 0.10149049758911133 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Time to load fused_adam op: 0.10142898559570312 seconds\r\n", "Loading extension module fused_adam...\r\n", "Time to load fused_adam op: 0.10200691223144531 seconds\r\n", "Time to load fused_adam op: 0.10409951210021973 seconds\r\n", "Loading `train_dataloader` to estimate number of stepping batches.\r\n" ] }, { "name": "stdout", 
"output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/utils/build.ninja...\r\n", "Building extension module utils...\r\n", "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "ninja: no work to do.\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.06915402412414551 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Loading extension module utils...\r\n", "Time to load utils op: 0.10242414474487305 seconds\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.10329604148864746 seconds\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.1024332046508789 seconds\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.10218477249145508 seconds\r\n", "Loading extension module utils...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.10254240036010742 seconds\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.10277557373046875 seconds\r\n", "Time to load utils op: 0.1038351058959961 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 2 partition count [8, 
8] and sizes[(176584448, False), (384, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 7 partition count [8, 8] and sizes[(176584448, False), (384, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 0 partition count [8, 8] and sizes[(176584448, False), (384, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 3 partition count [8, 8] and sizes[(176584448, False), (384, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 6 partition count [8, 8] and sizes[(176584448, False), (384, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 4 partition count [8, 8] and sizes[(176584448, False), (384, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 5 partition count [8, 8] and sizes[(176584448, False), (384, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 1 partition count [8, 8] and sizes[(176584448, False), (384, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.00103759765625 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0006682872772216797 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "No modifications detected for 
re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0012485980987548828 seconds\r\n", "Time to load utils op: 0.000621795654296875 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0007317066192626953 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0011317729949951172 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0013556480407714844 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0008046627044677734 seconds\r\n", "\r\n", " | Name | Type | Params\r\n", "--------------------------------------\r\n", "0 | emb | Embedding | 51.5 M\r\n", "1 | blocks | ModuleList | 1.3 B \r\n", "2 | ln_out | LayerNorm | 2.0 K \r\n", "3 | head | Linear | 51.5 M\r\n", "--------------------------------------\r\n", "1.4 B Trainable params\r\n", "0 Non-trainable params\r\n", "1.4 B Total params\r\n", "5,650.715 Total estimated model params size (MB)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "Training: 0it [00:00, ?it/s]\r", "Training: 0%| | 0/1867 [00:00=12.1), as this is known to have freeze 
issues\r\n", "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\r\n", "# - When resuming from checkpoint, the estimated time is inaccurate\r\n", "#\r\n", "\r\n", "[RWKV.model] Configuring optimizer with\r\n", " - lr_init: 8.000e-04 (0.0008)\r\n", " - lr_final: 5.000e-04 (0.0005)\r\n", "\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Detected CUDA files, patching ldflags\r\n", "Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/fused_adam/build.ninja...\r\n", "Building extension module fused_adam...\r\n", "Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\r\n", "ninja: no work to do.\r\n", "Loading extension module fused_adam...\r\n", "Time to load fused_adam op: 0.06505656242370605 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Loading extension module fused_adam...\r\n", "Loading extension module fused_adam...\r\n", "Time to load fused_adam op: 0.10134768486022949 seconds\r\n", "Loading extension module fused_adam...\r\n", "Loading extension module fused_adam...\r\n", "Time to load fused_adam op: 0.10140538215637207 seconds\r\n", "Time to load fused_adam op: 0.10127806663513184 seconds\r\n", "Loading extension module fused_adam...\r\n", "Loading extension module fused_adam...\r\n", "Time to load fused_adam op: 0.10140156745910645 seconds\r\n", "Time to load fused_adam op: 0.10147309303283691 seconds\r\n", "Time to load fused_adam op: 0.10154032707214355 seconds\r\n", "Loading extension module fused_adam...\r\n", "Time to load fused_adam op: 0.10238361358642578 seconds\r\n", "Loading `train_dataloader` to estimate number of stepping batches.\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/utils/build.ninja...\r\n", 
"Building extension module utils...\r\n", "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "ninja: no work to do.\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.07725977897644043 seconds\r\n", "Loading extension module utils...\r\n", "Loading extension module utils...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.10293865203857422 seconds\r\n", "Time to load utils op: 0.10255193710327148 seconds\r\n", "Loading extension module utils...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.10571074485778809 seconds\r\n", "Time to load utils op: 0.1022646427154541 seconds\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.10294461250305176 seconds\r\n", "Loading extension module utils...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Time to load utils op: 0.10330677032470703 seconds\r\n", "Time to load utils op: 0.10295748710632324 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 3 partition count [8, 8] and sizes[(176584448, False), (384, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 2 partition count [8, 8] and sizes[(176584448, False), (384, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 7 partition count [8, 8] and sizes[(176584448, False), (384, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 0 partition count [8, 8] and sizes[(176584448, False), (384, False)] \r\n", "Rank: 6 partition count [8, 8] and sizes[(176584448, False), (384, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 1 partition count [8, 8] and sizes[(176584448, False), (384, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 4 partition count [8, 8] and 
sizes[(176584448, False), (384, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 5 partition count [8, 8] and sizes[(176584448, False), (384, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.001497507095336914 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0006098747253417969 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0006680488586425781 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0006284713745117188 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0006568431854248047 seconds\r\n", "Time to load utils op: 0.0006983280181884766 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions 
root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0009129047393798828 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.000835418701171875 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r\n", " | Name | Type | Params\r\n", "--------------------------------------\r\n", "0 | emb | Embedding | 51.5 M\r\n", "1 | blocks | ModuleList | 1.3 B \r\n", "2 | ln_out | LayerNorm | 2.0 K \r\n", "3 | head | Linear | 51.5 M\r\n", "--------------------------------------\r\n", "1.4 B Trainable params\r\n", "0 Non-trainable params\r\n", "1.4 B Total params\r\n", "5,650.715 Total estimated model params size (MB)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "Training: 0it [00:00, ?it/s]\r", "Training: 0%| | 0/4371 [00:00=12.1), as this is known to have freeze issues\r\n", "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\r\n", "# - When resuming from checkpoint, the estimated time is inaccurate\r\n", "#\r\n", "LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\r\n", "\r\n", "[RWKV.model] Configuring optimizer with\r\n", " - lr_init: 5.000e-04 (0.0005)\r\n", " - lr_final: 4.000e-04 (0.0004)\r\n", "\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using 
/root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Detected CUDA files, patching ldflags\r\n", "Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/fused_adam/build.ninja...\r\n", "Building extension module fused_adam...\r\n", "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\r\n", "ninja: no work to do.\r\n", "Loading extension module fused_adam...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Time to load fused_adam op: 0.099365234375 seconds\r\n", "Loading extension module fused_adam...\r\n", "Loading extension module fused_adam...\r\n", "Loading extension module fused_adam...\r\n", "Loading extension module fused_adam...\r\n", "Time to load fused_adam op: 0.10165834426879883 seconds\r\n", "Time to load fused_adam op: 0.1015784740447998 seconds\r\n", "Time to load fused_adam op: 0.10136890411376953 seconds\r\n", "Time to load fused_adam op: 0.10139322280883789 seconds\r\n", "Loading extension module fused_adam...\r\n", "Loading extension module fused_adam...\r\n", "Loading extension module fused_adam...\r\n", "Time to load fused_adam op: 0.1018073558807373 seconds\r\n", "Loading `train_dataloader` to estimate number of stepping batches.\r\n", "Time to load fused_adam op: 0.10153341293334961 seconds\r\n", "Time to load fused_adam op: 0.1029043197631836 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using 
/root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/utils/build.ninja...\r\n", "Building extension module utils...\r\n", "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "ninja: no work to do.\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.07778620719909668 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Loading extension module utils...\r\n", "Loading extension module utils...\r\n", "Loading extension module utils...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.10236048698425293 seconds\r\n", "Time to load utils op: 0.10336160659790039 seconds\r\n", "Time to load utils op: 0.10291481018066406 seconds\r\n", "Time to load utils op: 0.1023705005645752 seconds\r\n", "Loading extension module utils...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.10250258445739746 seconds\r\n", "Time to load utils op: 0.10256433486938477 seconds\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.10273289680480957 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 4 partition count [8, 8] and sizes[(176584448, False), (384, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 1 partition count [8, 8] and sizes[(176584448, False), (384, False)] \r\n" ] }, { "name": 
"stdout", "output_type": "stream", "text": [ "Rank: 5 partition count [8, 8] and sizes[(176584448, False), (384, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 0 partition count [8, 8] and sizes[(176584448, False), (384, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 7 partition count [8, 8] and sizes[(176584448, False), (384, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 2 partition count [8, 8] and sizes[(176584448, False), (384, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 6 partition count [8, 8] and sizes[(176584448, False), (384, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Rank: 3 partition count [8, 8] and sizes[(176584448, False), (384, False)] \r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0006737709045410156 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0006251335144042969 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0006656646728515625 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 
0.0006136894226074219 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.00067138671875 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0009543895721435547 seconds\r\n", "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.001495361328125 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", "Loading extension module utils...\r\n", "Time to load utils op: 0.0008938312530517578 seconds\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r\n", " | Name | Type | Params\r\n", "--------------------------------------\r\n", "0 | emb | Embedding | 51.5 M\r\n", "1 | blocks | ModuleList | 1.3 B \r\n", "2 | ln_out | LayerNorm | 2.0 K \r\n", "3 | head | Linear | 51.5 M\r\n", "--------------------------------------\r\n", "1.4 B Trainable params\r\n", "0 Non-trainable params\r\n", "1.4 B Total params\r\n", "5,650.715 Total estimated model params size (MB)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\r", "Training: 0it [00:00, ?it/s]\r", "Training: 0%| | 0/16026 [00:00