diff --git "a/experiment/rwkv-x-exp/v5-headsize2x/v5-L6-D2048-E1e-1-ctx4k-part1.ipynb" "b/experiment/rwkv-x-exp/v5-headsize2x/v5-L6-D2048-E1e-1-ctx4k-part1.ipynb" --- "a/experiment/rwkv-x-exp/v5-headsize2x/v5-L6-D2048-E1e-1-ctx4k-part1.ipynb" +++ "b/experiment/rwkv-x-exp/v5-headsize2x/v5-L6-D2048-E1e-1-ctx4k-part1.ipynb" @@ -3,13 +3,13 @@ { "attachments": {}, "cell_type": "markdown", - "id": "4c73afb6", + "id": "637a63b7", "metadata": { "papermill": { - "duration": 0.003926, - "end_time": "2023-08-25T16:07:12.117562", + "duration": 0.004036, + "end_time": "2023-08-25T16:11:16.699201", "exception": false, - "start_time": "2023-08-25T16:07:12.113636", + "start_time": "2023-08-25T16:11:16.695165", "status": "completed" }, "tags": [] @@ -18,7 +18,7 @@ "# RWKV v5-headsize2x / embedding init-range 1e-01 / 4k\n", "\n", "- 6 layers\n", - "- 4096 embedding size\n", + "- 2048 embedding size\n", "\n", "Going through the modified memory training for v5 models, across various initial embedding model weights\n", "\n", @@ -28,13 +28,13 @@ { "attachments": {}, "cell_type": "markdown", - "id": "393e4299", + "id": "de12bebf", "metadata": { "papermill": { - "duration": 0.002494, - "end_time": "2023-08-25T16:07:12.122625", + "duration": 0.002243, + "end_time": "2023-08-25T16:11:16.704494", "exception": false, - "start_time": "2023-08-25T16:07:12.120131", + "start_time": "2023-08-25T16:11:16.702251", "status": "completed" }, "tags": [] @@ -46,19 +46,19 @@ { "cell_type": "code", "execution_count": 1, - "id": "e8229f09", + "id": "964150fc", "metadata": { "execution": { - "iopub.execute_input": "2023-08-25T16:07:12.129239Z", - "iopub.status.busy": "2023-08-25T16:07:12.128559Z", - "iopub.status.idle": "2023-08-25T16:07:12.847564Z", - "shell.execute_reply": "2023-08-25T16:07:12.846564Z" + "iopub.execute_input": "2023-08-25T16:11:16.710942Z", + "iopub.status.busy": "2023-08-25T16:11:16.710551Z", + "iopub.status.idle": "2023-08-25T16:11:17.427297Z", + "shell.execute_reply": "2023-08-25T16:11:17.426333Z" }, "papermill": { - "duration": 0.724082, - "end_time": "2023-08-25T16:07:12.849402", + "duration": 0.721982, + "end_time": "2023-08-25T16:11:17.428861", "exception": false, - "start_time": "2023-08-25T16:07:12.125320", + "start_time": "2023-08-25T16:11:16.706879", "status": "completed" }, "tags": [] @@ -74,19 +74,19 @@ { "cell_type": "code", "execution_count": 2, - "id": "c7a42dc0", + "id": "576838ed", "metadata": { "execution": { - "iopub.execute_input": "2023-08-25T16:07:12.855853Z", - "iopub.status.busy": "2023-08-25T16:07:12.855653Z", - "iopub.status.idle": "2023-08-25T16:07:15.711591Z", - "shell.execute_reply": "2023-08-25T16:07:15.710680Z" + "iopub.execute_input": "2023-08-25T16:11:17.435940Z", + "iopub.status.busy": "2023-08-25T16:11:17.435670Z", + "iopub.status.idle": "2023-08-25T16:11:20.292367Z", + "shell.execute_reply": "2023-08-25T16:11:20.291509Z" }, "papermill": { - "duration": 2.861479, - "end_time": "2023-08-25T16:07:15.713370", + "duration": 2.86298, + "end_time": "2023-08-25T16:11:20.294308", "exception": false, - "start_time": "2023-08-25T16:07:12.851891", + "start_time": "2023-08-25T16:11:17.431328", "status": "completed" }, "tags": [] @@ -118,19 +118,19 @@ { "cell_type": "code", "execution_count": 3, - "id": "6cab8aca", + "id": "a2b03006", "metadata": { "execution": { - "iopub.execute_input": "2023-08-25T16:07:15.720224Z", - "iopub.status.busy": "2023-08-25T16:07:15.720023Z", - "iopub.status.idle": "2023-08-25T16:07:15.726216Z", - "shell.execute_reply": "2023-08-25T16:07:15.725750Z" + "iopub.execute_input": "2023-08-25T16:11:20.301143Z", + "iopub.status.busy": "2023-08-25T16:11:20.300945Z", + "iopub.status.idle": "2023-08-25T16:11:20.307541Z", + "shell.execute_reply": "2023-08-25T16:11:20.306932Z" }, "papermill": { - "duration": 0.011194, - "end_time": "2023-08-25T16:07:15.727505", + "duration": 0.011411, + "end_time": "2023-08-25T16:11:20.308662", "exception": false, - "start_time": "2023-08-25T16:07:15.716311", + "start_time": "2023-08-25T16:11:20.297251", "status": "completed" }, "tags": [] @@ -191,18 +191,20 @@ { "cell_type": "code", "execution_count": 4, - "id": "fed724db", + "id": "1e354c57", "metadata": { "execution": { - "iopub.execute_input": "2023-08-25T16:07:15.733547Z", - "iopub.status.busy": "2023-08-25T16:07:15.733381Z" + "iopub.execute_input": "2023-08-25T16:11:20.314723Z", + "iopub.status.busy": "2023-08-25T16:11:20.314557Z", + "iopub.status.idle": "2023-08-25T16:11:51.590243Z", + "shell.execute_reply": "2023-08-25T16:11:51.589647Z" }, "papermill": { - "duration": null, - "end_time": null, + "duration": 31.280786, + "end_time": "2023-08-25T16:11:51.592117", "exception": false, - "start_time": "2023-08-25T16:07:15.730189", - "status": "running" + "start_time": "2023-08-25T16:11:20.311331", + "status": "completed" }, "tags": [] }, @@ -221,7 +223,7 @@ "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n", "---- Initializing model ----\r\n", "No of layers: 6\r\n", - "Embedding size: 4096\r\n", + "Embedding size: 2048\r\n", "Output model path: ../model/L6-D2048-E0_1-neox-v5base-init.pth\r\n", "Vocab size: 50277\r\n", "Emb scale: 0.1\r\n", @@ -233,200 +235,236 @@ "name": "stdout", "output_type": "stream", "text": [ - "50277 4096 -0.1 emb.weight\r\n" + "50277 2048 -0.1 emb.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2048 2048 1.0 blocks.0.att.receptance.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2048 2048 1.0 blocks.0.att.key.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2048 2048 1.0 blocks.0.att.value.weight\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2048 2048 0 blocks.0.att.output.weight\r\n", + "8192 2048 1.0 blocks.0.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "4096 4096 1.0 blocks.0.att.receptance.weight\r\n" + "2048 2048 0 blocks.0.ffn.receptance.weight\r\n", + "2048 8192 0 blocks.0.ffn.value.weight\r\n", + "2048 2048 1.0 blocks.1.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "4096 4096 1.0 blocks.0.att.key.weight\r\n" + "2048 2048 1.0 blocks.1.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "4096 4096 1.0 blocks.0.att.value.weight\r\n" + "2048 2048 1.0 blocks.1.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "4096 4096 0 blocks.0.att.output.weight\r\n", - "16384 4096 1.0 blocks.0.ffn.key.weight\r\n" + "2048 2048 0 blocks.1.att.output.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "4096 4096 0 blocks.0.ffn.receptance.weight\r\n", - "4096 16384 0 blocks.0.ffn.value.weight\r\n" + "8192 2048 1.0 blocks.1.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "4096 4096 1.0 blocks.1.att.receptance.weight\r\n" + "2048 2048 0 blocks.1.ffn.receptance.weight\r\n", + "2048 8192 0 blocks.1.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "4096 4096 1.0 blocks.1.att.key.weight\r\n" + "2048 2048 1.0 blocks.2.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "4096 4096 1.0 blocks.1.att.value.weight\r\n" + "2048 2048 1.0 blocks.2.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "4096 4096 0 blocks.1.att.output.weight\r\n" + "2048 2048 1.0 blocks.2.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "16384 4096 1.0 blocks.1.ffn.key.weight\r\n" + "2048 2048 0 blocks.2.att.output.weight\r\n", + "8192 2048 1.0 blocks.2.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "4096 4096 0 blocks.1.ffn.receptance.weight\r\n" + "2048 2048 0 blocks.2.ffn.receptance.weight\r\n", + "2048 8192 0 blocks.2.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "4096 16384 0 blocks.1.ffn.value.weight\r\n" + "2048 2048 1.0 blocks.3.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "4096 4096 1.0 blocks.2.att.receptance.weight\r\n" + "2048 2048 1.0 blocks.3.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "4096 4096 1.0 blocks.2.att.key.weight\r\n" + "2048 2048 1.0 blocks.3.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "4096 4096 1.0 blocks.2.att.value.weight\r\n" + "2048 2048 0 blocks.3.att.output.weight\r\n", + "8192 2048 1.0 blocks.3.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "4096 4096 0 blocks.2.att.output.weight\r\n", - "16384 4096 1.0 blocks.2.ffn.key.weight\r\n" + "2048 2048 0 blocks.3.ffn.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "4096 4096 0 blocks.2.ffn.receptance.weight\r\n" + "2048 8192 0 blocks.3.ffn.value.weight\r\n", + "2048 2048 1.0 blocks.4.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "4096 16384 0 blocks.2.ffn.value.weight\r\n" + "2048 2048 1.0 blocks.4.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "4096 4096 1.0 blocks.3.att.receptance.weight\r\n" + "2048 2048 1.0 blocks.4.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "4096 4096 1.0 blocks.3.att.key.weight\r\n" + "2048 2048 0 blocks.4.att.output.weight\r\n", + "8192 2048 1.0 blocks.4.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "4096 4096 1.0 blocks.3.att.value.weight\r\n" + "2048 2048 0 blocks.4.ffn.receptance.weight\r\n", + "2048 8192 0 blocks.4.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "4096 4096 0 blocks.3.att.output.weight\r\n", - "16384 4096 1.0 blocks.3.ffn.key.weight\r\n" + "2048 2048 1.0 blocks.5.att.receptance.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "4096 4096 0 blocks.3.ffn.receptance.weight\r\n" + "2048 2048 1.0 blocks.5.att.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "4096 16384 0 blocks.3.ffn.value.weight\r\n" + "2048 2048 1.0 blocks.5.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "4096 4096 1.0 blocks.4.att.receptance.weight\r\n" + "2048 2048 0 blocks.5.att.output.weight\r\n", + "8192 2048 1.0 blocks.5.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "4096 4096 1.0 blocks.4.att.key.weight\r\n" + "2048 2048 0 blocks.5.ffn.receptance.weight\r\n", + "2048 8192 0 blocks.5.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "4096 4096 1.0 blocks.4.att.value.weight\r\n" + "50277 2048 0.5 head.weight\r\n" ] } ], @@ -435,7 +473,7 @@ "!cd \"{TRAINER_DIR}\" && \\\n", " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", " python3 ./init_model.py \\\n", - " --n_layer 6 --n_embd 4096 \\\n", + " --n_layer \"{LAYER_COUNT}\" --n_embd \"{EMBED_DIM}\" \\\n", " --emb-scale \"{EMBED_SCALE}\" \\\n", " --vocab_size neox --skip-if-exists \\\n", " \"../model/L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE_LABEL}-neox-v5base-init.pth\"" @@ -443,14 +481,14 @@ }, { "cell_type": "markdown", - "id": "b17f7961", + "id": "1695a22f", "metadata": { "papermill": { - "duration": null, - "end_time": null, - "exception": null, - "start_time": null, - "status": "pending" + "duration": 0.004084, + "end_time": "2023-08-25T16:11:51.601214", + "exception": false, + "start_time": "2023-08-25T16:11:51.597130", + "status": "completed" }, "tags": [] }, @@ -460,19 +498,338 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "0caf9040", + "execution_count": 5, + "id": "0a8706d2", "metadata": { + "execution": { + "iopub.execute_input": "2023-08-25T16:11:51.612580Z", + "iopub.status.busy": "2023-08-25T16:11:51.612394Z", + "iopub.status.idle": "2023-08-25T16:12:06.132970Z", + "shell.execute_reply": "2023-08-25T16:12:06.132013Z" + }, "papermill": { - "duration": null, - "end_time": null, - "exception": null, - "start_time": null, - "status": "pending" + "duration": 14.529644, + "end_time": "2023-08-25T16:12:06.135118", + "exception": false, + "start_time": "2023-08-25T16:11:51.605474", + "status": "completed" }, "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found cached dataset parquet (/actions-runner/.cache/huggingface/datasets/teven___parquet/teven--enwiki_100k-1359e81b212c2dd6/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)\r\n", + "\r", + " 0%| | 0/1 [00:00=12.1), as this is known to have freeze issues\r\n", + "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\r\n", + "# - When resuming from checkpoint, the estimated time is inaccurate\r\n", + "#\r\n", + "\r\n", + "[RWKV.model] Configuring optimizer with\r\n", + " - lr_init: 6.000e-04 (0.0006)\r\n", + " - lr_final: 4.000e-04 (0.0004)\r\n", + "\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Detected CUDA files, patching ldflags\r\n", + "Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/fused_adam/build.ninja...\r\n", + "Building extension module fused_adam...\r\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\r\n", + "ninja: no work to do.\r\n", + "Loading extension module fused_adam...\r\n", + "Time to load fused_adam op: 0.07104086875915527 seconds\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading extension module fused_adam...\r\n", + "Loading extension module fused_adam...\r\n", + "Loading extension module fused_adam...\r\n", + "Loading extension module fused_adam...\r\n", + "Loading extension module fused_adam...\r\n", + "Loading extension module fused_adam...\r\n", + "Loading extension module fused_adam...\r\n", + "Time to load fused_adam op: 0.10213232040405273 seconds\r\n", + "Time to load fused_adam op: 0.10221099853515625 seconds\r\n", + "Time to load fused_adam op: 0.10222291946411133 seconds\r\n", + "Time to load fused_adam op: 0.10229325294494629 seconds\r\n", + "Time to load fused_adam op: 0.10161519050598145 seconds\r\n", + "Time to load fused_adam op: 0.1018679141998291 seconds\r\n", + "Time to load fused_adam op: 0.10222315788269043 seconds\r\n", + "Loading `train_dataloader` to estimate number of stepping batches.\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/utils/build.ninja...\r\n", + "Building extension module utils...\r\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ninja: no work to do.\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.07134485244750977 seconds\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading extension module utils...\r\n", + "Loading extension module utils...\r\n", + "Loading extension module utils...\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.10251235961914062 seconds\r\n", + "Loading extension module utils...\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.10227251052856445 seconds\r\n", + "Time to load utils op: 0.1020975112915039 seconds\r\n", + "Time to load utils op: 0.10264277458190918 seconds\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.10219502449035645 seconds\r\n", + "Time to load utils op: 0.10291242599487305 seconds\r\n", + "Time to load utils op: 0.10216760635375977 seconds\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rank: 3 partition count [8, 8] and sizes[(66654208, False), (24, False)] \r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rank: 7 partition count [8, 8] and sizes[(66654208, False), (24, False)] \r\n", + "Rank: 2 partition count [8, 8] and sizes[(66654208, False), (24, False)] \r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rank: 6 partition count [8, 8] and sizes[(66654208, False), (24, False)] \r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rank: 1 partition count [8, 8] and sizes[(66654208, False), (24, False)] \r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rank: 5 partition count [8, 8] and sizes[(66654208, False), (24, False)] \r\n", + "Rank: 0 partition count [8, 8] and sizes[(66654208, False), (24, False)] \r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rank: 4 partition count [8, 8] and sizes[(66654208, False), (24, False)] \r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.0005962848663330078 seconds\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", + "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", + "Loading extension module utils...\r\n", + "Loading extension module utils...\r\n", + "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", + "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", + "Time to load utils op: 0.0006504058837890625 seconds\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.0006344318389892578 seconds\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.0006222724914550781 seconds\r\n", + "Time to load utils op: 0.0006110668182373047 seconds\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.0006005764007568359 seconds\r\n", + "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.0006175041198730469 seconds\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.0008368492126464844 seconds\r\n", + "\r\n", + " | Name | Type | Params\r\n", + "--------------------------------------\r\n", + "0 | emb | Embedding | 102 M \r\n", + "1 | blocks | ModuleList | 327 M \r\n", + "2 | ln_out | LayerNorm | 4.1 K \r\n", + "3 | head | Linear | 102 M \r\n", + "--------------------------------------\r\n", + "533 M Trainable params\r\n", + "0 Non-trainable params\r\n", + "533 M Total params\r\n", + "2,132.935 Total estimated model params size (MB)\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "Training: 0it [00:00, ?it/s]\r", + "Training: 0%| | 0/10186 [00:00\r\n", + " asyncio.run(main_function())\r\n", + " File \"/usr/lib/python3.11/asyncio/runners.py\", line 190, in run\r\n", + " return runner.run(main)\r\n", + " ^^^^^^^^^^^^^^^^\r\n", + " File \"/usr/lib/python3.11/asyncio/runners.py\", line 118, in run\r\n", + " return self._loop.run_until_complete(task)\r\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", + " File \"/usr/lib/python3.11/asyncio/base_events.py\", line 653, in run_until_complete\r\n", + " return future.result()\r\n", + " ^^^^^^^^^^^^^^^\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-headsize2x/../memory_script/eval_v5_memory_guided.py\", line 58, in main_function\r\n", + " model = SimpleRWKV(model_path, device=\"cuda\")\r\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 1378, in __init__\r\n", + " self.model = RWKV(**model_config)\r\n", + " ^^^^^^^^^^^^^^^^^^^^\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 650, in __init__\r\n", + " self.load_state_dict(model_weights)\r\n", + " File \"/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py\", line 2041, in load_state_dict\r\n", + " raise RuntimeError('Error(s) in loading state_dict for {}:\\n\\t{}'.format(\r\n", + "RuntimeError: Error(s) in loading state_dict for RWKV:\r\n", + "\tsize mismatch for blocks.0.att.time_decay: copying a param with shape torch.Size([16]) from checkpoint, the shape in current model is torch.Size([32]).\r\n", + "\tsize mismatch for blocks.0.att.time_faaaa: copying a param with shape torch.Size([16]) from checkpoint, the shape in current model is torch.Size([32]).\r\n", + "\tsize mismatch for blocks.1.att.time_decay: copying a param with shape torch.Size([16]) from checkpoint, the shape in current model is torch.Size([32]).\r\n", + "\tsize mismatch for blocks.1.att.time_faaaa: copying a param with shape torch.Size([16]) from checkpoint, the shape in current model is torch.Size([32]).\r\n", + "\tsize mismatch for blocks.2.att.time_decay: copying a param with shape torch.Size([16]) from checkpoint, the shape in current model is torch.Size([32]).\r\n", + "\tsize mismatch for blocks.2.att.time_faaaa: copying a param with shape torch.Size([16]) from checkpoint, the shape in current model is torch.Size([32]).\r\n", + "\tsize mismatch for blocks.3.att.time_decay: copying a param with shape torch.Size([16]) from checkpoint, the shape in current model is torch.Size([32]).\r\n", + "\tsize mismatch for blocks.3.att.time_faaaa: copying a param with shape torch.Size([16]) from checkpoint, the shape in current model is torch.Size([32]).\r\n", + "\tsize mismatch for blocks.4.att.time_decay: copying a param with shape torch.Size([16]) from checkpoint, the shape in current model is torch.Size([32]).\r\n", + "\tsize mismatch for blocks.4.att.time_faaaa: copying a param with shape torch.Size([16]) from checkpoint, the shape in current model is torch.Size([32]).\r\n", + "\tsize mismatch for blocks.5.att.time_decay: copying a param with shape torch.Size([16]) from checkpoint, the shape in current model is torch.Size([32]).\r\n", + "\tsize mismatch for blocks.5.att.time_faaaa: copying a param with shape torch.Size([16]) from checkpoint, the shape in current model is torch.Size([32]).\r\n" + ] + } + ], + "source": [ + "# Lets do a quick memory test\n", + "!export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", + " python3 ../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-enwiki-4k.pth\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "e28c794b", + "metadata": { + "papermill": { + "duration": 0.589439, + "end_time": "2023-08-25T18:42:34.155711", + "exception": false, + "start_time": "2023-08-25T18:42:33.566272", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Enwiki Stage 2 : Basic Instruct Tuning" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "55af99e2", + "metadata": { + "execution": { + "iopub.execute_input": "2023-08-25T18:42:35.416355Z", + "iopub.status.busy": "2023-08-25T18:42:35.416058Z", + "iopub.status.idle": "2023-08-25T18:42:42.458315Z", + "shell.execute_reply": "2023-08-25T18:42:42.457559Z" + }, + "papermill": { + "duration": 7.721475, + "end_time": "2023-08-25T18:42:42.460292", + "exception": false, + "start_time": "2023-08-25T18:42:34.738817", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found cached dataset parquet (/actions-runner/.cache/huggingface/datasets/c-s-ale___parquet/c-s-ale--dolly-15k-instruction-alpaca-format-9dfbb23260d63d9d/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)\r\n", + "\r", + " 0%| | 0/1 [00:00=12.1), as this is known to have freeze issues\r\n", + "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\r\n", + "# - When resuming from checkpoint, the estimated time is inaccurate\r\n", + "#\r\n", + "\r\n", + "[RWKV.model] Configuring optimizer with\r\n", + " - lr_init: 4.000e-04 (0.0004)\r\n", + " - lr_final: 3.000e-04 (0.0003)\r\n", + "\r\n", + "LOCAL_RANK: 4 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\r\n", + "LOCAL_RANK: 6 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Detected CUDA files, patching ldflags\r\n", + "Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/fused_adam/build.ninja...\r\n", + "Building extension module fused_adam...\r\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\r\n", + "ninja: no work to do.\r\n", + "Loading extension module fused_adam...\r\n", + "Time to load fused_adam op: 0.06991124153137207 seconds\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading extension module fused_adam...\r\n", + "Loading extension module fused_adam...\r\n", + "Loading extension module fused_adam...\r\n", + "Loading extension module fused_adam...\r\n", + "Loading extension module fused_adam...\r\n", + "Time to load fused_adam op: 0.10216736793518066 seconds\r\n", + "Time to load fused_adam op: 0.10175776481628418 seconds\r\n", + "Time to load fused_adam op: 0.10200738906860352 seconds\r\n", + "Time to load fused_adam op: 0.10212993621826172 seconds\r\n", + "Time to load fused_adam op: 0.10253763198852539 seconds\r\n", + "Loading `train_dataloader` to estimate number of stepping batches.\r\n", + "Loading extension module fused_adam...\r\n", + "Loading extension module fused_adam...\r\n", + "Time to load fused_adam op: 0.10177922248840332 seconds\r\n", + "Time to load fused_adam op: 0.10174822807312012 seconds\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/utils/build.ninja...\r\n", + "Building extension module utils...\r\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ninja: no work to do.\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.07215428352355957 seconds\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading extension module utils...\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.10257506370544434 seconds\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.10219836235046387 seconds\r\n", + "Loading extension module utils...\r\n", + "Loading extension module utils...\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.10246992111206055 seconds\r\n", + "Time to load utils op: 0.10241150856018066 seconds\r\n", + "Time to load utils op: 0.10303401947021484 seconds\r\n", + "Time to load utils op: 0.10210776329040527 seconds\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.10240936279296875 seconds\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rank: 4 partition count [8, 8] and sizes[(66654208, False), (24, False)] \r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rank: 6 partition count [8, 8] and sizes[(66654208, False), (24, False)] \r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rank: 2 partition count [8, 8] and sizes[(66654208, False), (24, False)] \r\n", + "Rank: 5 partition count [8, 8] and sizes[(66654208, False), (24, False)] \r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rank: 0 partition count [8, 8] and sizes[(66654208, False), (24, False)] \r\n", + "Rank: 3 partition count [8, 8] and sizes[(66654208, False), (24, False)] \r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rank: 1 partition count [8, 8] and sizes[(66654208, False), (24, False)] \r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rank: 7 partition count [8, 8] and sizes[(66654208, False), (24, False)] \r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.0006098747253417969 seconds\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.0006396770477294922 seconds\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.0006613731384277344 seconds\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", + "Loading extension module utils...\r\n", + "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.0005762577056884766 seconds\r\n", + "Time to load utils op: 0.00106048583984375 seconds\r\n", + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.0011622905731201172 seconds\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.0012149810791015625 seconds\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\r\n", + "No modifications detected for re-loaded extension module utils, skipping build step...\r\n", + "Loading extension module utils...\r\n", + "Time to load utils op: 0.0008492469787597656 seconds\r\n", + "\r\n", + " | Name | Type | Params\r\n", + "--------------------------------------\r\n", + "0 | emb | Embedding | 102 M \r\n", + "1 | blocks | ModuleList | 327 M \r\n", + "2 | ln_out | LayerNorm | 4.1 K \r\n", + "3 | head | Linear | 102 M \r\n", + "--------------------------------------\r\n", + "533 M Trainable params\r\n", + "0 Non-trainable params\r\n", + "533 M Total params\r\n", + "2,132.935 Total estimated model params size (MB)\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "Training: 0it [00:00, ?it/s]\r", + "Training: 0%| | 0/1867 [00:00\r\n", + " asyncio.run(main_function())\r\n", + " File \"/usr/lib/python3.11/asyncio/runners.py\", line 190, in run\r\n", + " return runner.run(main)\r\n", + " ^^^^^^^^^^^^^^^^\r\n", + " File \"/usr/lib/python3.11/asyncio/runners.py\", line 118, in run\r\n", + " return self._loop.run_until_complete(task)\r\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", + " File \"/usr/lib/python3.11/asyncio/base_events.py\", line 653, in run_until_complete\r\n", + " return future.result()\r\n", + " ^^^^^^^^^^^^^^^\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-headsize2x/../memory_script/eval_v5headsize2x_memory_guided.py\", line 56, in main_function\r\n", + " from src.model import SimpleRWKV\r\n", + "ModuleNotFoundError: No module named 'src'\r\n" + ] + } + ], "source": [ "# Lets do a quick memory test\n", "!export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", @@ -731,14 +115875,14 @@ }, "papermill": { "default_parameters": {}, - "duration": null, - "end_time": null, + "duration": 10259.517613, + "end_time": "2023-08-25T19:02:15.243840", "environment_variables": {}, "exception": null, "input_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-headsize2x/v5-L6-D2048-E1e-1-ctx4k-part1.ipynb", "output_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/experiment/rwkv-x-exp/v5-headsize2x/v5-L6-D2048-E1e-1-ctx4k-part1.ipynb", "parameters": {}, - "start_time": "2023-08-25T16:07:11.154329", + "start_time": "2023-08-25T16:11:15.726227", "version": "2.4.0" } },