diff --git "a/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-part1.ipynb" "b/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-part1.ipynb" --- "a/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-part1.ipynb" +++ "b/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-part1.ipynb" @@ -3,13 +3,13 @@ { "attachments": {}, "cell_type": "markdown", - "id": "bb9e93fd", + "id": "215f6d68", "metadata": { "papermill": { - "duration": 0.003202, - "end_time": "2023-09-29T05:48:15.621385", + "duration": 0.00298, + "end_time": "2023-09-29T06:34:37.429814", "exception": false, - "start_time": "2023-09-29T05:48:15.618183", + "start_time": "2023-09-29T06:34:37.426834", "status": "completed" }, "tags": [] @@ -23,13 +23,13 @@ { "attachments": {}, "cell_type": "markdown", - "id": "ec0b74ac", + "id": "bf0cf97c", "metadata": { "papermill": { - "duration": 0.002532, - "end_time": "2023-09-29T05:48:15.628420", + "duration": 0.002312, + "end_time": "2023-09-29T06:34:37.435725", "exception": false, - "start_time": "2023-09-29T05:48:15.625888", + "start_time": "2023-09-29T06:34:37.433413", "status": "completed" }, "tags": [] @@ -41,19 +41,19 @@ { "cell_type": "code", "execution_count": 1, - "id": "89d0e842", + "id": "98d95606", "metadata": { "execution": { - "iopub.execute_input": "2023-09-29T05:48:15.635765Z", - "iopub.status.busy": "2023-09-29T05:48:15.635042Z", - "iopub.status.idle": "2023-09-29T05:48:16.384023Z", - "shell.execute_reply": "2023-09-29T05:48:16.383082Z" + "iopub.execute_input": "2023-09-29T06:34:37.440037Z", + "iopub.status.busy": "2023-09-29T06:34:37.439753Z", + "iopub.status.idle": "2023-09-29T06:34:38.107502Z", + "shell.execute_reply": "2023-09-29T06:34:38.106666Z" }, "papermill": { - "duration": 0.755093, - "end_time": "2023-09-29T05:48:16.386307", + "duration": 0.672001, + "end_time": "2023-09-29T06:34:38.109558", "exception": false, - "start_time": "2023-09-29T05:48:15.631214", + "start_time": "2023-09-29T06:34:37.437557", "status": "completed" }, "tags": [] @@ -69,19 +69,19 @@ { "cell_type": "code", "execution_count": 2, - "id": "eb4d593c", + "id": "d799a503", "metadata": { "execution": { - "iopub.execute_input": "2023-09-29T05:48:16.393985Z", - "iopub.status.busy": "2023-09-29T05:48:16.393428Z", - "iopub.status.idle": "2023-09-29T05:48:16.401146Z", - "shell.execute_reply": "2023-09-29T05:48:16.400448Z" + "iopub.execute_input": "2023-09-29T06:34:38.116182Z", + "iopub.status.busy": "2023-09-29T06:34:38.115939Z", + "iopub.status.idle": "2023-09-29T06:34:38.124075Z", + "shell.execute_reply": "2023-09-29T06:34:38.123428Z" }, "papermill": { - "duration": 0.013429, - "end_time": "2023-09-29T05:48:16.402758", + "duration": 0.012774, + "end_time": "2023-09-29T06:34:38.125078", "exception": false, - "start_time": "2023-09-29T05:48:16.389329", + "start_time": "2023-09-29T06:34:38.112304", "status": "completed" }, "tags": [] @@ -140,19 +140,19 @@ { "cell_type": "code", "execution_count": 3, - "id": "02a8c148", + "id": "e4204bbd", "metadata": { "execution": { - "iopub.execute_input": "2023-09-29T05:48:16.410433Z", - "iopub.status.busy": "2023-09-29T05:48:16.409951Z", - "iopub.status.idle": "2023-09-29T05:48:45.475788Z", - "shell.execute_reply": "2023-09-29T05:48:45.474892Z" + "iopub.execute_input": "2023-09-29T06:34:38.128341Z", + "iopub.status.busy": "2023-09-29T06:34:38.128199Z", + "iopub.status.idle": "2023-09-29T06:34:57.534218Z", + "shell.execute_reply": "2023-09-29T06:34:57.533501Z" }, "papermill": { - "duration": 29.072394, - "end_time": "2023-09-29T05:48:45.478206", + "duration": 19.409767, + "end_time": "2023-09-29T06:34:57.536209", "exception": false, - "start_time": "2023-09-29T05:48:16.405812", + "start_time": "2023-09-29T06:34:38.126442", "status": "completed" }, "tags": [] @@ -162,7 +162,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[2023-09-29 05:48:20,485] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n" + "[2023-09-29 06:34:40,927] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n" ] }, { @@ -228,2780 +228,261 @@ "output_type": "stream", "text": [ "2048 2048 0 blocks.0.ffn.receptance.weight\r\n", - "2048 7168 0 blocks.0.ffn.value.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2048 2048 1.0 blocks.1.att.gate.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2048 2048 1.0 blocks.1.att.receptance.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2048 2048 1.0 blocks.1.att.key.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2048 2048 1.0 blocks.1.att.value.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2048 2048 0 blocks.1.att.output.weight\r\n", - "7168 2048 1.0 blocks.1.ffn.key.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2048 2048 0 blocks.1.ffn.receptance.weight\r\n", - "2048 7168 0 blocks.1.ffn.value.weight\r\n", - "2048 2048 1.0 blocks.2.att.gate.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2048 2048 1.0 blocks.2.att.receptance.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2048 2048 1.0 blocks.2.att.key.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2048 2048 1.0 blocks.2.att.value.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2048 2048 0 blocks.2.att.output.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "7168 2048 1.0 blocks.2.ffn.key.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2048 2048 0 blocks.2.ffn.receptance.weight\r\n", - "2048 7168 0 blocks.2.ffn.value.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2048 2048 1.0 blocks.3.att.gate.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2048 2048 1.0 blocks.3.att.receptance.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2048 2048 1.0 blocks.3.att.key.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2048 2048 1.0 blocks.3.att.value.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2048 2048 0 blocks.3.att.output.weight\r\n", - "7168 2048 1.0 blocks.3.ffn.key.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2048 2048 0 blocks.3.ffn.receptance.weight\r\n", - "2048 7168 0 blocks.3.ffn.value.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2048 2048 1.0 blocks.4.att.gate.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2048 2048 1.0 blocks.4.att.receptance.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2048 2048 1.0 blocks.4.att.key.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2048 2048 1.0 blocks.4.att.value.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2048 2048 0 blocks.4.att.output.weight\r\n", - "7168 2048 1.0 blocks.4.ffn.key.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2048 2048 0 blocks.4.ffn.receptance.weight\r\n", - "2048 7168 0 blocks.4.ffn.value.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2048 2048 1.0 blocks.5.att.gate.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2048 2048 1.0 blocks.5.att.receptance.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2048 2048 1.0 blocks.5.att.key.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2048 2048 1.0 blocks.5.att.value.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2048 2048 0 blocks.5.att.output.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "7168 2048 1.0 blocks.5.ffn.key.weight\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2048 2048 0 blocks.5.ffn.receptance.weight\r\n", - "2048 7168 0 blocks.5.ffn.value.weight\r\n", - "50277 2048 0.5 head.weight\r\n" - ] - } - ], - "source": [ - "# Init the model\n", - "!cd \"{TRAINER_DIR}\" && \\\n", - " python3 ./init_model.py \\\n", - " --n_layer {LAYER_COUNT} --n_embd {EMBED_SIZE} \\\n", - " --emb-scale \"{EMBED_SCALE}\" \\\n", - " --vocab_size neox --skip-if-exists \\\n", - " \"../model/{FILENAME_PREFIX}-neox-v5base-init.pth\"" - ] - }, - { - "cell_type": "markdown", - "id": "92eb36b4", - "metadata": { - "papermill": { - "duration": 0.00572, - "end_time": "2023-09-29T05:48:45.490182", - "exception": false, - "start_time": "2023-09-29T05:48:45.484462", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Enwiki Stage 1 : Foundation 4k model training" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "b6db7965", - "metadata": { - "execution": { - "iopub.execute_input": "2023-09-29T05:48:45.504239Z", - "iopub.status.busy": "2023-09-29T05:48:45.503871Z", - "iopub.status.idle": "2023-09-29T05:55:02.308216Z", - "shell.execute_reply": "2023-09-29T05:55:02.307296Z" - }, - "papermill": { - "duration": 376.815064, - "end_time": "2023-09-29T05:55:02.311331", - "exception": false, - "start_time": "2023-09-29T05:48:45.496267", - "status": "completed" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\r", - "Downloading readme: 0%| | 0.00/433 [00:00\r\n", + " cli_main()\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 253, in cli_main\r\n", + " LightningCLI(\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 353, in __init__\r\n", + " self._run_subcommand(self.subcommand)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 642, in _run_subcommand\r\n", + " fn(**fn_kwargs)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py\", line 529, in fit\r\n", + " call._call_and_handle_interrupt(\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/call.py\", line 41, in _call_and_handle_interrupt\r\n", + " return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/strategies/launchers/subprocess_script.py\", line 91, in launch\r\n", + " return function(*args, **kwargs)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py\", line 568, in _fit_impl\r\n", + " self._run(model, ckpt_path=ckpt_path)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py\", line 925, in _run\r\n", + " self._data_connector.prepare_data()\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/connectors/data_connector.py\", line 94, in prepare_data\r\n", + " call._call_lightning_datamodule_hook(trainer, \"prepare_data\")\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/call.py\", line 164, in _call_lightning_datamodule_hook\r\n", + " return fn(*args, **kwargs)\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/data.py\", line 549, in prepare_data\r\n", + " prepare_data_static(**self._init_locals)\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/data.py\", line 464, in prepare_data_static\r\n", + " src_dataset[\"train\"] = src_dataset[\"train\"].select(range(offset_val, offset_val + length_val))\r\n", + "TypeError: 'float' object cannot be interpreted as an integer\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Waiting for W&B process to finish... \u001b[31m(failed 1).\u001b[0m Press Control-C to abort syncing.\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run \u001b[33m[Multi-size] v5-L6-D2048-E0.01 - Enwiki-4k Part 1 (train-ctx=4k, deepspeed_stage_1)\u001b[0m at: \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/runs/nua9z0t5\u001b[0m\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: ️⚡ View job at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjk0OTk4MDcy/version_details/v2\u001b[0m\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Synced 5 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Find logs at: \u001b[35m\u001b[1m./wandb/run-20230929_063511-nua9z0t5/logs\u001b[0m\r\n" ] } ], @@ -17754,7 +15271,7 @@ "# Start the foundation model training\n", "!cd \"{TRAINER_DIR}\" && \\\n", " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", - " python lightning_trainer.py fit \\\n", + " python3 lightning_trainer.py fit \\\n", " -c \"{NOTEBOOK_DIR}/enwiki-4k-part1.yaml\" \\\n", " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Enwiki-4k Part 1 (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n", " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", @@ -17768,19 +15285,19 @@ { "cell_type": "code", "execution_count": 6, - "id": "3f09ba8e", + "id": "e03ffed6", "metadata": { "execution": { - "iopub.execute_input": "2023-09-29T05:55:03.057758Z", - "iopub.status.busy": "2023-09-29T05:55:03.057339Z", - "iopub.status.idle": "2023-09-29T05:55:03.559231Z", - "shell.execute_reply": "2023-09-29T05:55:03.558078Z" + "iopub.execute_input": "2023-09-29T06:40:32.715392Z", + "iopub.status.busy": "2023-09-29T06:40:32.715097Z", + "iopub.status.idle": "2023-09-29T06:40:35.440958Z", + "shell.execute_reply": "2023-09-29T06:40:35.440200Z" }, "papermill": { - "duration": 0.622283, - "end_time": "2023-09-29T05:55:03.561504", + "duration": 2.797683, + "end_time": "2023-09-29T06:40:35.442702", "exception": false, - "start_time": "2023-09-29T05:55:02.939221", + "start_time": "2023-09-29T06:40:32.645019", "status": "completed" }, "tags": [] @@ -17790,7 +15307,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "/usr/bin/sh: 1: python: not found\r\n" + "[2023-09-29 06:40:34,511] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Traceback (most recent call last):\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 651, in \r\n", + " convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, output_file, save_dtype=args.dtype)\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 542, in convert_zero_checkpoint_to_fp32_state_dict\r\n", + " state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 516, in get_fp32_state_dict_from_zero_checkpoint\r\n", + " raise ValueError(f\"Unable to find 'latest' file at {latest_path}\")\r\n", + "ValueError: Unable to find 'latest' file at ../checkpoint/v5-L6-D2048-E0_01-enwiki-4k-p1/last.ckpt/latest\r\n" ] }, { @@ -17804,26 +15335,26 @@ "source": [ "# Lets export the model from the checkpoint\n", "!cd \"{TRAINER_DIR}\" && \\\n", - " python export_checkpoint.py \"../checkpoint/{FILENAME_PREFIX}-enwiki-4k-p1/last.ckpt\" \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\" \"bf16\"\n", + " python3 export_checkpoint.py \"../checkpoint/{FILENAME_PREFIX}-enwiki-4k-p1/last.ckpt\" \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\" \"bf16\"\n", "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\"" ] }, { "cell_type": "code", "execution_count": 7, - "id": "fc3229d6", + "id": "b2d5fe57", "metadata": { "execution": { - "iopub.execute_input": "2023-09-29T05:55:03.798050Z", - "iopub.status.busy": "2023-09-29T05:55:03.797179Z", - "iopub.status.idle": "2023-09-29T05:55:10.219851Z", - "shell.execute_reply": "2023-09-29T05:55:10.218764Z" + "iopub.execute_input": "2023-09-29T06:40:35.588027Z", + "iopub.status.busy": "2023-09-29T06:40:35.587772Z", + "iopub.status.idle": "2023-09-29T06:40:39.405717Z", + "shell.execute_reply": "2023-09-29T06:40:39.404915Z" }, "papermill": { - "duration": 6.543652, - "end_time": "2023-09-29T05:55:10.222175", + "duration": 3.890278, + "end_time": "2023-09-29T06:40:39.407623", "exception": false, - "start_time": "2023-09-29T05:55:03.678523", + "start_time": "2023-09-29T06:40:35.517345", "status": "completed" }, "tags": [] @@ -17833,7 +15364,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[2023-09-29 05:55:08,169] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n" + "[2023-09-29 06:40:38,394] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n" ] }, { @@ -17879,14 +15410,14 @@ }, "papermill": { "default_parameters": {}, - "duration": 416.415898, - "end_time": "2023-09-29T05:55:10.831493", + "duration": 363.056705, + "end_time": "2023-09-29T06:40:39.597218", "environment_variables": {}, "exception": null, "input_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-part1.ipynb", "output_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-part1.ipynb", "parameters": {}, - "start_time": "2023-09-29T05:48:14.415595", + "start_time": "2023-09-29T06:34:36.540513", "version": "2.4.0" } },