{ "cells": [ { "attachments": {}, "cell_type": "markdown", "id": "ee8cfec1", "metadata": { "papermill": { "duration": 0.001903, "end_time": "2023-08-23T19:20:40.353029", "exception": false, "start_time": "2023-08-23T19:20:40.351126", "status": "completed" }, "tags": [] }, "source": [ "# Model Init\n", "\n", "Test that the model init code runs without issues.\n", "\n", "**L6-D512 model with**\n", "- Layer count: 6\n", "- Embed size: 512" ] }, { "attachments": {}, "cell_type": "markdown", "id": "7713f8b0", "metadata": { "notebookRunGroups": { "groupValue": "" }, "papermill": { "duration": 0.001217, "end_time": "2023-08-23T19:20:40.355733", "exception": false, "start_time": "2023-08-23T19:20:40.354516", "status": "completed" }, "tags": [] }, "source": [ "## Preparing the init model and test dataset" ] }, { "cell_type": "code", "execution_count": 1, "id": "54684bc2", "metadata": { "execution": { "iopub.execute_input": "2023-08-23T19:20:40.360445Z", "iopub.status.busy": "2023-08-23T19:20:40.359491Z", "iopub.status.idle": "2023-08-23T19:20:41.112004Z", "shell.execute_reply": "2023-08-23T19:20:41.111011Z" }, "papermill": { "duration": 0.757092, "end_time": "2023-08-23T19:20:41.114108", "exception": false, "start_time": "2023-08-23T19:20:40.357016", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# First, let's set up the various directories\n", "!mkdir -p ../../model/\n", "!mkdir -p ../../datapath/\n", "!mkdir -p ../../checkpoint/" ] }, { "cell_type": "code", "execution_count": 2, "id": "80da2afe", "metadata": { "execution": { "iopub.execute_input": "2023-08-23T19:20:41.119086Z", "iopub.status.busy": "2023-08-23T19:20:41.118584Z", "iopub.status.idle": "2023-08-23T19:20:50.840612Z", "shell.execute_reply": "2023-08-23T19:20:50.839437Z" }, "papermill": { "duration": 9.727624, "end_time": "2023-08-23T19:20:50.843488", "exception": false, "start_time": "2023-08-23T19:20:41.115864", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", 
"output_type": "stream", "text": [ "[2023-08-23 19:20:45,786] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "---- Initializing model ----\r\n", "No of layers: 6\r\n", "Embedding size: 512\r\n", "Output model path: ../model/L6-D512-neox-init.pth\r\n", "Vocab size: 50277\r\n", "Note: this process takes a significant time (and ram) for large models\r\n", "---- ----- ----\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Using /root/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...\r\n", "Detected CUDA files, patching ldflags\r\n", "Emitting ninja build file /root/.cache/torch_extensions/py310_cu118/wkv_1_bf16/build.ninja...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Building extension module wkv_1_bf16...\r\n", "Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "ninja: no work to do.\r\n", "Loading extension module wkv_1_bf16...\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[RWKV.model]: Finished initial model load\r\n", "50277 512 -0.0001 emb.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "512 512 0 blocks.0.att.key.weight\r\n", "512 512 1.0 blocks.0.att.value.weight\r\n", "512 512 0 blocks.0.att.receptance.weight\r\n", "512 512 0 blocks.0.att.output.weight\r\n", "2048 512 1.0 blocks.0.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "512 512 0 blocks.0.ffn.receptance.weight\r\n", "512 2048 0 blocks.0.ffn.value.weight\r\n", "512 512 0 blocks.1.att.key.weight\r\n", "512 512 1.0 blocks.1.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "512 512 0 blocks.1.att.receptance.weight\r\n", "512 512 0 blocks.1.att.output.weight\r\n", "2048 512 1.0 blocks.1.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "512 512 0 blocks.1.ffn.receptance.weight\r\n", "512 2048 0 blocks.1.ffn.value.weight\r\n", "512 512 0 blocks.2.att.key.weight\r\n", "512 512 1.0 blocks.2.att.value.weight\r\n", "512 512 0 blocks.2.att.receptance.weight\r\n", "512 512 0 blocks.2.att.output.weight\r\n", "2048 512 1.0 blocks.2.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "512 512 0 blocks.2.ffn.receptance.weight\r\n", "512 2048 0 blocks.2.ffn.value.weight\r\n", "512 512 0 blocks.3.att.key.weight\r\n", "512 512 1.0 blocks.3.att.value.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "512 512 0 blocks.3.att.receptance.weight\r\n", "512 512 0 blocks.3.att.output.weight\r\n", "2048 512 1.0 blocks.3.ffn.key.weight\r\n", "512 512 0 blocks.3.ffn.receptance.weight\r\n", "512 2048 0 blocks.3.ffn.value.weight\r\n" ] }, { "name": "stdout", "output_type": 
"stream", "text": [ "512 512 0 blocks.4.att.key.weight\r\n", "512 512 1.0 blocks.4.att.value.weight\r\n", "512 512 0 blocks.4.att.receptance.weight\r\n", "512 512 0 blocks.4.att.output.weight\r\n", "2048 512 1.0 blocks.4.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "512 512 0 blocks.4.ffn.receptance.weight\r\n", "512 2048 0 blocks.4.ffn.value.weight\r\n", "512 512 0 blocks.5.att.key.weight\r\n", "512 512 1.0 blocks.5.att.value.weight\r\n", "512 512 0 blocks.5.att.receptance.weight\r\n", "512 512 0 blocks.5.att.output.weight\r\n", "2048 512 1.0 blocks.5.ffn.key.weight\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "512 512 0 blocks.5.ffn.receptance.weight\r\n", "512 2048 0 blocks.5.ffn.value.weight\r\n", "50277 512 0.5 head.weight\r\n" ] } ], "source": [ "# Let's initialize the L6-D512 model with the init_model.py code\n", "!cd ../../RWKV-v4neo/ && python3 init_model.py --n_layer 6 --n_embd 512 --vocab_size neox ../model/L6-D512-neox-init.pth" ] } ], "metadata": { "kernelspec": { "display_name": "rwkv-exp", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" }, "papermill": { "default_parameters": {}, "duration": 12.198584, "end_time": "2023-08-23T19:20:51.170107", "environment_variables": {}, "exception": null, "input_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/trainer-v4-unit-test/model-init.ipynb", "output_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/trainer-v4-unit-test/model-init.ipynb", "parameters": {}, "start_time": "2023-08-23T19:20:38.971523", "version": "2.4.0" } }, "nbformat": 4, "nbformat_minor": 5 }