# RWKV v5 multi-size training experiment

**Note:** This project assumes you have the `rwkv-infctx` conda environment set up.

# Basic Setup

In [1]:
# First lets setup the various directories, and init the model
!mkdir -p ../../../../model/
!mkdir -p ../../../../datapath/
!mkdir -p ../../../../checkpoint/

In [2]:
# Experiment configuration: every value a reader might tune, plus the
# directory layout the later shell cells interpolate via {NAME}.
import os

# --- Training backend -------------------------------------------------------
DEEPSPEED_STRAT = "deepspeed_stage_1"
GPU_DEVICES = "auto"
ENABLE_WANDB = True

# --- Model shape / init -----------------------------------------------------
EMBED_SCALE = 0.01
# Filename-safe rendering of the scale (e.g. 0.01 -> "0_01").
EMBED_SCALE_LABEL = str(EMBED_SCALE).replace(".", "_")

LAYER_COUNT = 6
EMBED_SIZE = 2048

# Run-name prefix for wandb, and the matching prefix for model/checkpoint files.
WANDB_PREFIX = f"[Multi-size] v5-L{LAYER_COUNT}-D{EMBED_SIZE}-E{EMBED_SCALE}"
FILENAME_PREFIX = f"v5-L{LAYER_COUNT}-D{EMBED_SIZE}-E{EMBED_SCALE_LABEL}"

print("DEEPSPEED_STRAT:", DEEPSPEED_STRAT)
print("ENABLE_WANDB:", ENABLE_WANDB)
print("GPU_DEVICES:", GPU_DEVICES)

# Exported to the trainer through the WANDB_MODE environment variable.
WANDB_MODE = "online" if ENABLE_WANDB else "disabled"

# --- Paths ------------------------------------------------------------------
# `__file__` is not defined inside a notebook kernel, so the notebook directory
# is taken to be the kernel's working directory. (The previous
# dirname(abspath("__file__")) spelling resolved to exactly this value, but
# read as if it referenced a real `__file__`.)
NOTEBOOK_DIR = os.getcwd()
PROJECT_DIR = os.path.abspath(os.path.join(NOTEBOOK_DIR, "../../../../"))
# Trainer and inference scripts currently live in the same RWKV-v5 directory.
TRAINER_DIR = os.path.abspath(os.path.join(PROJECT_DIR, "./RWKV-v5/"))
INFERENCE_DIR = os.path.abspath(os.path.join(PROJECT_DIR, "./RWKV-v5/"))

print("NOTEBOOK_DIR:", NOTEBOOK_DIR)
print("INFERENCE_DIR:", INFERENCE_DIR)
print("TRAINER_DIR:", TRAINER_DIR)
print("PROJECT_DIR:", PROJECT_DIR)

DEEPSPEED_STRAT: deepspeed_stage_1
ENABLE_WANDB: True
GPU_DEVICES: auto
NOTEBOOK_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train
INFERENCE_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5
TRAINER_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5
PROJECT_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer


In [3]:
# Init the model
!cd "{TRAINER_DIR}" && \
 python3 ./init_model.py \
 --n_layer {LAYER_COUNT} --n_embd {EMBED_SIZE} \
 --emb-scale "{EMBED_SCALE}" \
 --vocab_size neox --skip-if-exists \
 "../model/{FILENAME_PREFIX}-neox-v5base-init.pth"

[2023-09-29 04:50:16,856] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
---- Initializing model ----
No of layers: 6
Embedding size: 2048
Output model path: ../model/v5-L6-D2048-E0_01-neox-v5base-init.pth
Vocab size: 50277
Emb scale: 0.01
Note: this process takes a significant time (and ram) for large models
---- ----- ----


50277 2048 -0.01 emb.weight


2048 2048 1.0 blocks.0.att.gate.weight


2048 2048 1.0 blocks.0.att.receptance.weight


2048 2048 1.0 blocks.0.att.key.weight


2048 2048 1.0 blocks.0.att.value.weight


2048 2048 0 blocks.0.att.output.weight
7168 2048 1.0 blocks.0.ffn.key.weight


2048 2048 0 blocks.0.ffn.receptance.weight
2048 7168 0 blocks.0.ffn.value.weight


2048 2048 1.0 blocks.1.att.gate.weight


2048 2048 1.0 blocks.1.att.receptance.weight


2048 2048 1.0 blocks.1.att.key.weight


2048 2048 1.0 blocks.1.att.value.weight


2048 2048 0 blocks.1.att.output.weight
7168 2048 1.0 blocks.1.ffn.key.weight


2048 2048 0 blocks.1.ffn.receptance.weight
2048 7168 0 blocks.1.ffn.value.weight
2048 2048 1.0 blocks.2.att.gate.weight


2048 2048 1.0 blocks.2.att.receptance.weight


2048 2048 1.0 blocks.2.att.key.weight


2048 2048 1.0 blocks.2.att.value.weight


2048 2048 0 blocks.2.att.output.weight


7168 2048 1.0 blocks.2.ffn.key.weight


2048 2048 0 blocks.2.ffn.receptance.weight


2048 7168 0 blocks.2.ffn.value.weight
2048 2048 1.0 blocks.3.att.gate.weight


2048 2048 1.0 blocks.3.att.receptance.weight


2048 2048 1.0 blocks.3.att.key.weight


2048 2048 1.0 blocks.3.att.value.weight


2048 2048 0 blocks.3.att.output.weight
7168 2048 1.0 blocks.3.ffn.key.weight


2048 2048 0 blocks.3.ffn.receptance.weight
2048 7168 0 blocks.3.ffn.value.weight


2048 2048 1.0 blocks.4.att.gate.weight


2048 2048 1.0 blocks.4.att.receptance.weight


2048 2048 1.0 blocks.4.att.key.weight


2048 2048 1.0 blocks.4.att.value.weight


2048 2048 0 blocks.4.att.output.weight
7168 2048 1.0 blocks.4.ffn.key.weight


2048 2048 0 blocks.4.ffn.receptance.weight
2048 7168 0 blocks.4.ffn.value.weight


2048 2048 1.0 blocks.5.att.gate.weight


2048 2048 1.0 blocks.5.att.receptance.weight


2048 2048 1.0 blocks.5.att.key.weight


2048 2048 1.0 blocks.5.att.value.weight


2048 2048 0 blocks.5.att.output.weight


7168 2048 1.0 blocks.5.ffn.key.weight


2048 2048 0 blocks.5.ffn.receptance.weight
2048 7168 0 blocks.5.ffn.value.weight
50277 2048 0.5 head.weight


## Enwiki Stage 1: Foundation 4k model training

In [4]:
# Lets preload the requried dataset 
!cd "{TRAINER_DIR}" && \
 python3 preload_datapath.py "{NOTEBOOK_DIR}/v5base-enwiki-4k-part1.yaml"

Traceback (most recent call last):
 File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/preload_datapath.py", line 20, in 
 assert os.path.exists(config_file), "Config file does not exist"
AssertionError: Config file does not exist


In [5]:
# Start the foundation model training
!cd "{TRAINER_DIR}" && \
 export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
 export WANDB_MODE="{WANDB_MODE}" && \
 python lightning_trainer.py fit \
 -c "{NOTEBOOK_DIR}/v5base-enwiki-4k.yaml" \
 --trainer.logger.init_args.name="{WANDB_PREFIX} - Enwiki-4k Part 1 (train-ctx=4k, {DEEPSPEED_STRAT})" \
 --trainer.strategy="{DEEPSPEED_STRAT}" \
 --trainer.devices="{GPU_DEVICES}" \
 --trainer.callbacks.init_args.dirpath="../checkpoint/{FILENAME_PREFIX}-enwiki-4k-p1/" \
 --model.load_model="../model/{FILENAME_PREFIX}-neox-v5base-init.pth" \
 --model.ctx_len=4096 \
 --model.bptt_learning_range=1

/usr/bin/sh: 1: cd: can't cd to {TRAINER_DIR}


In [6]:
# Lets export the model from the checkpoint
!cd "{TRAINER_DIR}" && \
 python export_checkpoint.py "../checkpoint/{FILENAME_PREFIX}-enwiki-4k-p1/last.ckpt" "../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth" "bf16"
!cd "{TRAINER_DIR}" && ls -alh "../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth"

/usr/bin/sh: 1: python: not found


ls: cannot access '../model/v5-L6-D2048-E0_01-enwiki-4k-p1.pth': No such file or directory


In [7]:
# # Lets do a quick dragon prompt validation
!cd "{INFERENCE_DIR}" && \
 export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
 python3 dragon_test.py "../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth" "cuda fp32"

/usr/bin/sh: 1: cd: can't cd to {INFERENCE_DIR}
