# RWKV v5-headsize2x / embedding init-range 1e-01 / 4k

- 6 layers
- 4096 embedding size

This notebook runs the modified memory training for v5 models, as part of a sweep across different initial embedding weight scales (this run uses an embedding init range of 0.1).

**Note:** This project assumes you have the rwkv-infctx conda env set up

# Basic Setup

In [1]:
# First lets setup the various directories, and init the model
!mkdir -p ../../../../model/
!mkdir -p ../../../../datapath/
!mkdir -p ../../../../checkpoint/

In [2]:
# Additional dependencies for eval stuff
!pip install -q aiocsv aiofiles

[0m


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m


In [3]:
import os

# --- Trainer / logging configuration ---
DEEPSPEED_STRAT = "deepspeed_stage_2_offload"
GPU_DEVICES = "auto"
ENABLE_WANDB = True

# Exported as an environment variable by the shell cells further down
RWKV_WAVENET_LAYERS = 1

# Embedding init scale; the label form replaces "." with "_" for use in filenames
EMBED_SCALE = 0.1
EMBED_SCALE_LABEL = str(EMBED_SCALE).replace(".", "_")

# Model dimensions
LAYER_COUNT = 6
EMBED_DIM = 4096

# Run name prefix (keeps the raw scale) and checkpoint filename prefix (uses the label)
WANDB_PREFIX = f"v5-hs2x-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE}"
FILENAME_PREFIX = f"v5-hs2x-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE_LABEL}"

for _label, _value in (
    ("DEEPSPEED_STRAT:", DEEPSPEED_STRAT),
    ("ENABLE_WANDB:", ENABLE_WANDB),
    ("GPU_DEVICES:", GPU_DEVICES),
):
    print(_label, _value)

# wandb mode string derived from the ENABLE_WANDB toggle
WANDB_MODE = "online" if ENABLE_WANDB else "disabled"

# --- Notebook and project paths ---
# "__file__" is passed as a literal string, so abspath() resolves it against the
# current working directory and dirname() strips it off again: NOTEBOOK_DIR ends up
# being the directory the kernel was started in.
NOTEBOOK_DIR = os.path.dirname(os.path.abspath("__file__"))
PROJECT_DIR = os.path.abspath(os.path.join(NOTEBOOK_DIR, "../../../../"))
# Trainer and inference code both point at the same RWKV-v5headsize2x directory
TRAINER_DIR = os.path.abspath(os.path.join(PROJECT_DIR, "./RWKV-v5headsize2x/"))
INFERENCE_DIR = os.path.abspath(os.path.join(PROJECT_DIR, "./RWKV-v5headsize2x/"))

for _label, _value in (
    ("NOTEBOOK_DIR:", NOTEBOOK_DIR),
    ("INFERENCE_DIR:", INFERENCE_DIR),
    ("TRAINER_DIR:", TRAINER_DIR),
    ("PROJECT_DIR:", PROJECT_DIR),
):
    print(_label, _value)

DEEPSPEED_STRAT: deepspeed_stage_2_offload
ENABLE_WANDB: True
GPU_DEVICES: auto
NOTEBOOK_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-headsize2x
INFERENCE_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5headsize2x
TRAINER_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5headsize2x
PROJECT_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer


In [4]:
# Init the model
!cd "{TRAINER_DIR}" && \
 export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
 python3 ./init_model.py \
 --n_layer 6 --n_embd 4096 \
 --emb-scale "{EMBED_SCALE}" \
 --vocab_size neox --skip-if-exists \
 "../model/L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE_LABEL}-neox-v5base-init.pth"

Setting ds_accelerator to cuda (auto detect)


[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
---- Initializing model ----
No of layers: 6
Embedding size: 4096
Output model path: ../model/L6-D4096-E0_1-neox-v5base-init.pth
Vocab size: 50277
Emb scale: 0.1
Note: this process takes a significant time (and ram) for large models
---- ----- ----


50277 4096 -0.1 emb.weight


4096 4096 1.0 blocks.0.att.receptance.weight


4096 4096 1.0 blocks.0.att.key.weight


4096 4096 1.0 blocks.0.att.value.weight


4096 4096 0 blocks.0.att.output.weight


16384 4096 1.0 blocks.0.ffn.key.weight


4096 4096 0 blocks.0.ffn.receptance.weight


4096 16384 0 blocks.0.ffn.value.weight


4096 4096 1.0 blocks.1.att.receptance.weight


4096 4096 1.0 blocks.1.att.key.weight


4096 4096 1.0 blocks.1.att.value.weight


4096 4096 0 blocks.1.att.output.weight


16384 4096 1.0 blocks.1.ffn.key.weight


4096 4096 0 blocks.1.ffn.receptance.weight
4096 16384 0 blocks.1.ffn.value.weight


4096 4096 1.0 blocks.2.att.receptance.weight


4096 4096 1.0 blocks.2.att.key.weight


4096 4096 1.0 blocks.2.att.value.weight


4096 4096 0 blocks.2.att.output.weight
16384 4096 1.0 blocks.2.ffn.key.weight


4096 4096 0 blocks.2.ffn.receptance.weight
4096 16384 0 blocks.2.ffn.value.weight


4096 4096 1.0 blocks.3.att.receptance.weight


4096 4096 1.0 blocks.3.att.key.weight


4096 4096 1.0 blocks.3.att.value.weight


4096 4096 0 blocks.3.att.output.weight


16384 4096 1.0 blocks.3.ffn.key.weight


4096 4096 0 blocks.3.ffn.receptance.weight


4096 16384 0 blocks.3.ffn.value.weight


4096 4096 1.0 blocks.4.att.receptance.weight


4096 4096 1.0 blocks.4.att.key.weight


4096 4096 1.0 blocks.4.att.value.weight


4096 4096 0 blocks.4.att.output.weight


16384 4096 1.0 blocks.4.ffn.key.weight


4096 4096 0 blocks.4.ffn.receptance.weight
4096 16384 0 blocks.4.ffn.value.weight


4096 4096 1.0 blocks.5.att.receptance.weight


4096 4096 1.0 blocks.5.att.key.weight


4096 4096 1.0 blocks.5.att.value.weight


4096 4096 0 blocks.5.att.output.weight
16384 4096 1.0 blocks.5.ffn.key.weight


4096 4096 0 blocks.5.ffn.receptance.weight
4096 16384 0 blocks.5.ffn.value.weight


50277 4096 0.5 head.weight


## Enwiki Stage 1 : Foundation 4k model training

In [5]:
# Lets preload the requried dataset 
!cd "{TRAINER_DIR}" && \
 python3 preload_datapath.py "{NOTEBOOK_DIR}/v5base-enwiki-4k.yaml"

Traceback (most recent call last):
 File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5headsize2x/preload_datapath.py", line 37, in 
 dataMod.prepare_data()
 File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5headsize2x/src/data.py", line 465, in prepare_data
 prepare_data_static(**self._init_locals)
 File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5headsize2x/src/data.py", line 94, in prepare_data_static
 src_dataset = load_dataset(**load_dataset_params)
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 File "/usr/local/lib/python3.11/dist-packages/datasets/load.py", line 1785, in load_dataset
 builder_instance = load_dataset_builder(
 ^^^^^^^^^^^^^^^^^^^^^
 File "/usr/local/lib/python3.11/dist-packages/datasets/load.py", line 1514, in load_dataset_builder
 dataset_module = dataset_module_factory(
 ^^^^^^^^^^^^^^^^^^^^^^^
 File "/usr/local/lib/python3.11/dist-packages/datasets/load.py", line 1231, in dataset_m

In [6]:
# Lets do a quick memory test
!export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
 python3 ../memory_script/eval_v5_memory_guided.py "{PROJECT_DIR}/model/{FILENAME_PREFIX}-enwiki-4k.pth"

Setting ds_accelerator to cuda (auto detect)


[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'


 rank_zero_warn(


 rank_zero_warn(f"No seed found, seed set to {seed}")
Global seed set to 2838735928


[34m[1mwandb[0m: Currently logged in as: [33mpicocreator[0m ([33mrwkv-x-dev[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: wandb version 0.15.8 is available! To upgrade, please run:
[34m[1mwandb[0m: $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.15.4
[34m[1mwandb[0m: Run data is saved locally in [35m[1m./wandb/run-20230823_112943-wdxosswf[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mv5-hs2x-L6-D4096-E0.1 - Enwiki-4k Foundation (train-ctx=4k, deepspeed_stage_2_offload)[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/rwkv-x-dev/RWKV-X-Experiments[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/rwkv-x-dev/RWKV-X-Experiments/runs/wdxosswf[0m


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


[RWKV.Trainer] Applying 'target_batch_size' with the following:
 - target_batch_size: 32
 - num_nodes: 1
 - num_devices: 8
 - accumulate_grad_batches: 4
 - effective_batch_size: 32



Setting ds_accelerator to cuda (auto detect)


Setting ds_accelerator to cuda (auto detect)


Setting ds_accelerator to cuda (auto detect)
Setting ds_accelerator to cuda (auto detect)
Setting ds_accelerator to cuda (auto detect)


Setting ds_accelerator to cuda (auto detect)
Setting ds_accelerator to cuda (auto detect)


[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'


[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'


[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'


[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'


[rank: 5] Global seed set to 2838735928


[rank: 3] Global seed set to 2838735928


[rank: 7] Global seed set to 2838735928


[rank: 6] Global seed set to 2838735928
[rank: 1] Global seed set to 2838735928
[rank: 4] Global seed set to 2838735928


[rank: 2] Global seed set to 2838735928


[rank: 5] Global seed set to 2838735928
initializing deepspeed distributed: GLOBAL_RANK: 5, MEMBER: 6/8


[rank: 4] Global seed set to 2838735928
initializing deepspeed distributed: GLOBAL_RANK: 4, MEMBER: 5/8


[rank: 2] Global seed set to 2838735928
initializing deepspeed distributed: GLOBAL_RANK: 2, MEMBER: 3/8


[rank: 6] Global seed set to 2838735928
initializing deepspeed distributed: GLOBAL_RANK: 6, MEMBER: 7/8


[rank: 3] Global seed set to 2838735928
initializing deepspeed distributed: GLOBAL_RANK: 3, MEMBER: 4/8


[rank: 1] Global seed set to 2838735928
initializing deepspeed distributed: GLOBAL_RANK: 1, MEMBER: 2/8


[rank: 7] Global seed set to 2838735928
initializing deepspeed distributed: GLOBAL_RANK: 7, MEMBER: 8/8


Downloading readme: 0%| | 0.00/433 [00:00=12.1), as this is known to have freeze issues
# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications
# - When resuming from checkpoint, the estimated time is inaccurate
#

[RWKV.model] Configuring optimizer with
 - lr_init: 6.000e-04 (0.0006)
 - lr_final: 4.000e-04 (0.0004)



Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...


Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/cpu_adam/build.ninja...
Building extension module cpu_adam...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


ninja: no work to do.
Loading extension module cpu_adam...
Time to load cpu_adam op: 2.3307251930236816 seconds
Loading extension module cpu_adam...
Time to load cpu_adam op: 2.3550989627838135 seconds


Loading extension module cpu_adam...
Loading extension module cpu_adam...
Loading extension module cpu_adam...
Time to load cpu_adam op: 2.3721420764923096 seconds
Time to load cpu_adam op: 2.369891881942749 seconds
Time to load cpu_adam op: 2.3694915771484375 seconds
Loading extension module cpu_adam...
Loading extension module cpu_adam...
Loading extension module cpu_adam...
Time to load cpu_adam op: 2.3722288608551025 seconds
Time to load cpu_adam op: 2.37237811088562 seconds
Time to load cpu_adam op: 2.3751280307769775 seconds


Loading `train_dataloader` to estimate number of stepping batches.


Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...


Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/utils/build.ninja...
Building extension module utils...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module utils...
Time to load utils op: 0.07468938827514648 seconds


Loading extension module utils...
Loading extension module utils...
Loading extension module utils...
Loading extension module utils...
Loading extension module utils...
Time to load utils op: 0.10226869583129883 seconds
Time to load utils op: 0.1022031307220459 seconds
Time to load utils op: 0.10259532928466797 seconds
Time to load utils op: 0.10230875015258789 seconds
Time to load utils op: 0.10206985473632812 seconds
Loading extension module utils...
Loading extension module utils...
Time to load utils op: 0.10176777839660645 seconds
Time to load utils op: 0.10233497619628906 seconds


Rank: 1 partition count [8, 8] and sizes[(215097344, False), (48, False)] 


Rank: 2 partition count [8, 8] and sizes[(215097344, False), (48, False)] 
Rank: 0 partition count [8, 8] and sizes[(215097344, False), (48, False)] 


Rank: 5 partition count [8, 8] and sizes[(215097344, False), (48, False)] 


Rank: 3 partition count [8, 8] and sizes[(215097344, False), (48, False)] 


Rank: 4 partition count [8, 8] and sizes[(215097344, False), (48, False)] 


Rank: 6 partition count [8, 8] and sizes[(215097344, False), (48, False)] 


Rank: 7 partition count [8, 8] and sizes[(215097344, False), (48, False)] 


Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...
Time to load utils op: 0.0008993148803710938 seconds


Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...
Time to load utils op: 0.0008089542388916016 seconds


Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...
Time to load utils op: 0.01240086555480957 seconds


Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...
Time to load utils op: 0.0007753372192382812 seconds


Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...
Time to load utils op: 0.003259420394897461 seconds


Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...


Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...
Time to load utils op: 0.000774383544921875 seconds
No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...
Time to load utils op: 0.02939915657043457 seconds


Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...
Time to load utils op: 0.0010874271392822266 seconds

 | Name | Type | Params
--------------------------------------
0 | emb | Embedding | 205 M 
1 | blocks | ModuleList | 1.3 B 
2 | ln_out | LayerNorm | 8.2 K 
3 | head | Linear | 205 M 
--------------------------------------
1.7 B Trainable params
0 Non-trainable params
1.7 B Total params
6,883.117 Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]Training: 0%| | 0/10186 [00:00
 asyncio.run(main_function())
 File "/usr/lib/python3.11/asyncio/runners.py", line 190, in run
 return runner.run(main)
 ^^^^^^^^^^^^^^^^
 File "/usr/lib/python3.11/asyncio/runners.py", line 118, in run
 return self._loop.run_until_complete(task)
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 File "/usr/lib/python3.11/asyncio/base_events.py", line 653, in run_until_complete
 return future.result()
 ^^^^^^^^^^^^^^^
 File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-headsize2x/../memory_script/eval_v5_memory_guided.py", line 58, in main_function
 model = SimpleRWKV(model_path, device="cuda")
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py", line 1378, in __init__
 self.model = RWKV(**model_config)
 ^^^^^^^^^^^^^^^^^^^^
 File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RW

## Enwiki Stage 2 : Basic Instruct Tuning

In [10]:
# Lets do a quick memory test
!export RWKV_WAVENET_LAYERS="{RWKV_WAVENET_LAYERS}" && \
 python3 ../memory_script/eval_v5_memory_guided.py "{PROJECT_DIR}/model/{FILENAME_PREFIX}-enwiki-instruct.pth"

Downloading readme: 0%| | 0.00/7.79k [00:00=12.1), as this is known to have freeze issues
# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications
# - When resuming from checkpoint, the estimated time is inaccurate
#

[RWKV.model] Configuring optimizer with
 - lr_init: 4.000e-04 (0.0004)
 - lr_final: 3.000e-04 (0.0003)



Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/cpu_adam/build.ninja...
Building extension module cpu_adam...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


ninja: no work to do.
Loading extension module cpu_adam...
Time to load cpu_adam op: 2.323086738586426 seconds


Loading extension module cpu_adam...
Time to load cpu_adam op: 2.3568384647369385 seconds
Loading extension module cpu_adam...
Time to load cpu_adam op: 2.363503932952881 seconds
Loading extension module cpu_adam...
Time to load cpu_adam op: 2.3699283599853516 seconds
Loading extension module cpu_adam...
Time to load cpu_adam op: 2.37654972076416 seconds
Loading extension module cpu_adam...
Time to load cpu_adam op: 2.372668504714966 seconds
Loading extension module cpu_adam...
Loading extension module cpu_adam...
Time to load cpu_adam op: 2.3785459995269775 seconds
Time to load cpu_adam op: 2.3747076988220215 seconds


Loading `train_dataloader` to estimate number of stepping batches.


Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
Emitting ninja build file /root/.cache/torch_extensions/py311_cu118/utils/build.ninja...
Building extension module utils...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


ninja: no work to do.
Loading extension module utils...
Time to load utils op: 0.06939339637756348 seconds


Loading extension module utils...
Loading extension module utils...
Loading extension module utils...
Loading extension module utils...
Loading extension module utils...
Time to load utils op: 0.1026608943939209 seconds
Loading extension module utils...
Time to load utils op: 0.10243606567382812 seconds
Loading extension module utils...
Time to load utils op: 0.10294842720031738 seconds
Time to load utils op: 0.10233807563781738 seconds
Time to load utils op: 0.10210633277893066 seconds
Time to load utils op: 0.10252761840820312 seconds
Time to load utils op: 0.10194945335388184 seconds


Rank: 3 partition count [8, 8] and sizes[(215097344, False), (48, False)] 


Rank: 7 partition count [8, 8] and sizes[(215097344, False), (48, False)] 
Rank: 6 partition count [8, 8] and sizes[(215097344, False), (48, False)] 


Rank: 2 partition count [8, 8] and sizes[(215097344, False), (48, False)] 


Rank: 5 partition count [8, 8] and sizes[(215097344, False), (48, False)] 


Rank: 0 partition count [8, 8] and sizes[(215097344, False), (48, False)] 


Rank: 1 partition count [8, 8] and sizes[(215097344, False), (48, False)] 


Rank: 4 partition count [8, 8] and sizes[(215097344, False), (48, False)] 


Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...
Time to load utils op: 0.023431777954101562 seconds


Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...
Time to load utils op: 0.005431175231933594 seconds


Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...


No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...
Time to load utils op: 0.025610923767089844 seconds
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...
Time to load utils op: 0.0012531280517578125 seconds


Time to load utils op: 0.01864337921142578 seconds


Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...
Time to load utils op: 0.00513148307800293 seconds
Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...
Time to load utils op: 0.000774383544921875 seconds


Using /root/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...
No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...
Time to load utils op: 0.0008399486541748047 seconds

 | Name | Type | Params
--------------------------------------
0 | emb | Embedding | 205 M 
1 | blocks | ModuleList | 1.3 B 
2 | ln_out | LayerNorm | 8.2 K 
3 | head | Linear | 205 M 
--------------------------------------
1.7 B Trainable params
0 Non-trainable params
1.7 B Total params
6,883.117 Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]Training: 0%| | 0/1867 [00:00
 asyncio.run(main_function())
 File "/usr/lib/python3.11/asyncio/runners.py", line 190, in run
 return runner.run(main)
 ^^^^^^^^^^^^^^^^
 File "/usr/lib/python3.11/asyncio/runners.py", line 118, in run
 return self._loop.run_until_complete(task)
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 File "/usr/lib/python3.11/asyncio/base_events.py", line 653, in run_until_complete
 return future.result()
 ^^^^^^^^^^^^^^^
 File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-headsize2x/../memory_script/eval_v5_memory_guided.py", line 58, in main_function
 model = SimpleRWKV(model_path, device="cuda")
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py", line 1378, in __init__
 self.model = RWKV(**model_config)
 ^^^^^^^^^^^^^^^^^^^^
 File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWK