# RWKV v5 multi-size training experiment

**Note:** This project assumes you have the `rwkv-infctx` conda env set up

# Basic Setup

In [1]:
# First lets setup the various directories, and init the model
!mkdir -p ../../../../model/
!mkdir -p ../../../../datapath/
!mkdir -p ../../../../checkpoint/

In [2]:
# Experiment configuration: launch settings, model sizing, naming prefixes and paths
import os

# Trainer launch settings (passed to lightning_trainer.py by the training cells)
DEEPSPEED_STRAT = "deepspeed_stage_2_offload"
GPU_DEVICES = "auto"
ENABLE_WANDB = True

# Model sizing
EMBED_SIZE = 2048
EMBED_SCALE = 0.01
# Filename-safe form of the embed scale ("." swapped for "_")
EMBED_SCALE_LABEL = str(EMBED_SCALE).replace(".", "_")

# Prefixes shared by the wandb run names and the model / checkpoint filenames
WANDB_PREFIX = f"[Multi-size] v5-L6+6-D{EMBED_SIZE}-E{EMBED_SCALE}"
FILENAME_PREFIX = f"v5-L6+6-D{EMBED_SIZE}-E{EMBED_SCALE_LABEL}"

print(f"DEEPSPEED_STRAT: {DEEPSPEED_STRAT}")
print(f"ENABLE_WANDB: {ENABLE_WANDB}")
print(f"GPU_DEVICES: {GPU_DEVICES}")

# Exported as the WANDB_MODE environment variable by the training cells
WANDB_MODE = "online" if ENABLE_WANDB else "disabled"

# Computing the notebook, and various paths.
# "__file__" is a plain string here, so it resolves against the current working directory.
NOTEBOOK_DIR = os.path.dirname(os.path.abspath("__file__"))
PROJECT_DIR = os.path.abspath(os.path.join(NOTEBOOK_DIR, "../../../../"))
TRAINER_DIR = os.path.abspath(os.path.join(PROJECT_DIR, "./RWKV-v5/"))
INFERENCE_DIR = os.path.abspath(os.path.join(PROJECT_DIR, "./RWKV-v5/"))

print(f"NOTEBOOK_DIR: {NOTEBOOK_DIR}")
print(f"INFERENCE_DIR: {INFERENCE_DIR}")
print(f"TRAINER_DIR: {TRAINER_DIR}")
print(f"PROJECT_DIR: {PROJECT_DIR}")

DEEPSPEED_STRAT: deepspeed_stage_2_offload
ENABLE_WANDB: True
GPU_DEVICES: auto
NOTEBOOK_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train
INFERENCE_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5
TRAINER_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5
PROJECT_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer


In [3]:
# Get the init L12 model, and download the L6 model
!cd "{PROJECT_DIR}/model/" && wget -nc "https://huggingface.co/rwkv-x-dev/rwkv-x-playground/resolve/main/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-enwiki-4k-p1.pth"
!cd "{PROJECT_DIR}/model/" && wget -nc "https://huggingface.co/rwkv-x-dev/rwkv-x-playground/resolve/main/experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-E0_01-neox-v5base-init.pth"

--2023-10-09 13:45:06-- https://huggingface.co/rwkv-x-dev/rwkv-x-playground/resolve/main/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-enwiki-4k-p1.pth
Resolving huggingface.co (huggingface.co)... 13.33.33.110, 13.33.33.20, 13.33.33.55, ...
Connecting to huggingface.co (huggingface.co)|13.33.33.110|:443... connected.
HTTP request sent, awaiting response... 

302 Found
Location: https://cdn-lfs.huggingface.co/repos/2e/f7/2ef78555202aa92abdbdf476ce3d0fd5a8b15f7245edf0b80d4d30572355f30d/235d88b0aa939596392f2b5734a426940535816aa13106498974a809051a4c75?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27v5-L6-D2048-E0_01-enwiki-4k-p1.pth%3B+filename%3D%22v5-L6-D2048-E0_01-enwiki-4k-p1.pth%22%3B&Expires=1697118306&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTY5NzExODMwNn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy8yZS9mNy8yZWY3ODU1NTIwMmFhOTJhYmRiZGY0NzZjZTNkMGZkNWE4YjE1ZjcyNDVlZGYwYjgwZDRkMzA1NzIzNTVmMzBkLzIzNWQ4OGIwYWE5Mzk1OTYzOTJmMmI1NzM0YTQyNjk0MDUzNTgxNmFhMTMxMDY0OTg5NzRhODA5MDUxYTRjNzU%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=mYDHt4qPU9P2R8jxStu6hpcsaYf2BJxybVQl7UxmG-XKJV07nwUZPobAk4lYRRGYfVYs0s7n%7EXzZAHQfpJLBnI38caOMBFB-KgCvGG44D5HX%7ErJ-oct2gxYuMbdA7CvMpolTV%7EEmyePEpzCoxFN0FMjIgz3w2jwCdEZfD1UMKU-QRWfCFBxNX97rai95wuqXDM6oC1QB4Jbz4TST9P

18.155.68.94, 18.155.68.73, 18.155.68.128, ...
Connecting to cdn-lfs.huggingface.co (cdn-lfs.huggingface.co)|18.155.68.94|:443... connected.
HTTP request sent, awaiting response... 

200 OK
Length: 1066537217 (1017M) [binary/octet-stream]
Saving to: ‘v5-L6-D2048-E0_01-enwiki-4k-p1.pth’

 v5-L6-D20 0%[ ] 0 --.-KB/s 

 v5-L6-D204 0%[ ] 26.25K 116KB/s 

 v5-L6-D2048 0%[ ] 58.25K 128KB/s 

 v5-L6-D2048- 0%[ ] 147.30K 217KB/s 

 v5-L6-D2048-E 0%[ ] 314.87K 347KB/s 

 v5-L6-D2048-E0 0%[ ] 648.86K 572KB/s 

 v5-L6-D2048-E0_ 0%[ ] 1.28M 961KB/s 

 v5-L6-D2048-E0_0 0%[ ] 2.59M 1.63MB/s 

 v5-L6-D2048-E0_01 0%[ ] 4.64M 2.55MB/s 

 v5-L6-D2048-E0_01- 0%[ ] 6.78M 3.31MB/s 

v5-L6-D2048-E0_01-e 0%[ ] 9.04M 3.98MB/s 

5-L6-D2048-E0_01-en 1%[ ] 11.40M 4.56MB/s 

-L6-D2048-E0_01-enw 1%[ ] 13.89M 5.09MB/s 

L6-D2048-E0_01-enwi 1%[ ] 16.48M 5.58MB/s 

6-D2048-E0_01-enwik 1%[ ] 19.21M 6.04MB/s eta 2m 45s 

-D2048-E0_01-enwiki 2%[ ] 22.09M 6.48MB/s eta 2m 45s 

D2048-E0_01-enwiki- 2%[ ] 25.11M 6.91MB/s eta 2m 45s 

2048-E0_01-enwiki-4 2%[ ] 28.28M 7.32MB/s eta 2m 45s 

048-E0_01-enwiki-4k 3%[ ] 31.59M 7.73MB/s eta 2m 45s 

48-E0_01-enwiki-4k- 3%[ ] 34.98M 8.10MB/s eta 2m 1s 

8-E0_01-enwiki-4k-p 3%[ ] 38.39M 8.42MB/s eta 2m 1s 

-E0_01-enwiki-4k-p1 4%[ ] 42.00M 9.22MB/s eta 2m 1s 

E0_01-enwiki-4k-p1. 4%[ ] 45.67M 10.0MB/s eta 2m 1s 

0_01-enwiki-4k-p1.p 4%[ ] 49.39M 10.8MB/s eta 2m 1s 

_01-enwiki-4k-p1.pt 5%[> ] 53.11M 11.6MB/s eta 99s 

01-enwiki-4k-p1.pth 5%[> ] 56.86M 12.3MB/s eta 99s 

1-enwiki-4k-p1.pth 5%[> ] 60.57M 13.0MB/s eta 99s 

-enwiki-4k-p1.pth 6%[> ] 64.29M 13.5MB/s eta 99s 

enwiki-4k-p1.pth 6%[> ] 67.73M 13.1MB/s eta 93s 

nwiki-4k-p1.pth 7%[> ] 71.81M 13.5MB/s eta 93s 

wiki-4k-p1.pth 7%[> ] 74.59M 13.6MB/s eta 93s 

iki-4k-p1.pth 7%[> ] 77.50M 13.8MB/s eta 93s 

ki-4k-p1.pth 7%[> ] 80.46M 13.9MB/s eta 93s 

i-4k-p1.pth 8%[> ] 83.51M 14.0MB/s eta 87s 

-4k-p1.pth 8%[> ] 86.59M 14.0MB/s eta 87s 

4k-p1.pth 8%[> ] 89.75M 14.1MB/s eta 87s 

k-p1.pth 9%[> ] 92.95M 14.1MB/s eta 87s 

-p1.pth 9%[> ] 96.20M 14.1MB/s eta 87s 

p1.pth 9%[> ] 99.50M 14.1MB/s eta 82s 

1.pth 10%[=> ] 102.82M 14.1MB/s eta 82s 

.pth 10%[=> ] 106.20M 14.2MB/s eta 82s 

pth 10%[=> ] 109.64M 14.1MB/s eta 82s 

th 11%[=> ] 113.11M 14.1MB/s eta 82s 

h 11%[=> ] 116.61M 14.1MB/s eta 77s 

 11%[=> ] 120.04M 14.0MB/s eta 77s 

 v 12%[=> ] 123.64M 14.0MB/s eta 77s 

 v5 12%[=> ] 126.54M 13.2MB/s eta 77s 

 v5- 12%[=> ] 130.29M 13.1MB/s eta 76s 

 v5-L 13%[=> ] 132.87M 13.6MB/s eta 76s 

 v5-L6 13%[=> ] 135.48M 13.3MB/s eta 76s 

 v5-L6- 13%[=> ] 138.12M 13.2MB/s eta 76s 

 v5-L6-D 13%[=> ] 140.82M 13.2MB/s eta 76s 

 v5-L6-D2 14%[=> ] 143.53M 13.1MB/s eta 75s 

 v5-L6-D20 14%[=> ] 146.29M 13.1MB/s eta 75s 

 v5-L6-D204 14%[=> ] 149.07M 13.0MB/s eta 75s 

 v5-L6-D2048 14%[=> ] 151.89M 12.9MB/s eta 75s 

 v5-L6-D2048- 15%[==> ] 154.71M 12.9MB/s eta 75s 

 v5-L6-D2048-E 15%[==> ] 157.14M 12.7MB/s eta 74s 

 v5-L6-D2048-E0 15%[==> ] 160.00M 12.6MB/s eta 74s 

 v5-L6-D2048-E0_ 16%[==> ] 162.89M 12.5MB/s eta 74s 

 v5-L6-D2048-E0_0 16%[==> ] 165.81M 12.4MB/s eta 74s 

 v5-L6-D2048-E0_01 16%[==> ] 168.73M 12.3MB/s eta 74s 

 v5-L6-D2048-E0_01- 16%[==> ] 171.68M 12.2MB/s eta 72s 

v5-L6-D2048-E0_01-e 17%[==> ] 174.64M 12.1MB/s eta 72s 

5-L6-D2048-E0_01-en 17%[==> ] 177.61M 12.0MB/s eta 72s 

-L6-D2048-E0_01-enw 17%[==> ] 180.59M 11.9MB/s eta 72s 

L6-D2048-E0_01-enwi 18%[==> ] 183.61M 12.5MB/s eta 72s 

6-D2048-E0_01-enwik 18%[==> ] 186.62M 12.4MB/s eta 70s 

-D2048-E0_01-enwiki 18%[==> ] 189.64M 12.5MB/s eta 70s 

D2048-E0_01-enwiki- 18%[==> ] 192.65M 12.6MB/s eta 70s 

2048-E0_01-enwiki-4 19%[==> ] 195.71M 12.7MB/s eta 70s 

048-E0_01-enwiki-4k 19%[==> ] 198.76M 12.8MB/s eta 70s 

48-E0_01-enwiki-4k- 19%[==> ] 201.82M 12.8MB/s eta 68s 

8-E0_01-enwiki-4k-p 20%[===> ] 204.87M 12.9MB/s eta 68s 

-E0_01-enwiki-4k-p1 20%[===> ] 207.95M 13.0MB/s eta 68s 

E0_01-enwiki-4k-p1. 20%[===> ] 211.03M 13.0MB/s eta 68s 

0_01-enwiki-4k-p1.p 21%[===> ] 214.12M 13.1MB/s eta 68s 

_01-enwiki-4k-p1.pt 21%[===> ] 217.18M 13.2MB/s eta 66s 

01-enwiki-4k-p1.pth 21%[===> ] 220.28M 13.3MB/s eta 66s 

1-enwiki-4k-p1.pth 21%[===> ] 223.37M 13.3MB/s eta 66s 

-enwiki-4k-p1.pth 22%[===> ] 226.48M 13.4MB/s eta 66s 

enwiki-4k-p1.pth 22%[===> ] 229.57M 13.4MB/s eta 66s 

nwiki-4k-p1.pth 22%[===> ] 232.65M 13.4MB/s eta 65s 

wiki-4k-p1.pth 23%[===> ] 235.75M 13.5MB/s eta 65s 

iki-4k-p1.pth 23%[===> ] 238.84M 13.5MB/s eta 65s 

ki-4k-p1.pth 23%[===> ] 241.95M 13.5MB/s eta 65s 

i-4k-p1.pth 24%[===> ] 245.04M 13.6MB/s eta 65s 

-4k-p1.pth 24%[===> ] 248.15M 13.6MB/s eta 63s 

4k-p1.pth 24%[===> ] 251.26M 13.6MB/s eta 63s 

k-p1.pth 25%[====> ] 254.36M 13.6MB/s eta 63s 

-p1.pth 25%[====> ] 257.45M 13.6MB/s eta 63s 

p1.pth 25%[====> ] 260.56M 13.6MB/s eta 63s 

1.pth 25%[====> ] 263.65M 13.6MB/s eta 61s 

.pth 26%[====> ] 266.76M 13.6MB/s eta 61s 

pth 26%[====> ] 269.86M 13.6MB/s eta 61s 

th 26%[====> ] 272.96M 13.7MB/s eta 61s 

h 27%[====> ] 276.07M 13.6MB/s eta 61s 

 27%[====> ] 279.18M 13.7MB/s eta 60s 

 v 27%[====> ] 282.31M 13.6MB/s eta 60s 

 v5 28%[====> ] 285.42M 13.7MB/s eta 60s 

 v5- 28%[====> ] 288.54M 13.7MB/s eta 60s 

 v5-L 28%[====> ] 291.65M 13.5MB/s eta 60s 

 v5-L6 28%[====> ] 294.78M 13.3MB/s eta 58s 

 v5-L6- 29%[====> ] 297.90M 13.8MB/s eta 58s 

 v5-L6-D 29%[====> ] 301.03M 13.7MB/s eta 58s 

 v5-L6-D2 29%[====> ] 304.17M 13.7MB/s eta 58s 

 v5-L6-D20 30%[=====> ] 307.28M 13.7MB/s eta 58s 

 v5-L6-D204 30%[=====> ] 310.40M 13.7MB/s eta 57s 

 v5-L6-D2048 30%[=====> ] 313.53M 13.7MB/s eta 57s 

 v5-L6-D2048- 31%[=====> ] 316.67M 13.7MB/s eta 57s 

 v5-L6-D2048-E 31%[=====> ] 319.82M 13.7MB/s eta 57s 

 v5-L6-D2048-E0 31%[=====> ] 322.98M 13.7MB/s eta 57s 

 v5-L6-D2048-E0_ 32%[=====> ] 326.07M 13.8MB/s eta 55s 

 v5-L6-D2048-E0_0 32%[=====> ] 329.18M 13.5MB/s eta 55s 

 v5-L6-D2048-E0_01 32%[=====> ] 332.28M 13.5MB/s eta 55s 

 v5-L6-D2048-E0_01- 32%[=====> ] 335.34M 13.5MB/s eta 55s 

v5-L6-D2048-E0_01-e 33%[=====> ] 338.54M 13.5MB/s eta 54s 

5-L6-D2048-E0_01-en 33%[=====> ] 341.71M 13.5MB/s eta 54s 

-L6-D2048-E0_01-enw 33%[=====> ] 344.93M 13.5MB/s eta 54s 

L6-D2048-E0_01-enwi 34%[=====> ] 348.18M 13.5MB/s eta 54s 

6-D2048-E0_01-enwik 34%[=====> ] 351.45M 13.6MB/s eta 54s 

-D2048-E0_01-enwiki 34%[=====> ] 354.68M 13.6MB/s eta 52s 


















































































































































































































































































































































































































2023-10-09 13:46:22 (13.5 MB/s) - ‘v5-L6-D2048-E0_01-enwiki-4k-p1.pth’ saved [1066537217/1066537217]



--2023-10-09 13:46:23-- https://huggingface.co/rwkv-x-dev/rwkv-x-playground/resolve/main/experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-E0_01-neox-v5base-init.pth
Resolving huggingface.co (huggingface.co)... 13.33.33.55, 13.33.33.110, 13.33.33.102, ...
Connecting to huggingface.co (huggingface.co)|13.33.33.55|:443... connected.
HTTP request sent, awaiting response... 

302 Found
Location: https://cdn-lfs.huggingface.co/repos/2e/f7/2ef78555202aa92abdbdf476ce3d0fd5a8b15f7245edf0b80d4d30572355f30d/06105d96413046fce0ec189b9c4685a813cfa7147300851c5d2afc7b5adbcb38?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27v5-L12-D2048-E0_01-neox-v5base-init.pth%3B+filename%3D%22v5-L12-D2048-E0_01-neox-v5base-init.pth%22%3B&Expires=1697118383&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTY5NzExODM4M319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy8yZS9mNy8yZWY3ODU1NTIwMmFhOTJhYmRiZGY0NzZjZTNkMGZkNWE4YjE1ZjcyNDVlZGYwYjgwZDRkMzA1NzIzNTVmMzBkLzA2MTA1ZDk2NDEzMDQ2ZmNlMGVjMTg5YjljNDY4NWE4MTNjZmE3MTQ3MzAwODUxYzVkMmFmYzdiNWFkYmNiMzg%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=kzZilHhZvddaNFoit6Wo-sHUcFMwnJ-lyCrMqv8rODaw3nx1-atQMmw2NJlKDSPTrDtq-nsdN%7EXRDadCZxNdfOnVh41qrAbj4Rb9lqg7CPls1GRWS6j2tw4ZMZ151dO1DsdimId0RZllQb1bW4cyoR7KyoqejnW8lflzejQQfDsdNwBo8Xq2sL%7ENJDHP0TD9VsH3MA7tQr

18.155.68.73, 18.155.68.128, 18.155.68.94, ...
Connecting to cdn-lfs.huggingface.co (cdn-lfs.huggingface.co)|18.155.68.73|:443... connected.


HTTP request sent, awaiting response... 

200 OK
Length: 1721189797 (1.6G) [binary/octet-stream]
Saving to: ‘v5-L12-D2048-E0_01-neox-v5base-init.pth’

 v5-L12-D2 0%[ ] 0 --.-KB/s 

 v5-L12-D20 0%[ ] 9.26K 40.6KB/s 

 v5-L12-D204 0%[ ] 41.26K 90.7KB/s 

 v5-L12-D2048 0%[ ] 109.26K 160KB/s 

 v5-L12-D2048- 0%[ ] 211.26K 232KB/s 

 v5-L12-D2048-E 0%[ ] 449.26K 394KB/s 

 v5-L12-D2048-E0 0%[ ] 925.26K 676KB/s 

 v5-L12-D2048-E0_ 0%[ ] 1.82M 1.14MB/s 

 v5-L12-D2048-E0_0 0%[ ] 3.65M 2.00MB/s 

 v5-L12-D2048-E0_01 0%[ ] 7.26M 3.53MB/s 

v5-L12-D2048-E0_01- 0%[ ] 10.79M 4.73MB/s 

5-L12-D2048-E0_01-n 0%[ ] 14.65M 5.83MB/s 

-L12-D2048-E0_01-ne 1%[ ] 18.58M 6.77MB/s 

L12-D2048-E0_01-neo 1%[ ] 22.33M 7.50MB/s 

12-D2048-E0_01-neox 1%[ ] 26.14M 8.15MB/s eta 3m 18s 

2-D2048-E0_01-neox- 1%[ ] 29.93M 8.71MB/s eta 3m 18s 

-D2048-E0_01-neox-v 2%[ ] 33.83M 9.22MB/s eta 3m 18s 

D2048-E0_01-neox-v5 2%[ ] 37.64M 9.65MB/s eta 3m 18s 

2048-E0_01-neox-v5b 2%[ ] 41.36M 10.0MB/s eta 3m 18s 

048-E0_01-neox-v5ba 2%[ ] 45.08M 10.3MB/s eta 2m 34s 

48-E0_01-neox-v5bas 2%[ ] 48.83M 10.6MB/s eta 2m 34s 

8-E0_01-neox-v5base 3%[ ] 52.09M 11.3MB/s eta 2m 34s 

-E0_01-neox-v5base- 3%[ ] 55.98M 12.2MB/s eta 2m 34s 

E0_01-neox-v5base-i 3%[ ] 59.86M 13.0MB/s eta 2m 34s 

0_01-neox-v5base-in 3%[ ] 63.59M 13.8MB/s eta 2m 17s 

_01-neox-v5base-ini 4%[ ] 67.34M 14.5MB/s eta 2m 17s 

01-neox-v5base-init 4%[ ] 71.14M 15.2MB/s eta 2m 17s 

1-neox-v5base-init. 4%[ ] 75.03M 15.9MB/s eta 2m 17s 

-neox-v5base-init.p 4%[ ] 78.84M 16.3MB/s eta 2m 17s 

neox-v5base-init.pt 5%[> ] 82.73M 16.3MB/s eta 2m 6s 

eox-v5base-init.pth 5%[> ] 86.64M 16.4MB/s eta 2m 6s 

ox-v5base-init.pth 5%[> ] 90.37M 16.4MB/s eta 2m 6s 

x-v5base-init.pth 5%[> ] 94.28M 16.4MB/s eta 2m 6s 

-v5base-init.pth 5%[> ] 98.20M 16.4MB/s eta 2m 6s 

v5base-init.pth 6%[> ] 102.11M 16.5MB/s eta 1m 58s 

5base-init.pth 6%[> ] 105.84M 16.4MB/s eta 1m 58s 

base-init.pth 6%[> ] 109.56M 16.4MB/s eta 1m 58s 

ase-init.pth 6%[> ] 113.39M 16.4MB/s eta 1m 58s 

se-init.pth 7%[> ] 117.14M 16.4MB/s eta 1m 58s 

e-init.pth 7%[> ] 120.83M 16.4MB/s eta 1m 53s 

-init.pth 7%[> ] 124.67M 16.4MB/s eta 1m 53s 

init.pth 7%[> ] 128.54M 16.6MB/s eta 1m 53s 

nit.pth 8%[> ] 132.43M 16.6MB/s eta 1m 53s 

it.pth 8%[> ] 136.18M 16.5MB/s eta 1m 53s 

t.pth 8%[> ] 139.95M 16.5MB/s eta 1m 49s 

.pth 8%[> ] 143.79M 16.6MB/s eta 1m 49s 

pth 8%[> ] 147.70M 16.6MB/s eta 1m 49s 

th 9%[> ] 151.56M 16.6MB/s eta 1m 49s 

h 9%[> ] 155.33M 16.6MB/s eta 1m 49s 

 9%[> ] 159.15M 16.6MB/s eta 1m 45s 

 v 9%[> ] 163.01M 16.5MB/s eta 1m 45s 

 v5 10%[=> ] 166.86M 16.6MB/s eta 1m 45s 

 v5- 10%[=> ] 170.62M 16.5MB/s eta 1m 45s 

 v5-L 10%[=> ] 174.36M 16.5MB/s eta 1m 45s 

 v5-L1 10%[=> ] 178.23M 16.5MB/s eta 1m 42s 

 v5-L12 11%[=> ] 182.08M 16.5MB/s eta 1m 42s 

 v5-L12- 11%[=> ] 185.81M 16.5MB/s eta 1m 42s 

 v5-L12-D 11%[=> ] 189.61M 16.5MB/s eta 1m 42s 

 v5-L12-D2 11%[=> ] 193.42M 16.5MB/s eta 1m 42s 

 v5-L12-D20 12%[=> ] 197.18M 16.5MB/s eta 1m 40s 

 v5-L12-D204 12%[=> ] 201.04M 16.5MB/s eta 1m 40s 

 v5-L12-D2048 12%[=> ] 203.64M 16.3MB/s eta 1m 40s 

 v5-L12-D2048- 12%[=> ] 207.51M 16.3MB/s eta 1m 40s 

 v5-L12-D2048-E 12%[=> ] 211.31M 16.3MB/s eta 1m 40s 

 v5-L12-D2048-E0 13%[=> ] 215.08M 16.3MB/s eta 98s 

 v5-L12-D2048-E0_ 13%[=> ] 218.98M 16.3MB/s eta 98s 

 v5-L12-D2048-E0_0 13%[=> ] 222.92M 16.3MB/s eta 98s 

 v5-L12-D2048-E0_01 13%[=> ] 226.67M 16.3MB/s eta 98s 

v5-L12-D2048-E0_01- 14%[=> ] 230.58M 16.3MB/s eta 98s 

5-L12-D2048-E0_01-n 14%[=> ] 234.40M 16.3MB/s eta 95s 

-L12-D2048-E0_01-ne 14%[=> ] 238.25M 16.3MB/s eta 95s 

L12-D2048-E0_01-neo 14%[=> ] 242.03M 16.3MB/s eta 95s 

12-D2048-E0_01-neox 14%[=> ] 245.90M 16.3MB/s eta 95s 

2-D2048-E0_01-neox- 15%[==> ] 249.78M 16.3MB/s eta 95s 

-D2048-E0_01-neox-v 15%[==> ] 253.64M 16.3MB/s eta 93s 

D2048-E0_01-neox-v5 15%[==> ] 257.47M 16.3MB/s eta 93s 

2048-E0_01-neox-v5b 15%[==> ] 261.33M 16.4MB/s eta 93s 

048-E0_01-neox-v5ba 16%[==> ] 265.09M 16.3MB/s eta 93s 

48-E0_01-neox-v5bas 16%[==> ] 268.98M 16.4MB/s eta 93s 

8-E0_01-neox-v5base 16%[==> ] 272.84M 16.4MB/s eta 91s 

-E0_01-neox-v5base- 16%[==> ] 276.65M 16.4MB/s eta 91s 

E0_01-neox-v5base-i 17%[==> ] 280.51M 16.6MB/s eta 91s 

0_01-neox-v5base-in 17%[==> ] 284.26M 16.6MB/s eta 91s 

_01-neox-v5base-ini 17%[==> ] 288.12M 16.6MB/s eta 91s 

01-neox-v5base-init 17%[==> ] 291.93M 16.6MB/s eta 90s 

1-neox-v5base-init. 18%[==> ] 295.78M 16.6MB/s eta 90s 

-neox-v5base-init.p 18%[==> ] 299.54M 16.6MB/s eta 90s 

neox-v5base-init.pt 18%[==> ] 303.33M 16.6MB/s eta 90s 

eox-v5base-init.pth 18%[==> ] 307.12M 16.6MB/s eta 90s 

ox-v5base-init.pth 18%[==> ] 310.92M 16.5MB/s eta 88s 

x-v5base-init.pth 19%[==> ] 313.47M 16.3MB/s eta 88s 

-v5base-init.pth 19%[==> ] 317.25M 16.3MB/s eta 88s 

v5base-init.pth 19%[==> ] 319.84M 16.0MB/s eta 88s 

5base-init.pth 19%[==> ] 323.73M 16.0MB/s eta 88s 

base-init.pth 19%[==> ] 327.47M 16.0MB/s eta 87s 

ase-init.pth 20%[===> ] 331.25M 16.0MB/s eta 87s 

se-init.pth 20%[===> ] 335.01M 16.0MB/s eta 87s 

e-init.pth 20%[===> ] 338.84M 16.0MB/s eta 87s 

-init.pth 20%[===> ] 342.62M 16.0MB/s eta 87s 

init.pth 21%[===> ] 346.36M 15.9MB/s eta 85s 

nit.pth 21%[===> ] 350.23M 15.9MB/s eta 85s 

it.pth 21%[===> ] 354.03M 15.9MB/s eta 85s 

t.pth 21%[===> ] 357.90M 16.0MB/s eta 85s 

.pth 22%[===> ] 361.73M 16.0MB/s eta 85s 

pth 22%[===> ] 365.59M 16.0MB/s eta 84s 

th 22%[===> ] 369.33M 15.9MB/s eta 84s 

h 22%[===> ] 373.08M 15.9MB/s eta 84s 

 22%[===> ] 376.92M 16.0MB/s eta 84s 

 v 23%[===> ] 380.76M 16.0MB/s eta 84s 

 v5 23%[===> ] 384.56M 16.0MB/s eta 82s 

 v5- 23%[===> ] 388.29M 16.2MB/s eta 82s 

 v5-L 23%[===> ] 392.11M 16.2MB/s eta 82s 

 v5-L1 24%[===> ] 395.89M 16.5MB/s eta 82s 

 v5-L12 24%[===> ] 399.76M 16.5MB/s eta 82s 

 v5-L12- 24%[===> ] 403.62M 16.4MB/s eta 81s 

 v5-L12-D 24%[===> ] 407.48M 16.5MB/s eta 81s 

 v5-L12-D2 25%[====> ] 411.36M 16.5MB/s eta 81s 

 v5-L12-D20 25%[====> ] 415.09M 16.5MB/s eta 81s 

 v5-L12-D204 25%[====> ] 418.97M 16.5MB/s eta 81s 

 v5-L12-D2048 25%[====> ] 422.73M 16.5MB/s eta 79s 

 v5-L12-D2048- 25%[====> ] 426.48M 16.5MB/s eta 79s 

 v5-L12-D2048-E 26%[====> ] 430.20M 16.5MB/s eta 79s 

 v5-L12-D2048-E0 26%[====> ] 434.01M 16.5MB/s eta 79s 

 v5-L12-D2048-E0_ 26%[====> ] 437.90M 16.5MB/s eta 79s 

 v5-L12-D2048-E0_0 26%[====> ] 441.73M 16.5MB/s eta 78s 

 v5-L12-D2048-E0_01 27%[====> ] 445.64M 16.5MB/s eta 78s 

v5-L12-D2048-E0_01- 27%[====> ] 449.48M 16.5MB/s eta 78s 

5-L12-D2048-E0_01-n 27%[====> ] 453.33M 16.6MB/s eta 78s 

-L12-D2048-E0_01-ne 27%[====> ] 457.20M 16.6MB/s eta 78s 

L12-D2048-E0_01-neo 28%[====> ] 461.09M 16.6MB/s eta 76s 

12-D2048-E0_01-neox 28%[====> ] 464.89M 16.6MB/s eta 76s 

2-D2048-E0_01-neox- 28%[====> ] 468.67M 16.6MB/s eta 76s 

-D2048-E0_01-neox-v 28%[====> ] 472.47M 16.6MB/s eta 76s 

D2048-E0_01-neox-v5 29%[====> ] 476.37M 16.6MB/s eta 76s 

2048-E0_01-neox-v5b 29%[====> ] 480.20M 16.6MB/s eta 75s 

048-E0_01-neox-v5ba 29%[====> ] 484.06M 16.6MB/s eta 75s 

48-E0_01-neox-v5bas 29%[====> ] 487.95M 16.6MB/s eta 75s 

8-E0_01-neox-v5base 29%[====> ] 491.75M 16.6MB/s eta 75s 

-E0_01-neox-v5base- 30%[=====> ] 495.61M 16.6MB/s eta 75s 

E0_01-neox-v5base-i 30%[=====> ] 499.36M 16.6MB/s eta 73s 

0_01-neox-v5base-in 30%[=====> ] 503.23M 16.6MB/s eta 73s 

_01-neox-v5base-ini 30%[=====> ] 507.04M 16.6MB/s eta 73s 

01-neox-v5base-init 31%[=====> ] 510.78M 16.6MB/s eta 73s 

1-neox-v5base-init. 31%[=====> ] 514.51M 16.5MB/s eta 73s 

-neox-v5base-init.p 31%[=====> ] 518.26M 16.5MB/s eta 72s 

neox-v5base-init.pt 31%[=====> ] 522.08M 16.5MB/s eta 72s 

eox-v5base-init.pth 32%[=====> ] 525.81M 16.5MB/s eta 72s 

ox-v5base-init.pth 32%[=====> ] 529.58M 16.5MB/s eta 72s 

x-v5base-init.pth 32%[=====> ] 533.40M 16.5MB/s eta 72s 

-v5base-init.pth 32%[=====> ] 537.23M 16.4MB/s eta 71s 

v5base-init.pth 32%[=====> ] 540.59M 16.4MB/s eta 71s 

5base-init.pth 33%[=====> ] 544.48M 16.4MB/s eta 71s 

base-init.pth 33%[=====> ] 548.28M 16.4MB/s eta 71s 

ase-init.pth 33%[=====> ] 552.15M 16.4MB/s eta 71s 

se-init.pth 33%[=====> ] 555.98M 16.4MB/s eta 69s 

e-init.pth 34%[=====> ] 559.87M 16.4MB/s eta 69s 

-init.pth 34%[=====> ] 563.75M 16.4MB/s eta 69s 

init.pth 34%[=====> ] 567.64M 16.4MB/s eta 69s 

nit.pth 34%[=====> ] 571.48M 16.4MB/s eta 69s 


























































































































































































































































































































































































































































































































































































2023-10-09 13:48:06 (16.0 MB/s) - ‘v5-L12-D2048-E0_01-neox-v5base-init.pth’ saved [1721189797/1721189797]



In [4]:
# Let's build the merged model: run model_merge.py in "layer_expansion" mode with the
# L12 init model as the baseline and the L6 enwiki-4k-p1 model as the source,
# writing the result to "{FILENAME_PREFIX}-layer-expansion-p1.pth".
# NOTE(review): the recorded run of this cell failed with
# "Unknown merge mode: layer_expansion" - confirm the mode name against model_merge.py.
!cd "{TRAINER_DIR}" && \
 python3 model_merge.py \
 --merge-mode="layer_expansion" \
 "{PROJECT_DIR}/model/v5-L12-D2048-E0_01-neox-v5base-init.pth" \
 "{PROJECT_DIR}/model/v5-L6-D2048-E0_01-enwiki-4k-p1.pth" \
 "{PROJECT_DIR}/model/{FILENAME_PREFIX}-layer-expansion-p1.pth"

---- Merging model ----
Baseline model path: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/model/v5-L12-D2048-E0_01-neox-v5base-init.pth
Source model path: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/model/v5-L6-D2048-E0_01-enwiki-4k-p1.pth
Output model path: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/model/v5-L6+6-D2048-E0_01-layer-expansion-p1.pth
Merge mode: layer_expansion
---- ----- ----


Merging blocks.0.att.gate.weight ...
Traceback (most recent call last):
 File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/model_merge.py", line 143, in 
 main()
 File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/model_merge.py", line 133, in main
 model_merge(
 File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/model_merge.py", line 102, in model_merge
 raise Exception(f"Unknown merge mode: {merge_mode}")
Exception: Unknown merge mode: layer_expansion


In [5]:
# Start the part 2 training run (enwiki-4k-part2.yaml, ctx_len=4096), loading the
# "-layer-expansion-p1" model and saving checkpoints under "-layer-expansion-p2/".
# Paths starting with "../" are relative to TRAINER_DIR, which the command cd's into first.
# NOTE(review): the wandb run name says "Overwrite Merge" while the files are named
# "layer-expansion" - confirm which label is intended.
!cd "{TRAINER_DIR}" && \
 export WANDB_MODE="{WANDB_MODE}" && \
 python3 lightning_trainer.py fit \
 -c "{NOTEBOOK_DIR}/enwiki-4k-part2.yaml" \
 --trainer.logger.init_args.name="{WANDB_PREFIX} - Overwrite Merge Part 2 (train-ctx=4k, {DEEPSPEED_STRAT})" \
 --trainer.strategy="{DEEPSPEED_STRAT}" \
 --trainer.devices="{GPU_DEVICES}" \
 --trainer.callbacks.init_args.dirpath="../checkpoint/{FILENAME_PREFIX}-layer-expansion-p2/" \
 --model.load_model="../model/{FILENAME_PREFIX}-layer-expansion-p1.pth" \
 --model.ctx_len=4096 \
 --model.bptt_learning_range=1

Saving the dataset (0/2 shards): 0%| | 0/27202 [00:00
 cli_main()
 File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py", line 253, in cli_main
 LightningCLI(
 File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py", line 350, in __init__
 self.instantiate_classes()
 File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py", line 499, in instantiate_classes
 self.config_init = self.parser.instantiate_classes(self.config)
 File "/usr/local/lib/python3.10/dist-packages/jsonargparse/_deprecated.py", line 139, in patched_instantiate_classes
 cfg = self._unpatched_instantiate_classes(cfg, **kwargs)
 File "/usr/local/lib/python3.10/dist-packages/jsonargparse/_core.py", line 1130, in instantiate_classes
 cfg[subcommand] = subparser.instantiate_classes(cfg[subcommand], instantiate_groups=instantiate_groups)
 File "/usr/local/lib/python3.10/dist-packages/jsonargparse/_deprecated.py", line 139, in patched_inst

[34m[1mwandb[0m: 🚀 View run [33m[Multi-size] v5-L6+6-D2048-E0.01 - Overwrite Merge Part 2 (train-ctx=4k, deepspeed_stage_2_offload)[0m at: [34m[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/runs/tl8hlm81[0m
[34m[1mwandb[0m: ️⚡ View job at [34m[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjk0OTk4MDcy/version_details/v12[0m
[34m[1mwandb[0m: Synced 5 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)
[34m[1mwandb[0m: Find logs at: [35m[1m./wandb/run-20231009_134831-tl8hlm81/logs[0m


In [7]:
# Let's export the part 2 model from its last checkpoint ("bf16" is passed as the
# output dtype argument), then list the exported file to confirm it was written
!cd "{TRAINER_DIR}" && \
 python3 export_checkpoint.py "../checkpoint/{FILENAME_PREFIX}-layer-expansion-p2/last.ckpt" "../model/{FILENAME_PREFIX}-layer-expansion-p2.pth" "bf16"
!cd "{TRAINER_DIR}" && ls -alh "../model/{FILENAME_PREFIX}-layer-expansion-p2.pth"

[2023-10-09 13:48:46,300] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Traceback (most recent call last):
 File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py", line 651, in 
 convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, output_file, save_dtype=args.dtype)
 File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py", line 542, in convert_zero_checkpoint_to_fp32_state_dict
 state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
 File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py", line 516, in get_fp32_state_dict_from_zero_checkpoint
 raise ValueError(f"Unable to find 'latest' file at {latest_path}")
ValueError: Unable to find 'latest' file at ../checkpoint/v5-L6+6-D2048-E0_01-layer-expansion-p2/last.ckpt/latest


ls: cannot access '../model/v5-L6+6-D2048-E0_01-layer-expansion-p2.pth': No such file or directory


In [8]:
# Let's do a quick dragon prompt validation on the exported part 2 model
!cd "{INFERENCE_DIR}" && \
 python3 dragon_test.py "../model/{FILENAME_PREFIX}-layer-expansion-p2.pth" "cuda fp32"

[2023-10-09 13:48:49,874] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
Traceback (most recent call last):
 File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/dragon_test.py", line 52, in 
 model = SimpleRWKV(MODEL_PATH, device=DEVICE)
 File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py", line 1420, in __init__
 self.model = RWKV(**model_config)
 File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py", line 566, in __init__
 raise ValueError(f"load_model file '{load_model}' does not exist")
ValueError: load_model file '../model/v5-L6+6-D2048-E0_01-layer-expansion-p2.pth' does not exist


## Enwiki Stage 3 : Baseline training

In [9]:
# Start the part 3 training run (enwiki-4k-part3.yaml, ctx_len=4096), loading the
# exported "-layer-expansion-p2" model and saving checkpoints under "-baseline-p3/".
# Paths starting with "../" are relative to TRAINER_DIR, which the command cd's into first.
# NOTE(review): the wandb run name says "Overwrite Merge" while the checkpoint
# directory is named "baseline" - confirm which label is intended.
!cd "{TRAINER_DIR}" && \
 export WANDB_MODE="{WANDB_MODE}" && \
 python3 lightning_trainer.py fit \
 -c "{NOTEBOOK_DIR}/enwiki-4k-part3.yaml" \
 --trainer.logger.init_args.name="{WANDB_PREFIX} - Overwrite Merge Part 3 (train-ctx=4k, {DEEPSPEED_STRAT})" \
 --trainer.strategy="{DEEPSPEED_STRAT}" \
 --trainer.devices="{GPU_DEVICES}" \
 --trainer.callbacks.init_args.dirpath="../checkpoint/{FILENAME_PREFIX}-baseline-p3/" \
 --model.load_model="../model/{FILENAME_PREFIX}-layer-expansion-p2.pth" \
 --model.ctx_len=4096 \
 --model.bptt_learning_range=1

[2023-10-09 13:48:53,746] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'


 rank_zero_warn(


 rank_zero_warn(f"No seed found, seed set to {seed}")
Global seed set to 4224737379


[34m[1mwandb[0m: Currently logged in as: [33mpicocreator[0m ([33mrwkv-x-dev[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Tracking run with wandb version 0.15.12
[34m[1mwandb[0m: Run data is saved locally in [35m[1m./wandb/run-20231009_134856-e7u5abp0[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33m[Multi-size] v5-L6+6-D2048-E0.01 - Overwrite Merge Part 3 (train-ctx=4k, deepspeed_stage_2_offload)[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/runs/e7u5abp0[0m


Traceback (most recent call last):
 File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py", line 278, in 
 cli_main()
 File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py", line 253, in cli_main
 LightningCLI(
 File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py", line 350, in __init__
 self.instantiate_classes()
 File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py", line 499, in instantiate_classes
 self.config_init = self.parser.instantiate_classes(self.config)
 File "/usr/local/lib/python3.10/dist-packages/jsonargparse/_deprecated.py", line 139, in patched_instantiate_classes
 cfg = self._unpatched_instantiate_classes(cfg, **kwargs)
 File "/usr/local/lib/python3.10/dist-packages/jsonargparse/_core.py", line 1130, in instantiate_classes
 cfg[subcommand] = subparser.instantiate_classes(cfg[subcommand], instantiate_groups=instantiate_groups)
 File "

[34m[1mwandb[0m: 🚀 View run [33m[Multi-size] v5-L6+6-D2048-E0.01 - Overwrite Merge Part 3 (train-ctx=4k, deepspeed_stage_2_offload)[0m at: [34m[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/runs/e7u5abp0[0m
[34m[1mwandb[0m: ️⚡ View job at [34m[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjk0OTk4MDcy/version_details/v12[0m
[34m[1mwandb[0m: Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)
[34m[1mwandb[0m: Find logs at: [35m[1m./wandb/run-20231009_134856-e7u5abp0/logs[0m


In [10]:
# Lets export the model from the checkpoint
!cd "{TRAINER_DIR}" && \
 python3 export_checkpoint.py "../checkpoint/{FILENAME_PREFIX}-baseline-p3/last.ckpt" "../model/{FILENAME_PREFIX}-baseline-p3.pth" "bf16"
!cd "{TRAINER_DIR}" && ls -alh "../model/{FILENAME_PREFIX}-layer-expansion-p3.pth"

[2023-10-09 13:49:06,868] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Traceback (most recent call last):
 File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py", line 651, in 
 convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, output_file, save_dtype=args.dtype)
 File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py", line 542, in convert_zero_checkpoint_to_fp32_state_dict
 state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
 File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py", line 516, in get_fp32_state_dict_from_zero_checkpoint
 raise ValueError(f"Unable to find 'latest' file at {latest_path}")
ValueError: Unable to find 'latest' file at ../checkpoint/v5-L6+6-D2048-E0_01-baseline-p3/last.ckpt/latest


ls: cannot access '../model/v5-L6+6-D2048-E0_01-layer-expansion-p3.pth': No such file or directory


In [11]:
# # Lets do a quick dragon prompt validation
!cd "{INFERENCE_DIR}" && \
 python3 dragon_test.py "../model/{FILENAME_PREFIX}-layer-expansion-p3.pth" "cuda fp32"

[2023-10-09 13:49:10,451] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
Traceback (most recent call last):
 File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/dragon_test.py", line 52, in 
 model = SimpleRWKV(MODEL_PATH, device=DEVICE)
 File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py", line 1420, in __init__
 self.model = RWKV(**model_config)
 File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py", line 566, in __init__
 raise ValueError(f"load_model file '{load_model}' does not exist")
ValueError: load_model file '../model/v5-L6+6-D2048-E0_01-layer-expansion-p3.pth' does not exist
