[WIP] [GHA] experiment/rwkv-x-exp/v5-memory/v5-L96-D1024-E1e-1-ctx4k-part5.ipynb result model (multi-commit f2c69dfb744754fded94d535bba24e09c60ab0794c227b106b0028e21a8955ba)

#90
This view is limited to 50 files because it contains too many changes.  See the raw diff here.
Files changed (50) hide show
  1. .gitattributes +0 -26
  2. experiment/memory-bench/Benchmark-V5-L96.ipynb +0 -0
  3. experiment/memory-bench/Benchmark-V5.ipynb +0 -0
  4. experiment/memory-bench/logs/BaseV5-C-Tune5-16k.csv +2 -2
  5. experiment/memory-bench/logs/BaseV5-C-Tune5-1k.csv +0 -0
  6. experiment/memory-bench/logs/BaseV5-C-Tune5-4k.csv +2 -2
  7. experiment/memory-bench/logs/v5-L6-D1024-E0_1-16k.csv +0 -3
  8. experiment/memory-bench/logs/v5-L6-D1024-E0_1-1k.csv +0 -0
  9. experiment/memory-bench/logs/v5-L6-D1024-E0_1-4k.csv +0 -3
  10. experiment/memory-bench/logs/v5-L6-D2048-E0_1-16k.csv +2 -2
  11. experiment/memory-bench/logs/v5-L6-D2048-E0_1-1k.csv +0 -0
  12. experiment/memory-bench/logs/v5-L6-D2048-E0_1-4k.csv +2 -2
  13. experiment/memory-bench/logs/v5-L6-D4096-E0_1-16k.csv +2 -2
  14. experiment/memory-bench/logs/v5-L6-D4096-E0_1-1k.csv +0 -0
  15. experiment/memory-bench/logs/v5-L6-D4096-E0_1-4k.csv +2 -2
  16. experiment/memory-bench/logs/v5-L96-D1024-E0_1-mem-ctx-8k-16k.csv +0 -3
  17. experiment/memory-bench/logs/v5-L96-D1024-E0_1-mem-ctx-8k-1k.csv +0 -0
  18. experiment/memory-bench/logs/v5-L96-D1024-E0_1-mem-ctx-8k-4k.csv +0 -3
  19. experiment/memory-bench/v5-L6-D1024-E0_1-mem-ctx-8k.pth +0 -3
  20. experiment/memory-bench/v5-L96-D1024-E0_1-mem-ctx-8k.pth +0 -3
  21. experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-E0_01-baseline-p2.pth +0 -3
  22. experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-E0_01-baseline-p3.pth +0 -3
  23. experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-E0_01-enwiki-4k-p1.pth +0 -3
  24. experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-E0_01-neox-v5base-init.pth +0 -3
  25. experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-baseline.ipynb +0 -3
  26. experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-part1.ipynb +0 -3
  27. experiment/rwkv-x-exp/multi-size-train/v5-L6+6-D2048-E0_01-baseline-p3.pth +0 -3
  28. experiment/rwkv-x-exp/multi-size-train/v5-L6+6-D2048-E0_01-layer-expansion-p1.pth +0 -3
  29. experiment/rwkv-x-exp/multi-size-train/v5-L6+6-D2048-E0_01-layer-expansion-p2.pth +0 -3
  30. experiment/rwkv-x-exp/multi-size-train/v5-L6+6-D2048-E0_01-overwrite-naive-p1.pth +0 -3
  31. experiment/rwkv-x-exp/multi-size-train/v5-L6+6-D2048-E0_01-overwrite-naive-p2.pth +0 -3
  32. experiment/rwkv-x-exp/multi-size-train/v5-L6+6-D2048-layer-baseline.ipynb +0 -2461
  33. experiment/rwkv-x-exp/multi-size-train/v5-L6+6-D2048-layer-expansion.ipynb +0 -3
  34. experiment/rwkv-x-exp/multi-size-train/v5-L6+6-D2048-overwrite-naive.ipynb +0 -3
  35. experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-baseline-p2.pth +0 -3
  36. experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-baseline-p3.pth +0 -3
  37. experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-enwiki-4k-p1.pth +0 -3
  38. experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-layer-expansion-a3.pth +0 -3
  39. experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-layer-expansion-b3.pth +0 -3
  40. experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-neox-v5base-init.pth +0 -3
  41. experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-split-2a.pth +0 -3
  42. experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-split-2b.pth +0 -3
  43. experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-split-merge-2m.pth +0 -3
  44. experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-split-merge-p3.pth +0 -3
  45. experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-baseline.ipynb +0 -3
  46. experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-part1.ipynb +0 -3
  47. experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-split-baseline.ipynb +0 -3
  48. experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-split-train.ipynb +0 -3
  49. experiment/rwkv-x-exp/v5-memory/v5-L96-D1024-E0_1-mem-ctx-8k.pth +0 -3
  50. experiment/rwkv-x-exp/v5-memory/v5-L96-D1024-E1e-1-ctx4k-part5.ipynb +0 -0
.gitattributes CHANGED
@@ -79,29 +79,3 @@ experiment/memory-bench/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-tra
79
  experiment/memory-bench/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/experiment/memory-bench/logs/v5-L6-D2048-E0_1-4k.csv filter=lfs diff=lfs merge=lfs -text
80
  experiment/memory-bench/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/experiment/memory-bench/logs/BaseV5-C-Tune5-4k.csv filter=lfs diff=lfs merge=lfs -text
81
  experiment/memory-bench/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/experiment/memory-bench/logs/BaseV5-C-Tune5-16k.csv filter=lfs diff=lfs merge=lfs -text
82
- experiment/memory-bench/logs/v5-L96-D1024-E0_1-mem-ctx-8k-16k.csv filter=lfs diff=lfs merge=lfs -text
83
- experiment/memory-bench/logs/v5-L96-D1024-E0_1-mem-ctx-8k-4k.csv filter=lfs diff=lfs merge=lfs -text
84
- experiment/rwkv-x-exp/v5-slim-memory/v5-L6-D1024-E1e-1-ctx4k.ipynb filter=lfs diff=lfs merge=lfs -text
85
- experiment/memory-bench/logs/v5-L6-D1024-E0_1-4k.csv filter=lfs diff=lfs merge=lfs -text
86
- experiment/memory-bench/logs/v5-L6-D1024-E0_1-16k.csv filter=lfs diff=lfs merge=lfs -text
87
- experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/part1.ipynb filter=lfs diff=lfs merge=lfs -text
88
- experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/part1.ipynb filter=lfs diff=lfs merge=lfs -text
89
- experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/part2.ipynb filter=lfs diff=lfs merge=lfs -text
90
- experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/part2.ipynb filter=lfs diff=lfs merge=lfs -text
91
- experiment/rwkv-x-exp/v5-r3-memory/L6-D2560-E1e-1-ctx4k/part1.ipynb filter=lfs diff=lfs merge=lfs -text
92
- experiment/rwkv-x-exp/v5-r3-memory/L6-D2560-E1e-1-ctx4k/stage2.ipynb filter=lfs diff=lfs merge=lfs -text
93
- experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/stage2.ipynb filter=lfs diff=lfs merge=lfs -text
94
- experiment/rwkv-x-exp/v5-r3-memory/L6-D2560-E1e-1-ctx4k/stage3.ipynb filter=lfs diff=lfs merge=lfs -text
95
- experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/stage3.ipynb filter=lfs diff=lfs merge=lfs -text
96
- experiment/rwkv-x-exp/v5-r3-memory/L6-D2560-E1e-1-ctx4k/stage4.ipynb filter=lfs diff=lfs merge=lfs -text
97
- experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/stage4.ipynb filter=lfs diff=lfs merge=lfs -text
98
- experiment/rwkv-x-exp/v5-r3-memory/L6-D2560-E1e-1-ctx4k/stage5.ipynb filter=lfs diff=lfs merge=lfs -text
99
- experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/stage5.ipynb filter=lfs diff=lfs merge=lfs -text
100
- experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-part1.ipynb filter=lfs diff=lfs merge=lfs -text
101
- experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-baseline.ipynb filter=lfs diff=lfs merge=lfs -text
102
- experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-part1.ipynb filter=lfs diff=lfs merge=lfs -text
103
- experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-baseline.ipynb filter=lfs diff=lfs merge=lfs -text
104
- experiment/rwkv-x-exp/multi-size-train/v5-L6+6-D2048-overwrite-naive.ipynb filter=lfs diff=lfs merge=lfs -text
105
- experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-split-train.ipynb filter=lfs diff=lfs merge=lfs -text
106
- experiment/rwkv-x-exp/multi-size-train/v5-L6+6-D2048-layer-expansion.ipynb filter=lfs diff=lfs merge=lfs -text
107
- experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-split-baseline.ipynb filter=lfs diff=lfs merge=lfs -text
 
79
  experiment/memory-bench/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/experiment/memory-bench/logs/v5-L6-D2048-E0_1-4k.csv filter=lfs diff=lfs merge=lfs -text
80
  experiment/memory-bench/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/experiment/memory-bench/logs/BaseV5-C-Tune5-4k.csv filter=lfs diff=lfs merge=lfs -text
81
  experiment/memory-bench/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/experiment/memory-bench/logs/BaseV5-C-Tune5-16k.csv filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
experiment/memory-bench/Benchmark-V5-L96.ipynb DELETED
The diff for this file is too large to render. See raw diff
 
experiment/memory-bench/Benchmark-V5.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
experiment/memory-bench/logs/BaseV5-C-Tune5-16k.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3b131e25f5a022b59f3a80de7054ed0fc02d23ba0c88109799a463362bcd091d
3
- size 118790470
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f8b28de9a89121cbc5dba89c2bdefa65144edd20acc723e2d3e1500e06dc695
3
+ size 118790922
experiment/memory-bench/logs/BaseV5-C-Tune5-1k.csv CHANGED
The diff for this file is too large to render. See raw diff
 
experiment/memory-bench/logs/BaseV5-C-Tune5-4k.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d5b6dfd18ae9886dfef7b40490e01709bed3a6fb6ae80c1b71d4e579fe33bec3
3
- size 14197389
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:561c2d53eefbb5418752b1822f175164fc7469fb393acdbe5e92094c088e5f9f
3
+ size 14197495
experiment/memory-bench/logs/v5-L6-D1024-E0_1-16k.csv DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ede23f588af24bffdae0758be980b5ff8e0add92cb4973d6c4e0492842db60aa
3
- size 118299852
 
 
 
 
experiment/memory-bench/logs/v5-L6-D1024-E0_1-1k.csv DELETED
The diff for this file is too large to render. See raw diff
 
experiment/memory-bench/logs/v5-L6-D1024-E0_1-4k.csv DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9ff6c289d0138740b732bdace1edd39e8b7443e0d5d6ac756bf18641cdf2462b
3
- size 14155151
 
 
 
 
experiment/memory-bench/logs/v5-L6-D2048-E0_1-16k.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:75d9d09b51300d41bd9f4ffc5aa20e5e8433bf5120e50bcc840f50c55d8b164a
3
- size 118428057
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3a0445a053abfebda5105e4e4d38722a713eb272220b43a9cb509abfbbdbbbc
3
+ size 118434365
experiment/memory-bench/logs/v5-L6-D2048-E0_1-1k.csv CHANGED
The diff for this file is too large to render. See raw diff
 
experiment/memory-bench/logs/v5-L6-D2048-E0_1-4k.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa4f4a3f30a08194d38c0ea2b9f5991cd4bbc6224cc47fd98f067a34b50ffe38
3
- size 14551219
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3c8e0ab7f864eda60a244b3d0499bfb67dbcd3497c577a15cac0e4656a885e4
3
+ size 14550324
experiment/memory-bench/logs/v5-L6-D4096-E0_1-16k.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:16471c2afb16c0a1af294ca554513787e419bdebd0345a073c296d080e100141
3
- size 118660437
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:adb7b2f22dabce4536720add3024c0f45eb04b2506f2f6f7ec02a7166182e05c
3
+ size 118660846
experiment/memory-bench/logs/v5-L6-D4096-E0_1-1k.csv CHANGED
The diff for this file is too large to render. See raw diff
 
experiment/memory-bench/logs/v5-L6-D4096-E0_1-4k.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e2808d046aa2c87afc129face7b54ca66dc5027337164178d1a539a6681732de
3
- size 14457770
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:056044ff2cf4c7f49ba892ff799776f77a9b7cfe08a4b8ddc4e93c81f426ca84
3
+ size 14457759
experiment/memory-bench/logs/v5-L96-D1024-E0_1-mem-ctx-8k-16k.csv DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:dda096ac9d555beaff647faf5ed464cdbf7f3f305b40799ffccb17bd1ab86a65
3
- size 119504606
 
 
 
 
experiment/memory-bench/logs/v5-L96-D1024-E0_1-mem-ctx-8k-1k.csv DELETED
The diff for this file is too large to render. See raw diff
 
experiment/memory-bench/logs/v5-L96-D1024-E0_1-mem-ctx-8k-4k.csv DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:1a55a6859bc6194265096fa53f2a138596a0b7c0c8be327cb55d5534b17ceb16
3
- size 14287660
 
 
 
 
experiment/memory-bench/v5-L6-D1024-E0_1-mem-ctx-8k.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:cefa72363bb7dfddb34723de3848f3cc5116c1ff15481f5562317058324b8bf0
3
- size 369696005
 
 
 
 
experiment/memory-bench/v5-L96-D1024-E0_1-mem-ctx-8k.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d042262601b79b1635bdd82e73c9f26fb35b05d8cded92a03aad5df56944dde
3
- size 2825976699
 
 
 
 
experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-E0_01-baseline-p2.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c447bfd1844d0c3e536fb8824d029fd8b0e334e1368f807a4e85cd7099005130
3
- size 1721187285
 
 
 
 
experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-E0_01-baseline-p3.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:70b243f59685c4df841f16343bc7ff6947a3125cec5dabf9035b28b65c04da0e
3
- size 1721187285
 
 
 
 
experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-E0_01-enwiki-4k-p1.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a57c278ed7e7e2f9d7f0436540674bfa5178adcd04c3154f5d92992e0602c55b
3
- size 1721187621
 
 
 
 
experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-E0_01-neox-v5base-init.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:06105d96413046fce0ec189b9c4685a813cfa7147300851c5d2afc7b5adbcb38
3
- size 1721189797
 
 
 
 
experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-baseline.ipynb DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:31f61ce42e82d9a475446458ed015a190f16dd9b2b17bd67f4feedd9f72750ad
3
- size 16577145
 
 
 
 
experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-part1.ipynb DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b53c27ed2c20b9f1f690647a83c0fbe2ce09594518b9ec557f515a4f8b548f2b
3
- size 15941299
 
 
 
 
experiment/rwkv-x-exp/multi-size-train/v5-L6+6-D2048-E0_01-baseline-p3.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c380bcd4b861a8af263fd56dc6e183b9e06ba0bc8f9895c4dcd8a678b58296e8
3
- size 1721187621
 
 
 
 
experiment/rwkv-x-exp/multi-size-train/v5-L6+6-D2048-E0_01-layer-expansion-p1.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:89f8caf661887bdba1897a10009f033331c552bfb763112e6da1b850d8ec3ff7
3
- size 1721189525
 
 
 
 
experiment/rwkv-x-exp/multi-size-train/v5-L6+6-D2048-E0_01-layer-expansion-p2.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2680e091197e798686c97bdd2af0f6827f2b29c648cc1ae03f67d6f094859618
3
- size 1721189525
 
 
 
 
experiment/rwkv-x-exp/multi-size-train/v5-L6+6-D2048-E0_01-overwrite-naive-p1.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:35a5d7571d90160edc20ce95abfdbcb6109ad47eccdefe8051bd8f15d12bf326
3
- size 1721189525
 
 
 
 
experiment/rwkv-x-exp/multi-size-train/v5-L6+6-D2048-E0_01-overwrite-naive-p2.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f6b50bf05f191da87a6a17072d485d4059a4ded1335605e6b7bb8e9f2648d966
3
- size 1721189525
 
 
 
 
experiment/rwkv-x-exp/multi-size-train/v5-L6+6-D2048-layer-baseline.ipynb DELETED
@@ -1,2461 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "attachments": {},
5
- "cell_type": "markdown",
6
- "id": "d3126ef2",
7
- "metadata": {
8
- "papermill": {
9
- "duration": 0.004879,
10
- "end_time": "2023-10-11T08:02:23.608034",
11
- "exception": false,
12
- "start_time": "2023-10-11T08:02:23.603155",
13
- "status": "completed"
14
- },
15
- "tags": []
16
- },
17
- "source": [
18
- "# RWKV v5 multi-size training experiment\n",
19
- "\n",
20
- "**Note:** This project assumes you have the rwkv-infctx conda env setup"
21
- ]
22
- },
23
- {
24
- "attachments": {},
25
- "cell_type": "markdown",
26
- "id": "986070aa",
27
- "metadata": {
28
- "papermill": {
29
- "duration": 0.002523,
30
- "end_time": "2023-10-11T08:02:23.613605",
31
- "exception": false,
32
- "start_time": "2023-10-11T08:02:23.611082",
33
- "status": "completed"
34
- },
35
- "tags": []
36
- },
37
- "source": [
38
- "# Basic Setup"
39
- ]
40
- },
41
- {
42
- "cell_type": "code",
43
- "execution_count": 1,
44
- "id": "dc924c7f",
45
- "metadata": {
46
- "execution": {
47
- "iopub.execute_input": "2023-10-11T08:02:23.620990Z",
48
- "iopub.status.busy": "2023-10-11T08:02:23.620432Z",
49
- "iopub.status.idle": "2023-10-11T08:02:24.379549Z",
50
- "shell.execute_reply": "2023-10-11T08:02:24.378580Z"
51
- },
52
- "papermill": {
53
- "duration": 0.765369,
54
- "end_time": "2023-10-11T08:02:24.381741",
55
- "exception": false,
56
- "start_time": "2023-10-11T08:02:23.616372",
57
- "status": "completed"
58
- },
59
- "tags": []
60
- },
61
- "outputs": [],
62
- "source": [
63
- "# First lets setup the various directories, and init the model\n",
64
- "!mkdir -p ../../../../model/\n",
65
- "!mkdir -p ../../../../datapath/\n",
66
- "!mkdir -p ../../../../checkpoint/"
67
- ]
68
- },
69
- {
70
- "cell_type": "code",
71
- "execution_count": 2,
72
- "id": "2bbc32ac",
73
- "metadata": {
74
- "execution": {
75
- "iopub.execute_input": "2023-10-11T08:02:24.389788Z",
76
- "iopub.status.busy": "2023-10-11T08:02:24.389227Z",
77
- "iopub.status.idle": "2023-10-11T08:02:24.398441Z",
78
- "shell.execute_reply": "2023-10-11T08:02:24.397578Z"
79
- },
80
- "papermill": {
81
- "duration": 0.015548,
82
- "end_time": "2023-10-11T08:02:24.400362",
83
- "exception": false,
84
- "start_time": "2023-10-11T08:02:24.384814",
85
- "status": "completed"
86
- },
87
- "tags": []
88
- },
89
- "outputs": [
90
- {
91
- "name": "stdout",
92
- "output_type": "stream",
93
- "text": [
94
- "DEEPSPEED_STRAT: deepspeed_stage_2_offload\n",
95
- "ENABLE_WANDB: True\n",
96
- "GPU_DEVICES: auto\n",
97
- "NOTEBOOK_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train\n",
98
- "INFERENCE_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5\n",
99
- "TRAINER_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5\n",
100
- "PROJECT_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer\n"
101
- ]
102
- }
103
- ],
104
- "source": [
105
- "DEEPSPEED_STRAT=\"deepspeed_stage_2_offload\"\n",
106
- "GPU_DEVICES=\"auto\"\n",
107
- "ENABLE_WANDB=True\n",
108
- "\n",
109
- "EMBED_SCALE=0.01\n",
110
- "EMBED_SCALE_LABEL=str(EMBED_SCALE).replace(\".\", \"_\")\n",
111
- "\n",
112
- "EMBED_SIZE=2048\n",
113
- "\n",
114
- "WANDB_PREFIX=f\"[Multi-size] v5-L6+6-D{EMBED_SIZE}-E{EMBED_SCALE}\"\n",
115
- "FILENAME_PREFIX=f\"v5-L6+6-D{EMBED_SIZE}-E{EMBED_SCALE_LABEL}\"\n",
116
- "\n",
117
- "print(\"DEEPSPEED_STRAT:\", DEEPSPEED_STRAT)\n",
118
- "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n",
119
- "print(\"GPU_DEVICES:\", GPU_DEVICES)\n",
120
- "\n",
121
- "if ENABLE_WANDB:\n",
122
- " WANDB_MODE=\"online\"\n",
123
- "else:\n",
124
- " WANDB_MODE=\"disabled\"\n",
125
- "\n",
126
- "# Computing the notebook, and various paths\n",
127
- "import os\n",
128
- "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n",
129
- "PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"../../../../\"))\n",
130
- "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n",
131
- "INFERENCE_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n",
132
- "\n",
133
- "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n",
134
- "print(\"INFERENCE_DIR:\", INFERENCE_DIR)\n",
135
- "print(\"TRAINER_DIR:\", TRAINER_DIR)\n",
136
- "print(\"PROJECT_DIR:\", PROJECT_DIR)"
137
- ]
138
- },
139
- {
140
- "cell_type": "code",
141
- "execution_count": 3,
142
- "id": "ffa69634",
143
- "metadata": {
144
- "execution": {
145
- "iopub.execute_input": "2023-10-11T08:02:24.408311Z",
146
- "iopub.status.busy": "2023-10-11T08:02:24.407798Z",
147
- "iopub.status.idle": "2023-10-11T08:03:19.634663Z",
148
- "shell.execute_reply": "2023-10-11T08:03:19.633765Z"
149
- },
150
- "papermill": {
151
- "duration": 55.233419,
152
- "end_time": "2023-10-11T08:03:19.636895",
153
- "exception": false,
154
- "start_time": "2023-10-11T08:02:24.403476",
155
- "status": "completed"
156
- },
157
- "tags": []
158
- },
159
- "outputs": [
160
- {
161
- "name": "stdout",
162
- "output_type": "stream",
163
- "text": [
164
- "--2023-10-11 08:02:24-- https://huggingface.co/rwkv-x-dev/rwkv-x-playground/resolve/0600b94a58219f658326b4792ef5cd020e9d1a43/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-split-2a.pth\r\n",
165
- "Resolving huggingface.co (huggingface.co)... 18.154.227.87, 18.154.227.7, 18.154.227.69, ...\r\n",
166
- "Connecting to huggingface.co (huggingface.co)|18.154.227.87|:443... connected.\r\n",
167
- "HTTP request sent, awaiting response... 302 Found\r\n",
168
- "Location: https://cdn-lfs.huggingface.co/repos/2e/f7/2ef78555202aa92abdbdf476ce3d0fd5a8b15f7245edf0b80d4d30572355f30d/2f52085cee9c3db4bb079dc44edf50b0a19c170bd92128e918e6203efef83cea?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27v5-L6-D2048-E0_01-split-2a.pth%3B+filename%3D%22v5-L6-D2048-E0_01-split-2a.pth%22%3B&Expires=1697270544&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTY5NzI3MDU0NH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy8yZS9mNy8yZWY3ODU1NTIwMmFhOTJhYmRiZGY0NzZjZTNkMGZkNWE4YjE1ZjcyNDVlZGYwYjgwZDRkMzA1NzIzNTVmMzBkLzJmNTIwODVjZWU5YzNkYjRiYjA3OWRjNDRlZGY1MGIwYTE5YzE3MGJkOTIxMjhlOTE4ZTYyMDNlZmVmODNjZWE%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=AW451jyDioqxesXvDVp%7EgfYV3uhgFTDwTn3SlZa-gk-yCDb7c-QR44rTm9sWCGSJjaa%7EvJvj9zLGUK7fvbr%7E%7EGQJgL2L%7Es9vkVPg8qs1k%7EtCh-MX%7E45bxo4CapTIo8fx4xLJ738Tks8uzpx3Sy9hWbfuGQFCUwBHzJXG5uGNRzPv87Zdfy4gIIAt0NytaC3bFmKZl4DbXLF4%7EtVWXED7H3NAlBvGETdhjzK5Qr0FLZB2vqC1LQpPTexdTH-ETkPEIQpXRBV-JctzaKBfI1Da-tGpt4JdPlhyPIu1kaNtX13yTibuBrT-mDOy6OVJZ9Zsj%7EHdVUtDrdp-I01dhylHpQ__&Key-Pair-Id=KVTP0A1DKRTAX [following]\r\n",
169
- "--2023-10-11 08:02:24-- https://cdn-lfs.huggingface.co/repos/2e/f7/2ef78555202aa92abdbdf476ce3d0fd5a8b15f7245edf0b80d4d30572355f30d/2f52085cee9c3db4bb079dc44edf50b0a19c170bd92128e918e6203efef83cea?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27v5-L6-D2048-E0_01-split-2a.pth%3B+filename%3D%22v5-L6-D2048-E0_01-split-2a.pth%22%3B&Expires=1697270544&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTY5NzI3MDU0NH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy8yZS9mNy8yZWY3ODU1NTIwMmFhOTJhYmRiZGY0NzZjZTNkMGZkNWE4YjE1ZjcyNDVlZGYwYjgwZDRkMzA1NzIzNTVmMzBkLzJmNTIwODVjZWU5YzNkYjRiYjA3OWRjNDRlZGY1MGIwYTE5YzE3MGJkOTIxMjhlOTE4ZTYyMDNlZmVmODNjZWE%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=AW451jyDioqxesXvDVp%7EgfYV3uhgFTDwTn3SlZa-gk-yCDb7c-QR44rTm9sWCGSJjaa%7EvJvj9zLGUK7fvbr%7E%7EGQJgL2L%7Es9vkVPg8qs1k%7EtCh-MX%7E45bxo4CapTIo8fx4xLJ738Tks8uzpx3Sy9hWbfuGQFCUwBHzJXG5uGNRzPv87Zdfy4gIIAt0NytaC3bFmKZl4DbXLF4%7EtVWXED7H3NAlBvGETdhjzK5Qr0FLZB2vqC1LQpPTexdTH-ETkPEIQpXRBV-JctzaKBfI1Da-tGpt4JdPlhyPIu1kaNtX13yTibuBrT-mDOy6OVJZ9Zsj%7EHdVUtDrdp-I01dhylHpQ__&Key-Pair-Id=KVTP0A1DKRTAX\r\n",
170
- "Resolving cdn-lfs.huggingface.co (cdn-lfs.huggingface.co)... 3.162.112.69, 3.162.112.2, 3.162.112.100, ...\r\n",
171
- "Connecting to cdn-lfs.huggingface.co (cdn-lfs.huggingface.co)|3.162.112.69|:443... connected.\r\n",
172
- "HTTP request sent, awaiting response... "
173
- ]
174
- },
175
- {
176
- "name": "stdout",
177
- "output_type": "stream",
178
- "text": [
179
- "200 OK\r\n",
180
- "Length: 1066536657 (1017M) [binary/octet-stream]\r\n",
181
- "Saving to: ‘v5-L6-D2048-E0_01-split-2a.pth’\r\n",
182
- "\r\n",
183
- "\r",
184
- " v5-L6-D20 0%[ ] 0 --.-KB/s "
185
- ]
186
- },
187
- {
188
- "name": "stdout",
189
- "output_type": "stream",
190
- "text": [
191
- "\r",
192
- " v5-L6-D204 1%[ ] 15.26M 42.9MB/s "
193
- ]
194
- },
195
- {
196
- "name": "stdout",
197
- "output_type": "stream",
198
- "text": [
199
- "\r",
200
- " v5-L6-D2048 3%[ ] 30.52M 47.7MB/s "
201
- ]
202
- },
203
- {
204
- "name": "stdout",
205
- "output_type": "stream",
206
- "text": [
207
- "\r",
208
- " v5-L6-D2048- 4%[ ] 45.26M 51.5MB/s "
209
- ]
210
- },
211
- {
212
- "name": "stdout",
213
- "output_type": "stream",
214
- "text": [
215
- "\r",
216
- " v5-L6-D2048-E 5%[> ] 59.20M 52.1MB/s "
217
- ]
218
- },
219
- {
220
- "name": "stdout",
221
- "output_type": "stream",
222
- "text": [
223
- "\r",
224
- " v5-L6-D2048-E0 6%[> ] 65.20M 48.8MB/s "
225
- ]
226
- },
227
- {
228
- "name": "stdout",
229
- "output_type": "stream",
230
- "text": [
231
- "\r",
232
- " v5-L6-D2048-E0_ 7%[> ] 76.29M 44.4MB/s "
233
- ]
234
- },
235
- {
236
- "name": "stdout",
237
- "output_type": "stream",
238
- "text": [
239
- "\r",
240
- " v5-L6-D2048-E0_0 8%[> ] 91.03M 47.2MB/s "
241
- ]
242
- },
243
- {
244
- "name": "stdout",
245
- "output_type": "stream",
246
- "text": [
247
- "\r",
248
- " v5-L6-D2048-E0_01 9%[> ] 91.55M 43.0MB/s "
249
- ]
250
- },
251
- {
252
- "name": "stdout",
253
- "output_type": "stream",
254
- "text": [
255
- "\r",
256
- " v5-L6-D2048-E0_01- 10%[=> ] 106.81M 43.2MB/s "
257
- ]
258
- },
259
- {
260
- "name": "stdout",
261
- "output_type": "stream",
262
- "text": [
263
- "\r",
264
- "v5-L6-D2048-E0_01-s 11%[=> ] 120.25M 43.9MB/s "
265
- ]
266
- },
267
- {
268
- "name": "stdout",
269
- "output_type": "stream",
270
- "text": [
271
- "\r",
272
- "5-L6-D2048-E0_01-sp 12%[=> ] 122.07M 41.4MB/s "
273
- ]
274
- },
275
- {
276
- "name": "stdout",
277
- "output_type": "stream",
278
- "text": [
279
- "\r",
280
- "-L6-D2048-E0_01-spl 13%[=> ] 136.81M 42.5MB/s eta 21s "
281
- ]
282
- },
283
- {
284
- "name": "stdout",
285
- "output_type": "stream",
286
- "text": [
287
- "\r",
288
- "L6-D2048-E0_01-spli 14%[=> ] 152.07M 42.8MB/s eta 21s "
289
- ]
290
- },
291
- {
292
- "name": "stdout",
293
- "output_type": "stream",
294
- "text": [
295
- "\r",
296
- "6-D2048-E0_01-split 15%[==> ] 152.72M 40.5MB/s eta 21s "
297
- ]
298
- },
299
- {
300
- "name": "stdout",
301
- "output_type": "stream",
302
- "text": [
303
- "\r",
304
- "-D2048-E0_01-split- 16%[==> ] 167.85M 41.5MB/s eta 21s "
305
- ]
306
- },
307
- {
308
- "name": "stdout",
309
- "output_type": "stream",
310
- "text": [
311
- "\r",
312
- "D2048-E0_01-split-2 18%[==> ] 183.10M 43.0MB/s eta 19s "
313
- ]
314
- },
315
- {
316
- "name": "stdout",
317
- "output_type": "stream",
318
- "text": [
319
- "\r",
320
- "2048-E0_01-split-2a 19%[==> ] 198.36M 43.4MB/s eta 19s "
321
- ]
322
- },
323
- {
324
- "name": "stdout",
325
- "output_type": "stream",
326
- "text": [
327
- "\r",
328
- "048-E0_01-split-2a. 20%[===> ] 213.11M 44.1MB/s eta 19s "
329
- ]
330
- },
331
- {
332
- "name": "stdout",
333
- "output_type": "stream",
334
- "text": [
335
- "\r",
336
- "48-E0_01-split-2a.p 22%[===> ] 228.36M 43.3MB/s eta 19s "
337
- ]
338
- },
339
- {
340
- "name": "stdout",
341
- "output_type": "stream",
342
- "text": [
343
- "\r",
344
- "8-E0_01-split-2a.pt 22%[===> ] 228.87M 41.1MB/s eta 18s "
345
- ]
346
- },
347
- {
348
- "name": "stdout",
349
- "output_type": "stream",
350
- "text": [
351
- "\r",
352
- "-E0_01-split-2a.pth 24%[===> ] 244.13M 41.0MB/s eta 18s "
353
- ]
354
- },
355
- {
356
- "name": "stdout",
357
- "output_type": "stream",
358
- "text": [
359
- "\r",
360
- "E0_01-split-2a.pth 25%[====> ] 259.40M 42.4MB/s eta 18s "
361
- ]
362
- },
363
- {
364
- "name": "stdout",
365
- "output_type": "stream",
366
- "text": [
367
- "\r",
368
- "0_01-split-2a.pth 26%[====> ] 272.83M 40.4MB/s eta 18s "
369
- ]
370
- },
371
- {
372
- "name": "stdout",
373
- "output_type": "stream",
374
- "text": [
375
- "\r",
376
- "_01-split-2a.pth 28%[====> ] 289.40M 41.4MB/s eta 18s "
377
- ]
378
- },
379
- {
380
- "name": "stdout",
381
- "output_type": "stream",
382
- "text": [
383
- "\r",
384
- "01-split-2a.pth 28%[====> ] 289.92M 37.9MB/s eta 18s "
385
- ]
386
- },
387
- {
388
- "name": "stdout",
389
- "output_type": "stream",
390
- "text": [
391
- "\r",
392
- "1-split-2a.pth 29%[====> ] 304.66M 36.1MB/s eta 19s "
393
- ]
394
- },
395
- {
396
- "name": "stdout",
397
- "output_type": "stream",
398
- "text": [
399
- "\r",
400
- "-split-2a.pth 30%[=====> ] 305.18M 33.4MB/s eta 19s "
401
- ]
402
- },
403
- {
404
- "name": "stdout",
405
- "output_type": "stream",
406
- "text": [
407
- "\r",
408
- "split-2a.pth 31%[=====> ] 318.60M 33.2MB/s eta 19s "
409
- ]
410
- },
411
- {
412
- "name": "stdout",
413
- "output_type": "stream",
414
- "text": [
415
- "\r",
416
- "plit-2a.pth 31%[=====> ] 320.29M 33.3MB/s eta 19s "
417
- ]
418
- },
419
- {
420
- "name": "stdout",
421
- "output_type": "stream",
422
- "text": [
423
- "\r",
424
- "lit-2a.pth 31%[=====> ] 320.57M 30.7MB/s eta 19s "
425
- ]
426
- },
427
- {
428
- "name": "stdout",
429
- "output_type": "stream",
430
- "text": [
431
- "\r",
432
- "it-2a.pth 32%[=====> ] 335.18M 30.1MB/s eta 19s "
433
- ]
434
- },
435
- {
436
- "name": "stdout",
437
- "output_type": "stream",
438
- "text": [
439
- "\r",
440
- "t-2a.pth 33%[=====> ] 345.53M 31.2MB/s eta 19s "
441
- ]
442
- },
443
- {
444
- "name": "stdout",
445
- "output_type": "stream",
446
- "text": [
447
- "\r",
448
- "-2a.pth 34%[=====> ] 350.82M 29.7MB/s eta 19s "
449
- ]
450
- },
451
- {
452
- "name": "stdout",
453
- "output_type": "stream",
454
- "text": [
455
- "\r",
456
- "2a.pth 35%[======> ] 360.98M 31.3MB/s eta 19s "
457
- ]
458
- },
459
- {
460
- "name": "stdout",
461
- "output_type": "stream",
462
- "text": [
463
- "\r",
464
- "a.pth 36%[======> ] 366.20M 29.6MB/s eta 19s "
465
- ]
466
- },
467
- {
468
- "name": "stdout",
469
- "output_type": "stream",
470
- "text": [
471
- "\r",
472
- ".pth 37%[======> ] 380.96M 30.8MB/s eta 17s "
473
- ]
474
- },
475
- {
476
- "name": "stdout",
477
- "output_type": "stream",
478
- "text": [
479
- "\r",
480
- "pth 38%[======> ] 392.79M 32.0MB/s eta 17s "
481
- ]
482
- },
483
- {
484
- "name": "stdout",
485
- "output_type": "stream",
486
- "text": [
487
- "\r",
488
- "th 39%[======> ] 396.73M 29.1MB/s eta 17s "
489
- ]
490
- },
491
- {
492
- "name": "stdout",
493
- "output_type": "stream",
494
- "text": [
495
- "\r",
496
- "h 40%[=======> ] 411.99M 29.1MB/s eta 17s "
497
- ]
498
- },
499
- {
500
- "name": "stdout",
501
- "output_type": "stream",
502
- "text": [
503
- "\r",
504
- " 41%[=======> ] 426.73M 28.7MB/s eta 16s "
505
- ]
506
- },
507
- {
508
- "name": "stdout",
509
- "output_type": "stream",
510
- "text": [
511
- "\r",
512
- " v 42%[=======> ] 427.25M 29.1MB/s eta 16s "
513
- ]
514
- },
515
- {
516
- "name": "stdout",
517
- "output_type": "stream",
518
- "text": [
519
- "\r",
520
- " v5 42%[=======> ] 435.25M 27.9MB/s eta 16s "
521
- ]
522
- },
523
- {
524
- "name": "stdout",
525
- "output_type": "stream",
526
- "text": [
527
- "\r",
528
- " v5- 43%[=======> ] 438.04M 28.2MB/s eta 16s "
529
- ]
530
- },
531
- {
532
- "name": "stdout",
533
- "output_type": "stream",
534
- "text": [
535
- "\r",
536
- " v5-L 43%[=======> ] 442.05M 29.7MB/s eta 16s "
537
- ]
538
- },
539
- {
540
- "name": "stdout",
541
- "output_type": "stream",
542
- "text": [
543
- "\r",
544
- " v5-L6 43%[=======> ] 446.00M 31.1MB/s eta 16s "
545
- ]
546
- },
547
- {
548
- "name": "stdout",
549
- "output_type": "stream",
550
- "text": [
551
- "\r",
552
- " v5-L6- 44%[=======> ] 457.24M 33.9MB/s eta 16s "
553
- ]
554
- },
555
- {
556
- "name": "stdout",
557
- "output_type": "stream",
558
- "text": [
559
- "\r",
560
- " v5-L6-D 45%[========> ] 457.89M 31.6MB/s eta 16s "
561
- ]
562
- },
563
- {
564
- "name": "stdout",
565
- "output_type": "stream",
566
- "text": [
567
- "\r",
568
- " v5-L6-D2 46%[========> ] 473.02M 34.8MB/s eta 16s "
569
- ]
570
- },
571
- {
572
- "name": "stdout",
573
- "output_type": "stream",
574
- "text": [
575
- "\r",
576
- " v5-L6-D20 48%[========> ] 488.28M 34.1MB/s eta 15s "
577
- ]
578
- },
579
- {
580
- "name": "stdout",
581
- "output_type": "stream",
582
- "text": [
583
- "\r",
584
- " v5-L6-D204 49%[========> ] 503.03M 34.6MB/s eta 15s "
585
- ]
586
- },
587
- {
588
- "name": "stdout",
589
- "output_type": "stream",
590
- "text": [
591
- "\r",
592
- " v5-L6-D2048 50%[=========> ] 518.29M 37.3MB/s eta 15s "
593
- ]
594
- },
595
- {
596
- "name": "stdout",
597
- "output_type": "stream",
598
- "text": [
599
- "\r",
600
- " v5-L6-D2048- 51%[=========> ] 525.10M 35.8MB/s eta 15s "
601
- ]
602
- },
603
- {
604
- "name": "stdout",
605
- "output_type": "stream",
606
- "text": [
607
- "\r",
608
- " v5-L6-D2048-E 52%[=========> ] 534.05M 34.4MB/s eta 13s "
609
- ]
610
- },
611
- {
612
- "name": "stdout",
613
- "output_type": "stream",
614
- "text": [
615
- "\r",
616
- " v5-L6-D2048-E0 53%[=========> ] 548.80M 34.4MB/s eta 13s "
617
- ]
618
- },
619
- {
620
- "name": "stdout",
621
- "output_type": "stream",
622
- "text": [
623
- "\r",
624
- " v5-L6-D2048-E0_ 55%[==========> ] 562.75M 33.8MB/s eta 13s "
625
- ]
626
- },
627
- {
628
- "name": "stdout",
629
- "output_type": "stream",
630
- "text": [
631
- "\r",
632
- " v5-L6-D2048-E0_0 56%[==========> ] 579.31M 36.0MB/s eta 13s "
633
- ]
634
- },
635
- {
636
- "name": "stdout",
637
- "output_type": "stream",
638
- "text": [
639
- "\r",
640
- " v5-L6-D2048-E0_01 57%[==========> ] 581.49M 36.7MB/s eta 12s "
641
- ]
642
- },
643
- {
644
- "name": "stdout",
645
- "output_type": "stream",
646
- "text": [
647
- "\r",
648
- " v5-L6-D2048-E0_01- 58%[==========> ] 592.93M 37.4MB/s eta 12s "
649
- ]
650
- },
651
- {
652
- "name": "stdout",
653
- "output_type": "stream",
654
- "text": [
655
- "\r",
656
- "v5-L6-D2048-E0_01-s 58%[==========> ] 595.09M 37.1MB/s eta 12s "
657
- ]
658
- },
659
- {
660
- "name": "stdout",
661
- "output_type": "stream",
662
- "text": [
663
- "\r",
664
- "5-L6-D2048-E0_01-sp 60%[===========> ] 610.35M 38.5MB/s eta 12s "
665
- ]
666
- },
667
- {
668
- "name": "stdout",
669
- "output_type": "stream",
670
- "text": [
671
- "\r",
672
- "-L6-D2048-E0_01-spl 61%[===========> ] 625.61M 38.7MB/s eta 11s "
673
- ]
674
- },
675
- {
676
- "name": "stdout",
677
- "output_type": "stream",
678
- "text": [
679
- "\r",
680
- "L6-D2048-E0_01-spli 62%[===========> ] 640.36M 39.9MB/s eta 11s "
681
- ]
682
- },
683
- {
684
- "name": "stdout",
685
- "output_type": "stream",
686
- "text": [
687
- "\r",
688
- "6-D2048-E0_01-split 64%[===========> ] 653.30M 39.5MB/s eta 11s "
689
- ]
690
- },
691
- {
692
- "name": "stdout",
693
- "output_type": "stream",
694
- "text": [
695
- "\r",
696
- "-D2048-E0_01-split- 64%[===========> ] 656.13M 38.5MB/s eta 11s "
697
- ]
698
- },
699
- {
700
- "name": "stdout",
701
- "output_type": "stream",
702
- "text": [
703
- "\r",
704
- "D2048-E0_01-split-2 66%[============> ] 671.38M 38.9MB/s eta 9s "
705
- ]
706
- },
707
- {
708
- "name": "stdout",
709
- "output_type": "stream",
710
- "text": [
711
- "\r",
712
- "2048-E0_01-split-2a 67%[============> ] 685.57M 39.7MB/s eta 9s "
713
- ]
714
- },
715
- {
716
- "name": "stdout",
717
- "output_type": "stream",
718
- "text": [
719
- "\r",
720
- "048-E0_01-split-2a. 67%[============> ] 686.64M 37.5MB/s eta 9s "
721
- ]
722
- },
723
- {
724
- "name": "stdout",
725
- "output_type": "stream",
726
- "text": [
727
- "\r",
728
- "48-E0_01-split-2a.p 68%[============> ] 701.39M 37.9MB/s eta 9s "
729
- ]
730
- },
731
- {
732
- "name": "stdout",
733
- "output_type": "stream",
734
- "text": [
735
- "\r",
736
- "8-E0_01-split-2a.pt 69%[============> ] 708.59M 38.8MB/s eta 8s "
737
- ]
738
- },
739
- {
740
- "name": "stdout",
741
- "output_type": "stream",
742
- "text": [
743
- "\r",
744
- "-E0_01-split-2a.pth 70%[=============> ] 715.34M 38.2MB/s eta 8s "
745
- ]
746
- },
747
- {
748
- "name": "stdout",
749
- "output_type": "stream",
750
- "text": [
751
- "\r",
752
- "E0_01-split-2a.pth 71%[=============> ] 731.91M 40.7MB/s eta 8s "
753
- ]
754
- },
755
- {
756
- "name": "stdout",
757
- "output_type": "stream",
758
- "text": [
759
- "\r",
760
- "0_01-split-2a.pth 73%[=============> ] 747.17M 38.0MB/s eta 8s "
761
- ]
762
- },
763
- {
764
- "name": "stdout",
765
- "output_type": "stream",
766
- "text": [
767
- "\r",
768
- "_01-split-2a.pth 73%[=============> ] 747.75M 38.0MB/s eta 7s "
769
- ]
770
- },
771
- {
772
- "name": "stdout",
773
- "output_type": "stream",
774
- "text": [
775
- "\r",
776
- "01-split-2a.pth 74%[=============> ] 762.42M 40.2MB/s eta 7s "
777
- ]
778
- },
779
- {
780
- "name": "stdout",
781
- "output_type": "stream",
782
- "text": [
783
- "\r",
784
- "1-split-2a.pth 75%[==============> ] 762.94M 37.2MB/s eta 7s "
785
- ]
786
- },
787
- {
788
- "name": "stdout",
789
- "output_type": "stream",
790
- "text": [
791
- "\r",
792
- "-split-2a.pth 76%[==============> ] 776.37M 36.7MB/s eta 7s "
793
- ]
794
- },
795
- {
796
- "name": "stdout",
797
- "output_type": "stream",
798
- "text": [
799
- "\r",
800
- "split-2a.pth 76%[==============> ] 778.20M 34.9MB/s eta 7s "
801
- ]
802
- },
803
- {
804
- "name": "stdout",
805
- "output_type": "stream",
806
- "text": [
807
- "\r",
808
- "plit-2a.pth 77%[==============> ] 791.63M 38.1MB/s eta 7s "
809
- ]
810
- },
811
- {
812
- "name": "stdout",
813
- "output_type": "stream",
814
- "text": [
815
- "\r",
816
- "lit-2a.pth 78%[==============> ] 793.46M 36.0MB/s eta 7s "
817
- ]
818
- },
819
- {
820
- "name": "stdout",
821
- "output_type": "stream",
822
- "text": [
823
- "\r",
824
- "it-2a.pth 79%[==============> ] 808.20M 38.6MB/s eta 7s "
825
- ]
826
- },
827
- {
828
- "name": "stdout",
829
- "output_type": "stream",
830
- "text": [
831
- "\r",
832
- "t-2a.pth 80%[===============> ] 816.07M 36.7MB/s eta 7s "
833
- ]
834
- },
835
- {
836
- "name": "stdout",
837
- "output_type": "stream",
838
- "text": [
839
- "\r",
840
- "-2a.pth 81%[===============> ] 823.97M 34.7MB/s eta 5s "
841
- ]
842
- },
843
- {
844
- "name": "stdout",
845
- "output_type": "stream",
846
- "text": [
847
- "\r",
848
- "2a.pth 82%[===============> ] 837.41M 36.4MB/s eta 5s "
849
- ]
850
- },
851
- {
852
- "name": "stdout",
853
- "output_type": "stream",
854
- "text": [
855
- "\r",
856
- "a.pth 83%[===============> ] 853.98M 38.3MB/s eta 5s "
857
- ]
858
- },
859
- {
860
- "name": "stdout",
861
- "output_type": "stream",
862
- "text": [
863
- "\r",
864
- ".pth 85%[================> ] 867.67M 38.4MB/s eta 5s "
865
- ]
866
- },
867
- {
868
- "name": "stdout",
869
- "output_type": "stream",
870
- "text": [
871
- "\r",
872
- "pth 85%[================> ] 873.17M 39.1MB/s eta 5s "
873
- ]
874
- },
875
- {
876
- "name": "stdout",
877
- "output_type": "stream",
878
- "text": [
879
- "\r",
880
- "th 87%[================> ] 885.01M 36.1MB/s eta 4s "
881
- ]
882
- },
883
- {
884
- "name": "stdout",
885
- "output_type": "stream",
886
- "text": [
887
- "\r",
888
- "h 88%[================> ] 899.75M 37.6MB/s eta 4s "
889
- ]
890
- },
891
- {
892
- "name": "stdout",
893
- "output_type": "stream",
894
- "text": [
895
- "\r",
896
- " 88%[================> ] 900.40M 34.6MB/s eta 4s "
897
- ]
898
- },
899
- {
900
- "name": "stdout",
901
- "output_type": "stream",
902
- "text": [
903
- "\r",
904
- " v 90%[=================> ] 915.53M 35.4MB/s eta 4s "
905
- ]
906
- },
907
- {
908
- "name": "stdout",
909
- "output_type": "stream",
910
- "text": [
911
- "\r",
912
- " v5 91%[=================> ] 930.78M 37.6MB/s eta 2s "
913
- ]
914
- },
915
- {
916
- "name": "stdout",
917
- "output_type": "stream",
918
- "text": [
919
- "\r",
920
- " v5- 92%[=================> ] 945.53M 40.9MB/s eta 2s "
921
- ]
922
- },
923
- {
924
- "name": "stdout",
925
- "output_type": "stream",
926
- "text": [
927
- "\r",
928
- " v5-L 93%[=================> ] 946.04M 37.6MB/s eta 2s "
929
- ]
930
- },
931
- {
932
- "name": "stdout",
933
- "output_type": "stream",
934
- "text": [
935
- "\r",
936
- " v5-L6 94%[=================> ] 959.48M 38.0MB/s eta 2s "
937
- ]
938
- },
939
- {
940
- "name": "stdout",
941
- "output_type": "stream",
942
- "text": [
943
- "\r",
944
- " v5-L6- 94%[=================> ] 961.30M 33.1MB/s eta 2s "
945
- ]
946
- },
947
- {
948
- "name": "stdout",
949
- "output_type": "stream",
950
- "text": [
951
- "\r",
952
- " v5-L6-D 95%[==================> ] 976.05M 34.9MB/s eta 2s "
953
- ]
954
- },
955
- {
956
- "name": "stdout",
957
- "output_type": "stream",
958
- "text": [
959
- "\r",
960
- " v5-L6-D2 97%[==================> ] 991.31M 34.8MB/s eta 2s "
961
- ]
962
- },
963
- {
964
- "name": "stdout",
965
- "output_type": "stream",
966
- "text": [
967
- "\r",
968
- " v5-L6-D20 97%[==================> ] 992.94M 35.0MB/s eta 2s "
969
- ]
970
- },
971
- {
972
- "name": "stdout",
973
- "output_type": "stream",
974
- "text": [
975
- "\r",
976
- " v5-L6-D204 98%[==================> ] 1005M 34.7MB/s eta 0s "
977
- ]
978
- },
979
- {
980
- "name": "stdout",
981
- "output_type": "stream",
982
- "text": [
983
- "\r",
984
- " v5-L6-D2048 99%[==================> ] 1016M 33.7MB/s eta 0s \r",
985
- "v5-L6-D2048-E0_01-s 100%[===================>] 1017M 33.9MB/s in 28s \r\n",
986
- "\r\n",
987
- "2023-10-11 08:02:52 (36.4 MB/s) - ‘v5-L6-D2048-E0_01-split-2a.pth’ saved [1066536657/1066536657]\r\n",
988
- "\r\n"
989
- ]
990
- },
991
- {
992
- "name": "stdout",
993
- "output_type": "stream",
994
- "text": [
995
- "--2023-10-11 08:02:53-- https://huggingface.co/rwkv-x-dev/rwkv-x-playground/resolve/0600b94a58219f658326b4792ef5cd020e9d1a43/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-split-2b.pth\r\n",
996
- "Resolving huggingface.co (huggingface.co)... 18.154.227.67, 18.154.227.69, 18.154.227.7, ...\r\n",
997
- "Connecting to huggingface.co (huggingface.co)|18.154.227.67|:443... connected.\r\n",
998
- "HTTP request sent, awaiting response... 302 Found\r\n",
999
- "Location: https://cdn-lfs.huggingface.co/repos/2e/f7/2ef78555202aa92abdbdf476ce3d0fd5a8b15f7245edf0b80d4d30572355f30d/6b64a1018631b9ddd15a746002bab3eafe956dced78a91af7abcdadaae4a7b25?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27v5-L6-D2048-E0_01-split-2b.pth%3B+filename%3D%22v5-L6-D2048-E0_01-split-2b.pth%22%3B&Expires=1697270573&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTY5NzI3MDU3M319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy8yZS9mNy8yZWY3ODU1NTIwMmFhOTJhYmRiZGY0NzZjZTNkMGZkNWE4YjE1ZjcyNDVlZGYwYjgwZDRkMzA1NzIzNTVmMzBkLzZiNjRhMTAxODYzMWI5ZGRkMTVhNzQ2MDAyYmFiM2VhZmU5NTZkY2VkNzhhOTFhZjdhYmNkYWRhYWU0YTdiMjU%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=WkKE1KjbKeVQp4dWdBuAAbOfx2JJs%7EDJaKbx8gRQSGABLfGDhkq2L8Q9KZ1fg1v%7E74c0Mkrbvop33pAwQDh782jzEiogbDb8HXSO7AtIYQqvI6K-fmb%7EpxQPFrmypJwWhQj9ePRZX2KSL6LcqN1X0GAheI-PQENpVH3svxhhib2-fYDmuvnpGX7pc6n36GES6lvwOuCQOxfIhlFnIiuNEU00NaBdDiaXb-uteXhSkKO-1EFCM0fBtwT5hVkdHZQG2m6iMcI2KaN0AHV%7EvF838f4DM%7ERbjVkRgwphRaYZxmJxUKZxGTV7rRJjIQA%7EOlnPllE1dSdwJ7y0ULOIKQHYUQ__&Key-Pair-Id=KVTP0A1DKRTAX [following]\r\n",
1000
- "--2023-10-11 08:02:53-- https://cdn-lfs.huggingface.co/repos/2e/f7/2ef78555202aa92abdbdf476ce3d0fd5a8b15f7245edf0b80d4d30572355f30d/6b64a1018631b9ddd15a746002bab3eafe956dced78a91af7abcdadaae4a7b25?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27v5-L6-D2048-E0_01-split-2b.pth%3B+filename%3D%22v5-L6-D2048-E0_01-split-2b.pth%22%3B&Expires=1697270573&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTY5NzI3MDU3M319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy8yZS9mNy8yZWY3ODU1NTIwMmFhOTJhYmRiZGY0NzZjZTNkMGZkNWE4YjE1ZjcyNDVlZGYwYjgwZDRkMzA1NzIzNTVmMzBkLzZiNjRhMTAxODYzMWI5ZGRkMTVhNzQ2MDAyYmFiM2VhZmU5NTZkY2VkNzhhOTFhZjdhYmNkYWRhYWU0YTdiMjU%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=WkKE1KjbKeVQp4dWdBuAAbOfx2JJs%7EDJaKbx8gRQSGABLfGDhkq2L8Q9KZ1fg1v%7E74c0Mkrbvop33pAwQDh782jzEiogbDb8HXSO7AtIYQqvI6K-fmb%7EpxQPFrmypJwWhQj9ePRZX2KSL6LcqN1X0GAheI-PQENpVH3svxhhib2-fYDmuvnpGX7pc6n36GES6lvwOuCQOxfIhlFnIiuNEU00NaBdDiaXb-uteXhSkKO-1EFCM0fBtwT5hVkdHZQG2m6iMcI2KaN0AHV%7EvF838f4DM%7ERbjVkRgwphRaYZxmJxUKZxGTV7rRJjIQA%7EOlnPllE1dSdwJ7y0ULOIKQHYUQ__&Key-Pair-Id=KVTP0A1DKRTAX\r\n",
1001
- "Resolving cdn-lfs.huggingface.co (cdn-lfs.huggingface.co)... 3.162.112.95, 3.162.112.100, 3.162.112.2, ...\r\n",
1002
- "Connecting to cdn-lfs.huggingface.co (cdn-lfs.huggingface.co)|3.162.112.95|:443... connected.\r\n"
1003
- ]
1004
- },
1005
- {
1006
- "name": "stdout",
1007
- "output_type": "stream",
1008
- "text": [
1009
- "HTTP request sent, awaiting response... "
1010
- ]
1011
- },
1012
- {
1013
- "name": "stdout",
1014
- "output_type": "stream",
1015
- "text": [
1016
- "200 OK\r\n",
1017
- "Length: 1066536657 (1017M) [binary/octet-stream]\r\n",
1018
- "Saving to: ‘v5-L6-D2048-E0_01-split-2b.pth’\r\n",
1019
- "\r\n",
1020
- "\r",
1021
- " v5-L6-D20 0%[ ] 0 --.-KB/s "
1022
- ]
1023
- },
1024
- {
1025
- "name": "stdout",
1026
- "output_type": "stream",
1027
- "text": [
1028
- "\r",
1029
- " v5-L6-D204 1%[ ] 14.74M 67.8MB/s "
1030
- ]
1031
- },
1032
- {
1033
- "name": "stdout",
1034
- "output_type": "stream",
1035
- "text": [
1036
- "\r",
1037
- " v5-L6-D2048 2%[ ] 28.69M 63.1MB/s "
1038
- ]
1039
- },
1040
- {
1041
- "name": "stdout",
1042
- "output_type": "stream",
1043
- "text": [
1044
- "\r",
1045
- " v5-L6-D2048- 3%[ ] 30.52M 42.3MB/s "
1046
- ]
1047
- },
1048
- {
1049
- "name": "stdout",
1050
- "output_type": "stream",
1051
- "text": [
1052
- "\r",
1053
- " v5-L6-D2048-E 4%[ ] 45.26M 45.2MB/s "
1054
- ]
1055
- },
1056
- {
1057
- "name": "stdout",
1058
- "output_type": "stream",
1059
- "text": [
1060
- "\r",
1061
- " v5-L6-D2048-E0 4%[ ] 45.78M 37.9MB/s "
1062
- ]
1063
- },
1064
- {
1065
- "name": "stdout",
1066
- "output_type": "stream",
1067
- "text": [
1068
- "\r",
1069
- " v5-L6-D2048-E0_ 6%[> ] 61.03M 41.2MB/s "
1070
- ]
1071
- },
1072
- {
1073
- "name": "stdout",
1074
- "output_type": "stream",
1075
- "text": [
1076
- "\r",
1077
- " v5-L6-D2048-E0_0 7%[> ] 75.78M 45.0MB/s "
1078
- ]
1079
- },
1080
- {
1081
- "name": "stdout",
1082
- "output_type": "stream",
1083
- "text": [
1084
- "\r",
1085
- " v5-L6-D2048-E0_01 8%[> ] 85.94M 45.6MB/s "
1086
- ]
1087
- },
1088
- {
1089
- "name": "stdout",
1090
- "output_type": "stream",
1091
- "text": [
1092
- "\r",
1093
- " v5-L6-D2048-E0_01- 9%[> ] 91.55M 40.8MB/s "
1094
- ]
1095
- },
1096
- {
1097
- "name": "stdout",
1098
- "output_type": "stream",
1099
- "text": [
1100
- "\r",
1101
- "v5-L6-D2048-E0_01-s 10%[=> ] 106.81M 40.5MB/s "
1102
- ]
1103
- },
1104
- {
1105
- "name": "stdout",
1106
- "output_type": "stream",
1107
- "text": [
1108
- "\r",
1109
- "5-L6-D2048-E0_01-sp 12%[=> ] 122.07M 40.2MB/s eta 22s "
1110
- ]
1111
- },
1112
- {
1113
- "name": "stdout",
1114
- "output_type": "stream",
1115
- "text": [
1116
- "\r",
1117
- "-L6-D2048-E0_01-spl 13%[=> ] 137.33M 41.7MB/s eta 22s "
1118
- ]
1119
- },
1120
- {
1121
- "name": "stdout",
1122
- "output_type": "stream",
1123
- "text": [
1124
- "\r",
1125
- "L6-D2048-E0_01-spli 14%[=> ] 152.07M 42.9MB/s eta 22s "
1126
- ]
1127
- },
1128
- {
1129
- "name": "stdout",
1130
- "output_type": "stream",
1131
- "text": [
1132
- "\r",
1133
- "6-D2048-E0_01-split 16%[==> ] 167.33M 43.6MB/s eta 22s "
1134
- ]
1135
- },
1136
- {
1137
- "name": "stdout",
1138
- "output_type": "stream",
1139
- "text": [
1140
- "\r",
1141
- "-D2048-E0_01-split- 17%[==> ] 181.32M 44.9MB/s eta 19s "
1142
- ]
1143
- },
1144
- {
1145
- "name": "stdout",
1146
- "output_type": "stream",
1147
- "text": [
1148
- "\r",
1149
- "D2048-E0_01-split-2 18%[==> ] 183.10M 41.7MB/s eta 19s "
1150
- ]
1151
- },
1152
- {
1153
- "name": "stdout",
1154
- "output_type": "stream",
1155
- "text": [
1156
- "\r",
1157
- "2048-E0_01-split-2b 19%[==> ] 196.53M 41.8MB/s eta 19s "
1158
- ]
1159
- },
1160
- {
1161
- "name": "stdout",
1162
- "output_type": "stream",
1163
- "text": [
1164
- "\r",
1165
- "048-E0_01-split-2b. 19%[==> ] 198.36M 39.0MB/s eta 19s "
1166
- ]
1167
- },
1168
- {
1169
- "name": "stdout",
1170
- "output_type": "stream",
1171
- "text": [
1172
- "\r",
1173
- "48-E0_01-split-2b.p 20%[===> ] 213.11M 39.0MB/s eta 20s "
1174
- ]
1175
- },
1176
- {
1177
- "name": "stdout",
1178
- "output_type": "stream",
1179
- "text": [
1180
- "\r",
1181
- "8-E0_01-split-2b.pt 21%[===> ] 220.29M 40.8MB/s eta 20s "
1182
- ]
1183
- },
1184
- {
1185
- "name": "stdout",
1186
- "output_type": "stream",
1187
- "text": [
1188
- "\r",
1189
- "-E0_01-split-2b.pth 22%[===> ] 228.36M 39.9MB/s eta 20s "
1190
- ]
1191
- },
1192
- {
1193
- "name": "stdout",
1194
- "output_type": "stream",
1195
- "text": [
1196
- "\r",
1197
- "E0_01-split-2b.pth 24%[===> ] 244.13M 40.3MB/s eta 20s "
1198
- ]
1199
- },
1200
- {
1201
- "name": "stdout",
1202
- "output_type": "stream",
1203
- "text": [
1204
- "\r",
1205
- "0_01-split-2b.pth 25%[====> ] 259.40M 40.4MB/s eta 18s "
1206
- ]
1207
- },
1208
- {
1209
- "name": "stdout",
1210
- "output_type": "stream",
1211
- "text": [
1212
- "\r",
1213
- "_01-split-2b.pth 26%[====> ] 274.14M 42.1MB/s eta 18s "
1214
- ]
1215
- },
1216
- {
1217
- "name": "stdout",
1218
- "output_type": "stream",
1219
- "text": [
1220
- "\r",
1221
- "01-split-2b.pth 27%[====> ] 274.66M 38.5MB/s eta 18s "
1222
- ]
1223
- },
1224
- {
1225
- "name": "stdout",
1226
- "output_type": "stream",
1227
- "text": [
1228
- "\r",
1229
- "1-split-2b.pth 28%[====> ] 289.92M 41.6MB/s eta 18s "
1230
- ]
1231
- },
1232
- {
1233
- "name": "stdout",
1234
- "output_type": "stream",
1235
- "text": [
1236
- "\r",
1237
- "-split-2b.pth 30%[=====> ] 305.18M 41.6MB/s eta 17s "
1238
- ]
1239
- },
1240
- {
1241
- "name": "stdout",
1242
- "output_type": "stream",
1243
- "text": [
1244
- "\r",
1245
- "split-2b.pth 31%[=====> ] 320.43M 40.9MB/s eta 17s "
1246
- ]
1247
- },
1248
- {
1249
- "name": "stdout",
1250
- "output_type": "stream",
1251
- "text": [
1252
- "\r",
1253
- "plit-2b.pth 32%[=====> ] 335.18M 41.2MB/s eta 17s "
1254
- ]
1255
- },
1256
- {
1257
- "name": "stdout",
1258
- "output_type": "stream",
1259
- "text": [
1260
- "\r",
1261
- "lit-2b.pth 33%[=====> ] 335.69M 38.5MB/s eta 17s "
1262
- ]
1263
- },
1264
- {
1265
- "name": "stdout",
1266
- "output_type": "stream",
1267
- "text": [
1268
- "\r",
1269
- "it-2b.pth 34%[=====> ] 350.95M 38.6MB/s eta 16s "
1270
- ]
1271
- },
1272
- {
1273
- "name": "stdout",
1274
- "output_type": "stream",
1275
- "text": [
1276
- "\r",
1277
- "t-2b.pth 35%[======> ] 365.70M 40.9MB/s eta 16s "
1278
- ]
1279
- },
1280
- {
1281
- "name": "stdout",
1282
- "output_type": "stream",
1283
- "text": [
1284
- "\r",
1285
- "-2b.pth 36%[======> ] 366.20M 38.0MB/s eta 16s "
1286
- ]
1287
- },
1288
- {
1289
- "name": "stdout",
1290
- "output_type": "stream",
1291
- "text": [
1292
- "\r",
1293
- "2b.pth 37%[======> ] 381.47M 38.8MB/s eta 16s "
1294
- ]
1295
- },
1296
- {
1297
- "name": "stdout",
1298
- "output_type": "stream",
1299
- "text": [
1300
- "\r",
1301
- "b.pth 37%[======> ] 385.65M 39.1MB/s eta 16s "
1302
- ]
1303
- },
1304
- {
1305
- "name": "stdout",
1306
- "output_type": "stream",
1307
- "text": [
1308
- "\r",
1309
- ".pth 39%[======> ] 396.73M 36.3MB/s eta 16s "
1310
- ]
1311
- },
1312
- {
1313
- "name": "stdout",
1314
- "output_type": "stream",
1315
- "text": [
1316
- "\r",
1317
- "pth 39%[======> ] 406.75M 37.5MB/s eta 16s "
1318
- ]
1319
- },
1320
- {
1321
- "name": "stdout",
1322
- "output_type": "stream",
1323
- "text": [
1324
- "\r",
1325
- "th 40%[=======> ] 411.99M 33.0MB/s eta 16s "
1326
- ]
1327
- },
1328
- {
1329
- "name": "stdout",
1330
- "output_type": "stream",
1331
- "text": [
1332
- "\r",
1333
- "h 42%[=======> ] 427.25M 33.6MB/s eta 15s "
1334
- ]
1335
- },
1336
- {
1337
- "name": "stdout",
1338
- "output_type": "stream",
1339
- "text": [
1340
- "\r",
1341
- " 43%[=======> ] 441.98M 32.7MB/s eta 15s "
1342
- ]
1343
- },
1344
- {
1345
- "name": "stdout",
1346
- "output_type": "stream",
1347
- "text": [
1348
- "\r",
1349
- " v 43%[=======> ] 442.51M 32.5MB/s eta 15s "
1350
- ]
1351
- },
1352
- {
1353
- "name": "stdout",
1354
- "output_type": "stream",
1355
- "text": [
1356
- "\r",
1357
- " v5 44%[=======> ] 457.25M 32.2MB/s eta 15s "
1358
- ]
1359
- },
1360
- {
1361
- "name": "stdout",
1362
- "output_type": "stream",
1363
- "text": [
1364
- "\r",
1365
- " v5- 45%[========> ] 457.76M 32.2MB/s eta 15s "
1366
- ]
1367
- },
1368
- {
1369
- "name": "stdout",
1370
- "output_type": "stream",
1371
- "text": [
1372
- "\r",
1373
- " v5-L 46%[========> ] 472.50M 31.5MB/s eta 14s "
1374
- ]
1375
- },
1376
- {
1377
- "name": "stdout",
1378
- "output_type": "stream",
1379
- "text": [
1380
- "\r",
1381
- " v5-L6 46%[========> ] 473.02M 31.5MB/s eta 14s "
1382
- ]
1383
- },
1384
- {
1385
- "name": "stdout",
1386
- "output_type": "stream",
1387
- "text": [
1388
- "\r",
1389
- " v5-L6- 48%[========> ] 488.28M 30.9MB/s eta 14s "
1390
- ]
1391
- },
1392
- {
1393
- "name": "stdout",
1394
- "output_type": "stream",
1395
- "text": [
1396
- "\r",
1397
- " v5-L6-D 49%[========> ] 503.54M 33.6MB/s eta 14s "
1398
- ]
1399
- },
1400
- {
1401
- "name": "stdout",
1402
- "output_type": "stream",
1403
- "text": [
1404
- "\r",
1405
- " v5-L6-D2 50%[=========> ] 518.29M 34.2MB/s eta 14s "
1406
- ]
1407
- },
1408
- {
1409
- "name": "stdout",
1410
- "output_type": "stream",
1411
- "text": [
1412
- "\r",
1413
- " v5-L6-D20 51%[=========> ] 518.80M 34.8MB/s eta 13s "
1414
- ]
1415
- },
1416
- {
1417
- "name": "stdout",
1418
- "output_type": "stream",
1419
- "text": [
1420
- "\r",
1421
- " v5-L6-D204 52%[=========> ] 534.05M 34.2MB/s eta 13s "
1422
- ]
1423
- },
1424
- {
1425
- "name": "stdout",
1426
- "output_type": "stream",
1427
- "text": [
1428
- "\r",
1429
- " v5-L6-D2048 54%[=========> ] 549.31M 37.5MB/s eta 13s "
1430
- ]
1431
- },
1432
- {
1433
- "name": "stdout",
1434
- "output_type": "stream",
1435
- "text": [
1436
- "\r",
1437
- " v5-L6-D2048- 55%[==========> ] 564.06M 37.7MB/s eta 13s "
1438
- ]
1439
- },
1440
- {
1441
- "name": "stdout",
1442
- "output_type": "stream",
1443
- "text": [
1444
- "\r",
1445
- " v5-L6-D2048-E 55%[==========> ] 565.78M 37.7MB/s eta 12s "
1446
- ]
1447
- },
1448
- {
1449
- "name": "stdout",
1450
- "output_type": "stream",
1451
- "text": [
1452
- "\r",
1453
- " v5-L6-D2048-E0 57%[==========> ] 579.83M 37.8MB/s eta 12s "
1454
- ]
1455
- },
1456
- {
1457
- "name": "stdout",
1458
- "output_type": "stream",
1459
- "text": [
1460
- "\r",
1461
- " v5-L6-D2048-E0_ 58%[==========> ] 595.09M 39.7MB/s eta 12s "
1462
- ]
1463
- },
1464
- {
1465
- "name": "stdout",
1466
- "output_type": "stream",
1467
- "text": [
1468
- "\r",
1469
- " v5-L6-D2048-E0_0 60%[===========> ] 610.35M 40.9MB/s eta 12s "
1470
- ]
1471
- },
1472
- {
1473
- "name": "stdout",
1474
- "output_type": "stream",
1475
- "text": [
1476
- "\r",
1477
- " v5-L6-D2048-E0_01 61%[===========> ] 625.47M 44.1MB/s eta 10s "
1478
- ]
1479
- },
1480
- {
1481
- "name": "stdout",
1482
- "output_type": "stream",
1483
- "text": [
1484
- "\r",
1485
- " v5-L6-D2048-E0_01- 61%[===========> ] 629.82M 42.6MB/s eta 10s "
1486
- ]
1487
- },
1488
- {
1489
- "name": "stdout",
1490
- "output_type": "stream",
1491
- "text": [
1492
- "\r",
1493
- "v5-L6-D2048-E0_01-s 63%[===========> ] 640.87M 42.6MB/s eta 10s "
1494
- ]
1495
- },
1496
- {
1497
- "name": "stdout",
1498
- "output_type": "stream",
1499
- "text": [
1500
- "\r",
1501
- "5-L6-D2048-E0_01-sp 64%[===========> ] 656.13M 45.6MB/s eta 10s "
1502
- ]
1503
- },
1504
- {
1505
- "name": "stdout",
1506
- "output_type": "stream",
1507
- "text": [
1508
- "\r",
1509
- "-L6-D2048-E0_01-spl 66%[============> ] 671.38M 45.8MB/s eta 9s "
1510
- ]
1511
- },
1512
- {
1513
- "name": "stdout",
1514
- "output_type": "stream",
1515
- "text": [
1516
- "\r",
1517
- "L6-D2048-E0_01-spli 67%[============> ] 686.64M 47.1MB/s eta 9s "
1518
- ]
1519
- },
1520
- {
1521
- "name": "stdout",
1522
- "output_type": "stream",
1523
- "text": [
1524
- "\r",
1525
- "6-D2048-E0_01-split 69%[============> ] 701.90M 47.0MB/s eta 9s "
1526
- ]
1527
- },
1528
- {
1529
- "name": "stdout",
1530
- "output_type": "stream",
1531
- "text": [
1532
- "\r",
1533
- "-D2048-E0_01-split- 70%[=============> ] 717.16M 46.9MB/s eta 9s "
1534
- ]
1535
- },
1536
- {
1537
- "name": "stdout",
1538
- "output_type": "stream",
1539
- "text": [
1540
- "\r",
1541
- "D2048-E0_01-split-2 71%[=============> ] 730.60M 47.8MB/s eta 7s "
1542
- ]
1543
- },
1544
- {
1545
- "name": "stdout",
1546
- "output_type": "stream",
1547
- "text": [
1548
- "\r",
1549
- "2048-E0_01-split-2b 73%[=============> ] 747.17M 45.9MB/s eta 7s "
1550
- ]
1551
- },
1552
- {
1553
- "name": "stdout",
1554
- "output_type": "stream",
1555
- "text": [
1556
- "\r",
1557
- "048-E0_01-split-2b. 74%[=============> ] 755.98M 45.7MB/s eta 7s "
1558
- ]
1559
- },
1560
- {
1561
- "name": "stdout",
1562
- "output_type": "stream",
1563
- "text": [
1564
- "\r",
1565
- "48-E0_01-split-2b.p 75%[==============> ] 762.94M 43.2MB/s eta 6s "
1566
- ]
1567
- },
1568
- {
1569
- "name": "stdout",
1570
- "output_type": "stream",
1571
- "text": [
1572
- "\r",
1573
- "8-E0_01-split-2b.pt 76%[==============> ] 777.68M 45.4MB/s eta 6s "
1574
- ]
1575
- },
1576
- {
1577
- "name": "stdout",
1578
- "output_type": "stream",
1579
- "text": [
1580
- "\r",
1581
- "-E0_01-split-2b.pth 76%[==============> ] 778.32M 42.7MB/s eta 6s "
1582
- ]
1583
- },
1584
- {
1585
- "name": "stdout",
1586
- "output_type": "stream",
1587
- "text": [
1588
- "\r",
1589
- "E0_01-split-2b.pth 78%[==============> ] 793.46M 42.0MB/s eta 6s "
1590
- ]
1591
- },
1592
- {
1593
- "name": "stdout",
1594
- "output_type": "stream",
1595
- "text": [
1596
- "\r",
1597
- "0_01-split-2b.pth 79%[==============> ] 808.20M 41.7MB/s eta 5s "
1598
- ]
1599
- },
1600
- {
1601
- "name": "stdout",
1602
- "output_type": "stream",
1603
- "text": [
1604
- "\r",
1605
- "_01-split-2b.pth 80%[===============> ] 814.09M 42.3MB/s eta 5s "
1606
- ]
1607
- },
1608
- {
1609
- "name": "stdout",
1610
- "output_type": "stream",
1611
- "text": [
1612
- "\r",
1613
- "01-split-2b.pth 80%[===============> ] 823.46M 41.0MB/s eta 5s "
1614
- ]
1615
- },
1616
- {
1617
- "name": "stdout",
1618
- "output_type": "stream",
1619
- "text": [
1620
- "\r",
1621
- "1-split-2b.pth 81%[===============> ] 823.97M 40.5MB/s eta 5s "
1622
- ]
1623
- },
1624
- {
1625
- "name": "stdout",
1626
- "output_type": "stream",
1627
- "text": [
1628
- "\r",
1629
- "-split-2b.pth 82%[===============> ] 838.71M 38.6MB/s eta 4s "
1630
- ]
1631
- },
1632
- {
1633
- "name": "stdout",
1634
- "output_type": "stream",
1635
- "text": [
1636
- "\r",
1637
- "split-2b.pth 83%[===============> ] 853.98M 41.4MB/s eta 4s "
1638
- ]
1639
- },
1640
- {
1641
- "name": "stdout",
1642
- "output_type": "stream",
1643
- "text": [
1644
- "\r",
1645
- "plit-2b.pth 84%[===============> ] 854.61M 38.0MB/s eta 4s "
1646
- ]
1647
- },
1648
- {
1649
- "name": "stdout",
1650
- "output_type": "stream",
1651
- "text": [
1652
- "\r",
1653
- "lit-2b.pth 85%[================> ] 869.24M 35.7MB/s eta 4s "
1654
- ]
1655
- },
1656
- {
1657
- "name": "stdout",
1658
- "output_type": "stream",
1659
- "text": [
1660
- "\r",
1661
- "it-2b.pth 85%[================> ] 869.75M 35.3MB/s eta 4s "
1662
- ]
1663
- },
1664
- {
1665
- "name": "stdout",
1666
- "output_type": "stream",
1667
- "text": [
1668
- "\r",
1669
- "t-2b.pth 86%[================> ] 875.74M 34.3MB/s eta 4s "
1670
- ]
1671
- },
1672
- {
1673
- "name": "stdout",
1674
- "output_type": "stream",
1675
- "text": [
1676
- "\r",
1677
- "-2b.pth 87%[================> ] 885.01M 32.5MB/s eta 4s "
1678
- ]
1679
- },
1680
- {
1681
- "name": "stdout",
1682
- "output_type": "stream",
1683
- "text": [
1684
- "\r",
1685
- "2b.pth 88%[================> ] 900.27M 33.8MB/s eta 4s "
1686
- ]
1687
- },
1688
- {
1689
- "name": "stdout",
1690
- "output_type": "stream",
1691
- "text": [
1692
- "\r",
1693
- "b.pth 89%[================> ] 913.70M 34.5MB/s eta 3s "
1694
- ]
1695
- },
1696
- {
1697
- "name": "stdout",
1698
- "output_type": "stream",
1699
- "text": [
1700
- "\r",
1701
- ".pth 90%[=================> ] 924.21M 34.8MB/s eta 3s "
1702
- ]
1703
- },
1704
- {
1705
- "name": "stdout",
1706
- "output_type": "stream",
1707
- "text": [
1708
- "\r",
1709
- "pth 91%[=================> ] 930.27M 35.3MB/s eta 3s "
1710
- ]
1711
- },
1712
- {
1713
- "name": "stdout",
1714
- "output_type": "stream",
1715
- "text": [
1716
- "\r",
1717
- "th 93%[=================> ] 946.04M 34.9MB/s eta 3s "
1718
- ]
1719
- },
1720
- {
1721
- "name": "stdout",
1722
- "output_type": "stream",
1723
- "text": [
1724
- "\r",
1725
- "h 94%[=================> ] 961.30M 37.2MB/s eta 1s "
1726
- ]
1727
- },
1728
- {
1729
- "name": "stdout",
1730
- "output_type": "stream",
1731
- "text": [
1732
- "\r",
1733
- " 95%[==================> ] 970.14M 35.7MB/s eta 1s "
1734
- ]
1735
- },
1736
- {
1737
- "name": "stdout",
1738
- "output_type": "stream",
1739
- "text": [
1740
- "\r",
1741
- " v 96%[==================> ] 976.55M 34.9MB/s eta 1s "
1742
- ]
1743
- },
1744
- {
1745
- "name": "stdout",
1746
- "output_type": "stream",
1747
- "text": [
1748
- "\r",
1749
- " v5 97%[==================> ] 991.82M 37.0MB/s eta 1s "
1750
- ]
1751
- },
1752
- {
1753
- "name": "stdout",
1754
- "output_type": "stream",
1755
- "text": [
1756
- "\r",
1757
- " v5- 98%[==================> ] 998.13M 35.6MB/s eta 1s "
1758
- ]
1759
- },
1760
- {
1761
- "name": "stdout",
1762
- "output_type": "stream",
1763
- "text": [
1764
- "\r",
1765
- " v5-L 98%[==================> ] 1007M 37.2MB/s eta 0s "
1766
- ]
1767
- },
1768
- {
1769
- "name": "stdout",
1770
- "output_type": "stream",
1771
- "text": [
1772
- "\r",
1773
- " v5-L6 99%[==================> ] 1016M 34.9MB/s eta 0s \r",
1774
- "v5-L6-D2048-E0_01-s 100%[===================>] 1017M 35.1MB/s in 26s \r\n",
1775
- "\r\n",
1776
- "2023-10-11 08:03:19 (38.9 MB/s) - ‘v5-L6-D2048-E0_01-split-2b.pth’ saved [1066536657/1066536657]\r\n",
1777
- "\r\n"
1778
- ]
1779
- }
1780
- ],
1781
- "source": [
1782
- "# Get the init split model, and finetune from there\n",
1783
- "!cd \"{PROJECT_DIR}/model/\" && wget -nc \"https://huggingface.co/rwkv-x-dev/rwkv-x-playground/resolve/0600b94a58219f658326b4792ef5cd020e9d1a43/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-split-2a.pth\"\n",
1784
- "!cd \"{PROJECT_DIR}/model/\" && wget -nc \"https://huggingface.co/rwkv-x-dev/rwkv-x-playground/resolve/0600b94a58219f658326b4792ef5cd020e9d1a43/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-split-2b.pth\""
1785
- ]
1786
- },
1787
- {
1788
- "cell_type": "code",
1789
- "execution_count": 4,
1790
- "id": "2a3cd2d1",
1791
- "metadata": {
1792
- "execution": {
1793
- "iopub.execute_input": "2023-10-11T08:03:19.666619Z",
1794
- "iopub.status.busy": "2023-10-11T08:03:19.665958Z",
1795
- "iopub.status.idle": "2023-10-11T08:03:29.305787Z",
1796
- "shell.execute_reply": "2023-10-11T08:03:29.304873Z"
1797
- },
1798
- "papermill": {
1799
- "duration": 9.658186,
1800
- "end_time": "2023-10-11T08:03:29.308744",
1801
- "exception": false,
1802
- "start_time": "2023-10-11T08:03:19.650558",
1803
- "status": "completed"
1804
- },
1805
- "tags": []
1806
- },
1807
- "outputs": [
1808
- {
1809
- "name": "stdout",
1810
- "output_type": "stream",
1811
- "text": [
1812
- "\r",
1813
- "Saving the dataset (0/2 shards): 0%| | 0/27200 [00:00<?, ? examples/s]"
1814
- ]
1815
- },
1816
- {
1817
- "name": "stdout",
1818
- "output_type": "stream",
1819
- "text": [
1820
- "\r",
1821
- "Saving the dataset (0/2 shards): 7%| | 2000/27200 [00:00<00:01, 16356.85 examp"
1822
- ]
1823
- },
1824
- {
1825
- "name": "stdout",
1826
- "output_type": "stream",
1827
- "text": [
1828
- "\r",
1829
- "Saving the dataset (0/2 shards): 15%|▏| 4000/27200 [00:00<00:01, 17283.77 examp"
1830
- ]
1831
- },
1832
- {
1833
- "name": "stdout",
1834
- "output_type": "stream",
1835
- "text": [
1836
- "\r",
1837
- "Saving the dataset (0/2 shards): 22%|▏| 6000/27200 [00:00<00:01, 17873.97 examp"
1838
- ]
1839
- },
1840
- {
1841
- "name": "stdout",
1842
- "output_type": "stream",
1843
- "text": [
1844
- "\r",
1845
- "Saving the dataset (0/2 shards): 29%|▎| 8000/27200 [00:00<00:01, 18442.59 examp"
1846
- ]
1847
- },
1848
- {
1849
- "name": "stdout",
1850
- "output_type": "stream",
1851
- "text": [
1852
- "\r",
1853
- "Saving the dataset (0/2 shards): 40%|▍| 11000/27200 [00:00<00:00, 19135.78 exam"
1854
- ]
1855
- },
1856
- {
1857
- "name": "stdout",
1858
- "output_type": "stream",
1859
- "text": [
1860
- "\r",
1861
- "Saving the dataset (0/2 shards): 50%|▌| 13600/27200 [00:00<00:00, 19543.92 exam\r",
1862
- "Saving the dataset (1/2 shards): 50%|▌| 13600/27200 [00:00<00:00, 19543.92 exam"
1863
- ]
1864
- },
1865
- {
1866
- "name": "stdout",
1867
- "output_type": "stream",
1868
- "text": [
1869
- "\r",
1870
- "Saving the dataset (1/2 shards): 65%|▋| 17600/27200 [00:00<00:00, 20515.42 exam"
1871
- ]
1872
- },
1873
- {
1874
- "name": "stdout",
1875
- "output_type": "stream",
1876
- "text": [
1877
- "\r",
1878
- "Saving the dataset (1/2 shards): 79%|▊| 21600/27200 [00:01<00:00, 21426.57 exam"
1879
- ]
1880
- },
1881
- {
1882
- "name": "stdout",
1883
- "output_type": "stream",
1884
- "text": [
1885
- "\r",
1886
- "Saving the dataset (1/2 shards): 94%|▉| 25600/27200 [00:01<00:00, 22078.81 exam"
1887
- ]
1888
- },
1889
- {
1890
- "name": "stdout",
1891
- "output_type": "stream",
1892
- "text": [
1893
- "\r",
1894
- "Saving the dataset (2/2 shards): 100%|█| 27200/27200 [00:01<00:00, 22078.81 exam\r",
1895
- "Saving the dataset (2/2 shards): 100%|█| 27200/27200 [00:01<00:00, 20603.99 exam\r\n",
1896
- "\r",
1897
- "Saving the dataset (0/1 shards): 0%| | 0/109 [00:00<?, ? examples/s]\r",
1898
- "Saving the dataset (1/1 shards): 100%|█| 109/109 [00:00<00:00, 8117.24 examples/\r",
1899
- "Saving the dataset (1/1 shards): 100%|█| 109/109 [00:00<00:00, 7809.82 examples/\r\n"
1900
- ]
1901
- }
1902
- ],
1903
- "source": [
1904
- "# Lets preload the requried datasets\n",
1905
- "!cd \"{TRAINER_DIR}\" && \\\n",
1906
- " python3 preload_datapath.py \"{NOTEBOOK_DIR}/enwiki-4k-part3.yaml\""
1907
- ]
1908
- },
1909
- {
1910
- "cell_type": "markdown",
1911
- "id": "77d1d3e8",
1912
- "metadata": {
1913
- "papermill": {
1914
- "duration": 0.016656,
1915
- "end_time": "2023-10-11T08:03:29.342825",
1916
- "exception": false,
1917
- "start_time": "2023-10-11T08:03:29.326169",
1918
- "status": "completed"
1919
- },
1920
- "tags": []
1921
- },
1922
- "source": [
1923
- "## Enwiki Stage 3 : Split-Baseline-A training"
1924
- ]
1925
- },
1926
- {
1927
- "cell_type": "code",
1928
- "execution_count": 5,
1929
- "id": "42cb403e",
1930
- "metadata": {
1931
- "execution": {
1932
- "iopub.execute_input": "2023-10-11T08:03:29.379159Z",
1933
- "iopub.status.busy": "2023-10-11T08:03:29.378428Z",
1934
- "iopub.status.idle": "2023-10-11T08:03:46.935627Z",
1935
- "shell.execute_reply": "2023-10-11T08:03:46.934802Z"
1936
- },
1937
- "papermill": {
1938
- "duration": 17.577903,
1939
- "end_time": "2023-10-11T08:03:46.937715",
1940
- "exception": false,
1941
- "start_time": "2023-10-11T08:03:29.359812",
1942
- "status": "completed"
1943
- },
1944
- "tags": []
1945
- },
1946
- "outputs": [
1947
- {
1948
- "name": "stdout",
1949
- "output_type": "stream",
1950
- "text": [
1951
- "[2023-10-11 08:03:33,838] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n"
1952
- ]
1953
- },
1954
- {
1955
- "name": "stdout",
1956
- "output_type": "stream",
1957
- "text": [
1958
- "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n"
1959
- ]
1960
- },
1961
- {
1962
- "name": "stdout",
1963
- "output_type": "stream",
1964
- "text": [
1965
- "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py:484: UserWarning: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/enwiki-4k-part3.yaml', '--trainer.logger.init_args.name=[Multi-size] v5-L6+6-D2048-E0.01 - layer-expansion A3 (train-ctx=4k, deepspeed_stage_2_offload)', '--trainer.strategy=deepspeed_stage_2_offload', '--trainer.devices=auto', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-L6+6-D2048-E0_01-layer-expansion-a3/', '--model.load_model=../model/v5-L6+6-D2048-E0_01-split-2a.pth', '--model.ctx_len=4096', '--model.bptt_learning_range=1'], args=['fit', '-c', '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/enwiki-4k-part3.yaml', '--trainer.logger.init_args.name=[Multi-size] v5-L6+6-D2048-E0.01 - layer-expansion A3 (train-ctx=4k, deepspeed_stage_2_offload)', '--trainer.strategy=deepspeed_stage_2_offload', '--trainer.devices=auto', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-L6+6-D2048-E0_01-layer-expansion-a3/', '--model.load_model=../model/v5-L6+6-D2048-E0_01-split-2a.pth', '--model.ctx_len=4096', '--model.bptt_learning_range=1'].\r\n",
1966
- " rank_zero_warn(\r\n"
1967
- ]
1968
- },
1969
- {
1970
- "name": "stdout",
1971
- "output_type": "stream",
1972
- "text": [
1973
- "/usr/local/lib/python3.10/dist-packages/lightning/fabric/utilities/seed.py:39: UserWarning: No seed found, seed set to 1933922385\r\n",
1974
- " rank_zero_warn(f\"No seed found, seed set to {seed}\")\r\n",
1975
- "Global seed set to 1933922385\r\n"
1976
- ]
1977
- },
1978
- {
1979
- "name": "stdout",
1980
- "output_type": "stream",
1981
- "text": [
1982
- "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mpicocreator\u001b[0m (\u001b[33mrwkv-x-dev\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\r\n"
1983
- ]
1984
- },
1985
- {
1986
- "name": "stdout",
1987
- "output_type": "stream",
1988
- "text": [
1989
- "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.15.12\r\n",
1990
- "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m./wandb/run-20231011_080337-5696uouo\u001b[0m\r\n",
1991
- "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\r\n",
1992
- "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33m[Multi-size] v5-L6+6-D2048-E0.01 - layer-expansion A3 (train-ctx=4k, deepspeed_stage_2_offload)\u001b[0m\r\n",
1993
- "\u001b[34m\u001b[1mwandb\u001b[0m: ⭐️ View project at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments\u001b[0m\r\n",
1994
- "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/runs/5696uouo\u001b[0m\r\n"
1995
- ]
1996
- },
1997
- {
1998
- "name": "stdout",
1999
- "output_type": "stream",
2000
- "text": [
2001
- "Traceback (most recent call last):\r\n",
2002
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 278, in <module>\r\n",
2003
- " cli_main()\r\n",
2004
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 253, in cli_main\r\n",
2005
- " LightningCLI(\r\n",
2006
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 350, in __init__\r\n",
2007
- " self.instantiate_classes()\r\n",
2008
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 499, in instantiate_classes\r\n",
2009
- " self.config_init = self.parser.instantiate_classes(self.config)\r\n",
2010
- " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_deprecated.py\", line 139, in patched_instantiate_classes\r\n",
2011
- " cfg = self._unpatched_instantiate_classes(cfg, **kwargs)\r\n",
2012
- " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_core.py\", line 1130, in instantiate_classes\r\n",
2013
- " cfg[subcommand] = subparser.instantiate_classes(cfg[subcommand], instantiate_groups=instantiate_groups)\r\n",
2014
- " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_deprecated.py\", line 139, in patched_instantiate_classes\r\n",
2015
- " cfg = self._unpatched_instantiate_classes(cfg, **kwargs)\r\n",
2016
- " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_core.py\", line 1124, in instantiate_classes\r\n",
2017
- " component.instantiate_class(component, cfg)\r\n",
2018
- " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_signatures.py\", line 561, in group_instantiate_class\r\n",
2019
- " parent[key] = group.group_class(**value)\r\n",
2020
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 566, in __init__\r\n",
2021
- " raise ValueError(f\"load_model file '{load_model}' does not exist\")\r\n",
2022
- "ValueError: load_model file '../model/v5-L6+6-D2048-E0_01-split-2a.pth' does not exist\r\n"
2023
- ]
2024
- },
2025
- {
2026
- "name": "stdout",
2027
- "output_type": "stream",
2028
- "text": [
2029
- "\u001b[34m\u001b[1mwandb\u001b[0m: Waiting for W&B process to finish... \u001b[31m(failed 1).\u001b[0m Press Control-C to abort syncing.\r\n"
2030
- ]
2031
- },
2032
- {
2033
- "name": "stdout",
2034
- "output_type": "stream",
2035
- "text": [
2036
- "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run \u001b[33m[Multi-size] v5-L6+6-D2048-E0.01 - layer-expansion A3 (train-ctx=4k, deepspeed_stage_2_offload)\u001b[0m at: \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/runs/5696uouo\u001b[0m\r\n",
2037
- "\u001b[34m\u001b[1mwandb\u001b[0m: ️⚡ View job at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjk0OTk4MDcy/version_details/v16\u001b[0m\r\n",
2038
- "\u001b[34m\u001b[1mwandb\u001b[0m: Synced 5 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)\r\n",
2039
- "\u001b[34m\u001b[1mwandb\u001b[0m: Find logs at: \u001b[35m\u001b[1m./wandb/run-20231011_080337-5696uouo/logs\u001b[0m\r\n"
2040
- ]
2041
- }
2042
- ],
2043
- "source": [
2044
- "# Start the foundation model training\n",
2045
- "!cd \"{TRAINER_DIR}\" && \\\n",
2046
- " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n",
2047
- " python3 lightning_trainer.py fit \\\n",
2048
- " -c \"{NOTEBOOK_DIR}/enwiki-4k-part3.yaml\" \\\n",
2049
- " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - layer-expansion A3 (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n",
2050
- " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n",
2051
- " --trainer.devices=\"{GPU_DEVICES}\" \\\n",
2052
- " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-layer-expansion-a3/\" \\\n",
2053
- " --model.load_model=\"../model/{FILENAME_PREFIX}-split-2a.pth\" \\\n",
2054
- " --model.ctx_len=4096 \\\n",
2055
- " --model.bptt_learning_range=1"
2056
- ]
2057
- },
2058
- {
2059
- "cell_type": "code",
2060
- "execution_count": 6,
2061
- "id": "53867c42",
2062
- "metadata": {
2063
- "execution": {
2064
- "iopub.execute_input": "2023-10-11T08:03:46.969471Z",
2065
- "iopub.status.busy": "2023-10-11T08:03:46.969019Z",
2066
- "iopub.status.idle": "2023-10-11T08:03:50.682437Z",
2067
- "shell.execute_reply": "2023-10-11T08:03:50.680986Z"
2068
- },
2069
- "papermill": {
2070
- "duration": 3.732808,
2071
- "end_time": "2023-10-11T08:03:50.685581",
2072
- "exception": false,
2073
- "start_time": "2023-10-11T08:03:46.952773",
2074
- "status": "completed"
2075
- },
2076
- "tags": []
2077
- },
2078
- "outputs": [
2079
- {
2080
- "name": "stdout",
2081
- "output_type": "stream",
2082
- "text": [
2083
- "[2023-10-11 08:03:49,278] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n"
2084
- ]
2085
- },
2086
- {
2087
- "name": "stdout",
2088
- "output_type": "stream",
2089
- "text": [
2090
- "Traceback (most recent call last):\r\n",
2091
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 651, in <module>\r\n",
2092
- " convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, output_file, save_dtype=args.dtype)\r\n",
2093
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 542, in convert_zero_checkpoint_to_fp32_state_dict\r\n",
2094
- " state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)\r\n",
2095
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 516, in get_fp32_state_dict_from_zero_checkpoint\r\n",
2096
- " raise ValueError(f\"Unable to find 'latest' file at {latest_path}\")\r\n",
2097
- "ValueError: Unable to find 'latest' file at ../checkpoint/v5-L6+6-D2048-E0_01-layer-expansion-a3/last.ckpt/latest\r\n"
2098
- ]
2099
- },
2100
- {
2101
- "name": "stdout",
2102
- "output_type": "stream",
2103
- "text": [
2104
- "ls: cannot access '../model/v5-L6+6-D2048-E0_01-layer-expansion-a3.pth': No such file or directory\r\n"
2105
- ]
2106
- }
2107
- ],
2108
- "source": [
2109
- "# Lets export the model from the checkpoint\n",
2110
- "!cd \"{TRAINER_DIR}\" && \\\n",
2111
- " python3 export_checkpoint.py \"../checkpoint/{FILENAME_PREFIX}-layer-expansion-a3/last.ckpt\" \"../model/{FILENAME_PREFIX}-layer-expansion-a3.pth\" \"bf16\"\n",
2112
- "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-layer-expansion-a3.pth\""
2113
- ]
2114
- },
2115
- {
2116
- "cell_type": "code",
2117
- "execution_count": 7,
2118
- "id": "5688e577",
2119
- "metadata": {
2120
- "execution": {
2121
- "iopub.execute_input": "2023-10-11T08:03:50.806267Z",
2122
- "iopub.status.busy": "2023-10-11T08:03:50.804997Z",
2123
- "iopub.status.idle": "2023-10-11T08:03:56.788036Z",
2124
- "shell.execute_reply": "2023-10-11T08:03:56.786568Z"
2125
- },
2126
- "papermill": {
2127
- "duration": 6.08675,
2128
- "end_time": "2023-10-11T08:03:56.790510",
2129
- "exception": false,
2130
- "start_time": "2023-10-11T08:03:50.703760",
2131
- "status": "completed"
2132
- },
2133
- "tags": []
2134
- },
2135
- "outputs": [
2136
- {
2137
- "name": "stdout",
2138
- "output_type": "stream",
2139
- "text": [
2140
- "[2023-10-11 08:03:54,934] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n"
2141
- ]
2142
- },
2143
- {
2144
- "name": "stdout",
2145
- "output_type": "stream",
2146
- "text": [
2147
- "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n"
2148
- ]
2149
- },
2150
- {
2151
- "name": "stdout",
2152
- "output_type": "stream",
2153
- "text": [
2154
- "Traceback (most recent call last):\r\n",
2155
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/dragon_test.py\", line 52, in <module>\r\n",
2156
- " model = SimpleRWKV(MODEL_PATH, device=DEVICE)\r\n",
2157
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 1420, in __init__\r\n",
2158
- " self.model = RWKV(**model_config)\r\n",
2159
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 566, in __init__\r\n",
2160
- " raise ValueError(f\"load_model file '{load_model}' does not exist\")\r\n",
2161
- "ValueError: load_model file '../model/v5-L6+6-D2048-E0_01-layer-expansion-a3.pth' does not exist\r\n"
2162
- ]
2163
- }
2164
- ],
2165
- "source": [
2166
- "# # Lets do a quick dragon prompt validation\n",
2167
- "!cd \"{INFERENCE_DIR}\" && \\\n",
2168
- " python3 dragon_test.py \"../model/{FILENAME_PREFIX}-layer-expansion-a3.pth\" \"cuda fp32\""
2169
- ]
2170
- },
2171
- {
2172
- "cell_type": "markdown",
2173
- "id": "b4927e87",
2174
- "metadata": {
2175
- "papermill": {
2176
- "duration": 0.015295,
2177
- "end_time": "2023-10-11T08:03:56.820640",
2178
- "exception": false,
2179
- "start_time": "2023-10-11T08:03:56.805345",
2180
- "status": "completed"
2181
- },
2182
- "tags": []
2183
- },
2184
- "source": [
2185
- "## Enwiki Stage 3 : Split-Baseline-B training"
2186
- ]
2187
- },
2188
- {
2189
- "cell_type": "code",
2190
- "execution_count": 8,
2191
- "id": "6bdd285a",
2192
- "metadata": {
2193
- "execution": {
2194
- "iopub.execute_input": "2023-10-11T08:03:56.853495Z",
2195
- "iopub.status.busy": "2023-10-11T08:03:56.852946Z",
2196
- "iopub.status.idle": "2023-10-11T08:04:11.500794Z",
2197
- "shell.execute_reply": "2023-10-11T08:04:11.499336Z"
2198
- },
2199
- "papermill": {
2200
- "duration": 14.668001,
2201
- "end_time": "2023-10-11T08:04:11.503644",
2202
- "exception": false,
2203
- "start_time": "2023-10-11T08:03:56.835643",
2204
- "status": "completed"
2205
- },
2206
- "tags": []
2207
- },
2208
- "outputs": [
2209
- {
2210
- "name": "stdout",
2211
- "output_type": "stream",
2212
- "text": [
2213
- "[2023-10-11 08:04:01,096] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n"
2214
- ]
2215
- },
2216
- {
2217
- "name": "stdout",
2218
- "output_type": "stream",
2219
- "text": [
2220
- "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n"
2221
- ]
2222
- },
2223
- {
2224
- "name": "stdout",
2225
- "output_type": "stream",
2226
- "text": [
2227
- "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py:484: UserWarning: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/enwiki-4k-part3.yaml', '--trainer.logger.init_args.name=[Multi-size] v5-L6+6-D2048-E0.01 - layer-expansion B3 (train-ctx=4k, deepspeed_stage_2_offload)', '--trainer.strategy=deepspeed_stage_2_offload', '--trainer.devices=auto', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-L6+6-D2048-E0_01-layer-expansion-b3/', '--model.load_model=../model/v5-L6+6-D2048-E0_01-split-2b.pth', '--model.ctx_len=4096', '--model.bptt_learning_range=1'], args=['fit', '-c', '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/enwiki-4k-part3.yaml', '--trainer.logger.init_args.name=[Multi-size] v5-L6+6-D2048-E0.01 - layer-expansion B3 (train-ctx=4k, deepspeed_stage_2_offload)', '--trainer.strategy=deepspeed_stage_2_offload', '--trainer.devices=auto', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-L6+6-D2048-E0_01-layer-expansion-b3/', '--model.load_model=../model/v5-L6+6-D2048-E0_01-split-2b.pth', '--model.ctx_len=4096', '--model.bptt_learning_range=1'].\r\n",
2228
- " rank_zero_warn(\r\n"
2229
- ]
2230
- },
2231
- {
2232
- "name": "stdout",
2233
- "output_type": "stream",
2234
- "text": [
2235
- "/usr/local/lib/python3.10/dist-packages/lightning/fabric/utilities/seed.py:39: UserWarning: No seed found, seed set to 1732922148\r\n",
2236
- " rank_zero_warn(f\"No seed found, seed set to {seed}\")\r\n",
2237
- "Global seed set to 1732922148\r\n"
2238
- ]
2239
- },
2240
- {
2241
- "name": "stdout",
2242
- "output_type": "stream",
2243
- "text": [
2244
- "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mpicocreator\u001b[0m (\u001b[33mrwkv-x-dev\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\r\n"
2245
- ]
2246
- },
2247
- {
2248
- "name": "stdout",
2249
- "output_type": "stream",
2250
- "text": [
2251
- "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.15.12\r\n",
2252
- "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m./wandb/run-20231011_080403-88lcuk7j\u001b[0m\r\n",
2253
- "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\r\n",
2254
- "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33m[Multi-size] v5-L6+6-D2048-E0.01 - layer-expansion B3 (train-ctx=4k, deepspeed_stage_2_offload)\u001b[0m\r\n",
2255
- "\u001b[34m\u001b[1mwandb\u001b[0m: ⭐️ View project at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments\u001b[0m\r\n",
2256
- "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/runs/88lcuk7j\u001b[0m\r\n"
2257
- ]
2258
- },
2259
- {
2260
- "name": "stdout",
2261
- "output_type": "stream",
2262
- "text": [
2263
- "Traceback (most recent call last):\r\n",
2264
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 278, in <module>\r\n",
2265
- " cli_main()\r\n",
2266
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 253, in cli_main\r\n",
2267
- " LightningCLI(\r\n",
2268
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 350, in __init__\r\n",
2269
- " self.instantiate_classes()\r\n",
2270
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 499, in instantiate_classes\r\n",
2271
- " self.config_init = self.parser.instantiate_classes(self.config)\r\n",
2272
- " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_deprecated.py\", line 139, in patched_instantiate_classes\r\n",
2273
- " cfg = self._unpatched_instantiate_classes(cfg, **kwargs)\r\n",
2274
- " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_core.py\", line 1130, in instantiate_classes\r\n",
2275
- " cfg[subcommand] = subparser.instantiate_classes(cfg[subcommand], instantiate_groups=instantiate_groups)\r\n",
2276
- " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_deprecated.py\", line 139, in patched_instantiate_classes\r\n",
2277
- " cfg = self._unpatched_instantiate_classes(cfg, **kwargs)\r\n",
2278
- " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_core.py\", line 1124, in instantiate_classes\r\n",
2279
- " component.instantiate_class(component, cfg)\r\n",
2280
- " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_signatures.py\", line 561, in group_instantiate_class\r\n",
2281
- " parent[key] = group.group_class(**value)\r\n",
2282
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 566, in __init__\r\n",
2283
- " raise ValueError(f\"load_model file '{load_model}' does not exist\")\r\n",
2284
- "ValueError: load_model file '../model/v5-L6+6-D2048-E0_01-split-2b.pth' does not exist\r\n"
2285
- ]
2286
- },
2287
- {
2288
- "name": "stdout",
2289
- "output_type": "stream",
2290
- "text": [
2291
- "\u001b[34m\u001b[1mwandb\u001b[0m: Waiting for W&B process to finish... \u001b[31m(failed 1).\u001b[0m Press Control-C to abort syncing.\r\n"
2292
- ]
2293
- },
2294
- {
2295
- "name": "stdout",
2296
- "output_type": "stream",
2297
- "text": [
2298
- "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run \u001b[33m[Multi-size] v5-L6+6-D2048-E0.01 - layer-expansion B3 (train-ctx=4k, deepspeed_stage_2_offload)\u001b[0m at: \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/runs/88lcuk7j\u001b[0m\r\n",
2299
- "\u001b[34m\u001b[1mwandb\u001b[0m: ️⚡ View job at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjk0OTk4MDcy/version_details/v16\u001b[0m\r\n",
2300
- "\u001b[34m\u001b[1mwandb\u001b[0m: Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)\r\n",
2301
- "\u001b[34m\u001b[1mwandb\u001b[0m: Find logs at: \u001b[35m\u001b[1m./wandb/run-20231011_080403-88lcuk7j/logs\u001b[0m\r\n"
2302
- ]
2303
- }
2304
- ],
2305
- "source": [
2306
- "# Start the foundation model training\n",
2307
- "!cd \"{TRAINER_DIR}\" && \\\n",
2308
- " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n",
2309
- " python3 lightning_trainer.py fit \\\n",
2310
- " -c \"{NOTEBOOK_DIR}/enwiki-4k-part3.yaml\" \\\n",
2311
- " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - layer-expansion B3 (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n",
2312
- " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n",
2313
- " --trainer.devices=\"{GPU_DEVICES}\" \\\n",
2314
- " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-layer-expansion-b3/\" \\\n",
2315
- " --model.load_model=\"../model/{FILENAME_PREFIX}-split-2b.pth\" \\\n",
2316
- " --model.ctx_len=4096 \\\n",
2317
- " --model.bptt_learning_range=1"
2318
- ]
2319
- },
2320
- {
2321
- "cell_type": "code",
2322
- "execution_count": 9,
2323
- "id": "ae4623a1",
2324
- "metadata": {
2325
- "execution": {
2326
- "iopub.execute_input": "2023-10-11T08:04:11.546046Z",
2327
- "iopub.status.busy": "2023-10-11T08:04:11.544870Z",
2328
- "iopub.status.idle": "2023-10-11T08:04:15.274349Z",
2329
- "shell.execute_reply": "2023-10-11T08:04:15.272957Z"
2330
- },
2331
- "papermill": {
2332
- "duration": 3.754115,
2333
- "end_time": "2023-10-11T08:04:15.277163",
2334
- "exception": false,
2335
- "start_time": "2023-10-11T08:04:11.523048",
2336
- "status": "completed"
2337
- },
2338
- "tags": []
2339
- },
2340
- "outputs": [
2341
- {
2342
- "name": "stdout",
2343
- "output_type": "stream",
2344
- "text": [
2345
- "[2023-10-11 08:04:13,869] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n"
2346
- ]
2347
- },
2348
- {
2349
- "name": "stdout",
2350
- "output_type": "stream",
2351
- "text": [
2352
- "Traceback (most recent call last):\r\n",
2353
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 651, in <module>\r\n",
2354
- " convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, output_file, save_dtype=args.dtype)\r\n",
2355
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 542, in convert_zero_checkpoint_to_fp32_state_dict\r\n",
2356
- " state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)\r\n",
2357
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 516, in get_fp32_state_dict_from_zero_checkpoint\r\n",
2358
- " raise ValueError(f\"Unable to find 'latest' file at {latest_path}\")\r\n",
2359
- "ValueError: Unable to find 'latest' file at ../checkpoint/v5-L6+6-D2048-E0_01-layer-expansion-b3/last.ckpt/latest\r\n"
2360
- ]
2361
- },
2362
- {
2363
- "name": "stdout",
2364
- "output_type": "stream",
2365
- "text": [
2366
- "ls: cannot access '../model/v5-L6+6-D2048-E0_01-layer-expansion-b3.pth': No such file or directory\r\n"
2367
- ]
2368
- }
2369
- ],
2370
- "source": [
2371
- "# Lets export the model from the checkpoint\n",
2372
- "!cd \"{TRAINER_DIR}\" && \\\n",
2373
- " python3 export_checkpoint.py \"../checkpoint/{FILENAME_PREFIX}-layer-expansion-b3/last.ckpt\" \"../model/{FILENAME_PREFIX}-layer-expansion-b3.pth\" \"bf16\"\n",
2374
- "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-layer-expansion-b3.pth\""
2375
- ]
2376
- },
2377
- {
2378
- "cell_type": "code",
2379
- "execution_count": 10,
2380
- "id": "8e1b1152",
2381
- "metadata": {
2382
- "execution": {
2383
- "iopub.execute_input": "2023-10-11T08:04:15.319747Z",
2384
- "iopub.status.busy": "2023-10-11T08:04:15.318636Z",
2385
- "iopub.status.idle": "2023-10-11T08:04:21.268526Z",
2386
- "shell.execute_reply": "2023-10-11T08:04:21.267073Z"
2387
- },
2388
- "papermill": {
2389
- "duration": 5.974644,
2390
- "end_time": "2023-10-11T08:04:21.271495",
2391
- "exception": false,
2392
- "start_time": "2023-10-11T08:04:15.296851",
2393
- "status": "completed"
2394
- },
2395
- "tags": []
2396
- },
2397
- "outputs": [
2398
- {
2399
- "name": "stdout",
2400
- "output_type": "stream",
2401
- "text": [
2402
- "[2023-10-11 08:04:19,430] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n"
2403
- ]
2404
- },
2405
- {
2406
- "name": "stdout",
2407
- "output_type": "stream",
2408
- "text": [
2409
- "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n",
2410
- "Traceback (most recent call last):\r\n",
2411
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/dragon_test.py\", line 52, in <module>\r\n",
2412
- " model = SimpleRWKV(MODEL_PATH, device=DEVICE)\r\n",
2413
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 1420, in __init__\r\n",
2414
- " self.model = RWKV(**model_config)\r\n",
2415
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 566, in __init__\r\n",
2416
- " raise ValueError(f\"load_model file '{load_model}' does not exist\")\r\n",
2417
- "ValueError: load_model file '../model/v5-L6+6-D2048-E0_01-layer-expansion-b3.pth' does not exist\r\n"
2418
- ]
2419
- }
2420
- ],
2421
- "source": [
2422
- "# # Lets do a quick dragon prompt validation\n",
2423
- "!cd \"{INFERENCE_DIR}\" && \\\n",
2424
- " python3 dragon_test.py \"../model/{FILENAME_PREFIX}-layer-expansion-b3.pth\" \"cuda fp32\""
2425
- ]
2426
- }
2427
- ],
2428
- "metadata": {
2429
- "kernelspec": {
2430
- "display_name": "Python 3 (ipykernel)",
2431
- "language": "python",
2432
- "name": "python3"
2433
- },
2434
- "language_info": {
2435
- "codemirror_mode": {
2436
- "name": "ipython",
2437
- "version": 3
2438
- },
2439
- "file_extension": ".py",
2440
- "mimetype": "text/x-python",
2441
- "name": "python",
2442
- "nbconvert_exporter": "python",
2443
- "pygments_lexer": "ipython3",
2444
- "version": "3.10.12"
2445
- },
2446
- "papermill": {
2447
- "default_parameters": {},
2448
- "duration": 119.315066,
2449
- "end_time": "2023-10-11T08:04:21.714050",
2450
- "environment_variables": {},
2451
- "exception": null,
2452
- "input_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/v5-L6+6-D2048-layer-baseline.ipynb",
2453
- "output_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/experiment/rwkv-x-exp/multi-size-train/v5-L6+6-D2048-layer-baseline.ipynb",
2454
- "parameters": {},
2455
- "start_time": "2023-10-11T08:02:22.398984",
2456
- "version": "2.4.0"
2457
- }
2458
- },
2459
- "nbformat": 4,
2460
- "nbformat_minor": 5
2461
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
experiment/rwkv-x-exp/multi-size-train/v5-L6+6-D2048-layer-expansion.ipynb DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b0aa2c37ab25e53ed3e45a9e7b5b09d1ac2d2f627412df5c98cc1f113838d800
3
- size 15734950
 
 
 
 
experiment/rwkv-x-exp/multi-size-train/v5-L6+6-D2048-overwrite-naive.ipynb DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d643e2a64a0f7323eb7b14b90ce5a0e5457818349c75e666dbf52b7319f5de72
3
- size 15733849
 
 
 
 
experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-baseline-p2.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:849b57b4d493d40313ef04b30ffc22ec6f5cb99e05225615ee0cb00acb78a95d
3
- size 1066537077
 
 
 
 
experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-baseline-p3.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8413565273ef40f61db246dcbf793e045b39d1163e18885441be5a16d733f34c
3
- size 1066537077
 
 
 
 
experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-enwiki-4k-p1.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:235d88b0aa939596392f2b5734a426940535816aa13106498974a809051a4c75
3
- size 1066537217
 
 
 
 
experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-layer-expansion-a3.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:1afd8d92632792f498805ac222d159524badf4ecbcaaae597060b6bb87a53110
3
- size 1066538057
 
 
 
 
experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-layer-expansion-b3.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9e61d8f8901d1eb50759f0242e2886678ed24b9931295a270b14120ba74cb5c3
3
- size 1066538057
 
 
 
 
experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-neox-v5base-init.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c2d60ede71bc384ee4eff0a591b3fa57dd670c27e5e8ce5eadf25a7f0d7e226d
3
- size 1066538337
 
 
 
 
experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-split-2a.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2f52085cee9c3db4bb079dc44edf50b0a19c170bd92128e918e6203efef83cea
3
- size 1066536657
 
 
 
 
experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-split-2b.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6b64a1018631b9ddd15a746002bab3eafe956dced78a91af7abcdadaae4a7b25
3
- size 1066536657
 
 
 
 
experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-split-merge-2m.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f10f8f00c42b6408db81a3b26d53411c41edc7f23f5097ac095ad3096d6c5dc1
3
- size 1066537497
 
 
 
 
experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-split-merge-p3.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6f07a8414cd0cd1c3df705dff8a0f2142231171ee52a94d12c55dfe7c888fef7
3
- size 1066537497
 
 
 
 
experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-baseline.ipynb DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0fccffc430231ad06fdb02a7e50ea57acfbeae3c42a97b018f62f937d30736e4
3
- size 16519239
 
 
 
 
experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-part1.ipynb DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7260b3fe80de461d6dc923b21af87361f71e26a4a7191d51dd9665403728ddfa
3
- size 15732960
 
 
 
 
experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-split-baseline.ipynb DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f95adf89d498a4dd58af22ba192b2fd4d08ceec250784c7e9f6f9b8de0fed2bc
3
- size 15855123
 
 
 
 
experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-split-train.ipynb DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c308e5ae9f8fde5fd24cafccf60917dca9c97fc2e0a5fbcfa01027d6d50e927d
3
- size 16623766
 
 
 
 
experiment/rwkv-x-exp/v5-memory/v5-L96-D1024-E0_1-mem-ctx-8k.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d042262601b79b1635bdd82e73c9f26fb35b05d8cded92a03aad5df56944dde
3
- size 2825976699
 
 
 
 
experiment/rwkv-x-exp/v5-memory/v5-L96-D1024-E1e-1-ctx4k-part5.ipynb CHANGED
The diff for this file is too large to render. See raw diff