[GHA] experiment/rwkv-x-exp/multi-size-train/v5-L6+6-D2048-layer-baseline.ipynb result notebook & reports

#197
experiment/rwkv-x-exp/multi-size-train/v5-L6+6-D2048-layer-baseline.ipynb ADDED
@@ -0,0 +1,893 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "attachments": {},
5
+ "cell_type": "markdown",
6
+ "id": "c4287c8b",
7
+ "metadata": {
8
+ "papermill": {
9
+ "duration": 0.004479,
10
+ "end_time": "2023-10-11T07:03:00.576797",
11
+ "exception": false,
12
+ "start_time": "2023-10-11T07:03:00.572318",
13
+ "status": "completed"
14
+ },
15
+ "tags": []
16
+ },
17
+ "source": [
18
+ "# RWKV v5 multi-size training experiment\n",
19
+ "\n",
20
+ "**Note:** This project assumes you have the rwkv-infctx conda env set up"
21
+ ]
22
+ },
23
+ {
24
+ "attachments": {},
25
+ "cell_type": "markdown",
26
+ "id": "5202003b",
27
+ "metadata": {
28
+ "papermill": {
29
+ "duration": 0.002433,
30
+ "end_time": "2023-10-11T07:03:00.582179",
31
+ "exception": false,
32
+ "start_time": "2023-10-11T07:03:00.579746",
33
+ "status": "completed"
34
+ },
35
+ "tags": []
36
+ },
37
+ "source": [
38
+ "# Basic Setup"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": 1,
44
+ "id": "fc416bc2",
45
+ "metadata": {
46
+ "execution": {
47
+ "iopub.execute_input": "2023-10-11T07:03:00.589288Z",
48
+ "iopub.status.busy": "2023-10-11T07:03:00.588784Z",
49
+ "iopub.status.idle": "2023-10-11T07:03:01.346279Z",
50
+ "shell.execute_reply": "2023-10-11T07:03:01.345465Z"
51
+ },
52
+ "papermill": {
53
+ "duration": 0.763505,
54
+ "end_time": "2023-10-11T07:03:01.348385",
55
+ "exception": false,
56
+ "start_time": "2023-10-11T07:03:00.584880",
57
+ "status": "completed"
58
+ },
59
+ "tags": []
60
+ },
61
+ "outputs": [],
62
+ "source": [
63
+ "# First, let's set up the various directories and init the model\n",
64
+ "!mkdir -p ../../../../model/\n",
65
+ "!mkdir -p ../../../../datapath/\n",
66
+ "!mkdir -p ../../../../checkpoint/"
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "code",
71
+ "execution_count": 2,
72
+ "id": "a9dea07f",
73
+ "metadata": {
74
+ "execution": {
75
+ "iopub.execute_input": "2023-10-11T07:03:01.355823Z",
76
+ "iopub.status.busy": "2023-10-11T07:03:01.355275Z",
77
+ "iopub.status.idle": "2023-10-11T07:03:01.363208Z",
78
+ "shell.execute_reply": "2023-10-11T07:03:01.362389Z"
79
+ },
80
+ "papermill": {
81
+ "duration": 0.013626,
82
+ "end_time": "2023-10-11T07:03:01.364896",
83
+ "exception": false,
84
+ "start_time": "2023-10-11T07:03:01.351270",
85
+ "status": "completed"
86
+ },
87
+ "tags": []
88
+ },
89
+ "outputs": [
90
+ {
91
+ "name": "stdout",
92
+ "output_type": "stream",
93
+ "text": [
94
+ "DEEPSPEED_STRAT: deepspeed_stage_2_offload\n",
95
+ "ENABLE_WANDB: True\n",
96
+ "GPU_DEVICES: auto\n",
97
+ "NOTEBOOK_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train\n",
98
+ "INFERENCE_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5\n",
99
+ "TRAINER_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5\n",
100
+ "PROJECT_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer\n"
101
+ ]
102
+ }
103
+ ],
104
+ "source": [
105
+ "DEEPSPEED_STRAT=\"deepspeed_stage_2_offload\"\n",
106
+ "GPU_DEVICES=\"auto\"\n",
107
+ "ENABLE_WANDB=True\n",
108
+ "\n",
109
+ "EMBED_SCALE=0.01\n",
110
+ "EMBED_SCALE_LABEL=str(EMBED_SCALE).replace(\".\", \"_\")\n",
111
+ "\n",
112
+ "EMBED_SIZE=2048\n",
113
+ "\n",
114
+ "WANDB_PREFIX=f\"[Multi-size] v5-L6+6-D{EMBED_SIZE}-E{EMBED_SCALE}\"\n",
115
+ "FILENAME_PREFIX=f\"v5-L6+6-D{EMBED_SIZE}-E{EMBED_SCALE_LABEL}\"\n",
116
+ "\n",
117
+ "print(\"DEEPSPEED_STRAT:\", DEEPSPEED_STRAT)\n",
118
+ "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n",
119
+ "print(\"GPU_DEVICES:\", GPU_DEVICES)\n",
120
+ "\n",
121
+ "if ENABLE_WANDB:\n",
122
+ " WANDB_MODE=\"online\"\n",
123
+ "else:\n",
124
+ " WANDB_MODE=\"disabled\"\n",
125
+ "\n",
126
+ "# Compute the notebook directory and the various derived paths\n",
127
+ "import os\n",
128
+ "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n",
129
+ "PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"../../../../\"))\n",
130
+ "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n",
131
+ "INFERENCE_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n",
132
+ "\n",
133
+ "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n",
134
+ "print(\"INFERENCE_DIR:\", INFERENCE_DIR)\n",
135
+ "print(\"TRAINER_DIR:\", TRAINER_DIR)\n",
136
+ "print(\"PROJECT_DIR:\", PROJECT_DIR)"
137
+ ]
138
+ },
139
+ {
140
+ "cell_type": "code",
141
+ "execution_count": 3,
142
+ "id": "bcb68665",
143
+ "metadata": {
144
+ "execution": {
145
+ "iopub.execute_input": "2023-10-11T07:03:01.372738Z",
146
+ "iopub.status.busy": "2023-10-11T07:03:01.371949Z",
147
+ "iopub.status.idle": "2023-10-11T07:03:02.581837Z",
148
+ "shell.execute_reply": "2023-10-11T07:03:02.580259Z"
149
+ },
150
+ "papermill": {
151
+ "duration": 1.216253,
152
+ "end_time": "2023-10-11T07:03:02.584117",
153
+ "exception": false,
154
+ "start_time": "2023-10-11T07:03:01.367864",
155
+ "status": "completed"
156
+ },
157
+ "tags": []
158
+ },
159
+ "outputs": [
160
+ {
161
+ "name": "stdout",
162
+ "output_type": "stream",
163
+ "text": [
164
+ "--2023-10-11 07:03:01-- https://huggingface.co/rwkv-x-dev/rwkv-x-playground/blob/main/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-split-2a.pth\r\n",
165
+ "Resolving huggingface.co (huggingface.co)... 18.154.227.7, 18.154.227.87, 18.154.227.69, ...\r\n",
166
+ "Connecting to huggingface.co (huggingface.co)|18.154.227.7|:443... connected.\r\n",
167
+ "HTTP request sent, awaiting response... "
168
+ ]
169
+ },
170
+ {
171
+ "name": "stdout",
172
+ "output_type": "stream",
173
+ "text": [
174
+ "200 OK\r\n",
175
+ "Length: 44360 (43K) [text/html]\r\n",
176
+ "Saving to: ‘v5-L6-D2048-E0_01-split-2a.pth’\r\n",
177
+ "\r\n",
178
+ "\r",
179
+ " v5-L6-D20 0%[ ] 0 --.-KB/s \r",
180
+ "v5-L6-D2048-E0_01-s 100%[===================>] 43.32K --.-KB/s in 0.001s \r\n",
181
+ "\r\n",
182
+ "2023-10-11 07:03:01 (46.3 MB/s) - ‘v5-L6-D2048-E0_01-split-2a.pth’ saved [44360/44360]\r\n",
183
+ "\r\n"
184
+ ]
185
+ },
186
+ {
187
+ "name": "stdout",
188
+ "output_type": "stream",
189
+ "text": [
190
+ "--2023-10-11 07:03:02-- https://huggingface.co/rwkv-x-dev/rwkv-x-playground/blob/main/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-split-2b.pth\r\n",
191
+ "Resolving huggingface.co (huggingface.co)... 18.154.227.67, 18.154.227.7, 18.154.227.69, ...\r\n",
192
+ "Connecting to huggingface.co (huggingface.co)|18.154.227.67|:443... connected.\r\n",
193
+ "HTTP request sent, awaiting response... "
194
+ ]
195
+ },
196
+ {
197
+ "name": "stdout",
198
+ "output_type": "stream",
199
+ "text": [
200
+ "200 OK\r\n",
201
+ "Length: 44360 (43K) [text/html]\r\n",
202
+ "Saving to: ‘v5-L6-D2048-E0_01-split-2b.pth’\r\n",
203
+ "\r\n",
204
+ "\r",
205
+ " v5-L6-D20 0%[ ] 0 --.-KB/s \r",
206
+ "v5-L6-D2048-E0_01-s 100%[===================>] 43.32K --.-KB/s in 0s \r\n",
207
+ "\r\n",
208
+ "2023-10-11 07:03:02 (215 MB/s) - ‘v5-L6-D2048-E0_01-split-2b.pth’ saved [44360/44360]\r\n",
209
+ "\r\n"
210
+ ]
211
+ }
212
+ ],
213
+ "source": [
214
+ "# Get the init split model, and finetune from there\n",
215
+ "!cd \"{PROJECT_DIR}/model/\" && wget -nc \"https://huggingface.co/rwkv-x-dev/rwkv-x-playground/blob/main/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-split-2a.pth\"\n",
216
+ "!cd \"{PROJECT_DIR}/model/\" && wget -nc \"https://huggingface.co/rwkv-x-dev/rwkv-x-playground/blob/main/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-split-2b.pth\""
217
+ ]
218
+ },
219
+ {
220
+ "cell_type": "code",
221
+ "execution_count": 4,
222
+ "id": "81e32bce",
223
+ "metadata": {
224
+ "execution": {
225
+ "iopub.execute_input": "2023-10-11T07:03:02.594064Z",
226
+ "iopub.status.busy": "2023-10-11T07:03:02.592869Z",
227
+ "iopub.status.idle": "2023-10-11T07:03:12.284528Z",
228
+ "shell.execute_reply": "2023-10-11T07:03:12.283195Z"
229
+ },
230
+ "papermill": {
231
+ "duration": 9.699478,
232
+ "end_time": "2023-10-11T07:03:12.286978",
233
+ "exception": false,
234
+ "start_time": "2023-10-11T07:03:02.587500",
235
+ "status": "completed"
236
+ },
237
+ "tags": []
238
+ },
239
+ "outputs": [
240
+ {
241
+ "name": "stdout",
242
+ "output_type": "stream",
243
+ "text": [
244
+ "\r",
245
+ "Saving the dataset (0/2 shards): 0%| | 0/27200 [00:00<?, ? examples/s]"
246
+ ]
247
+ },
248
+ {
249
+ "name": "stdout",
250
+ "output_type": "stream",
251
+ "text": [
252
+ "\r",
253
+ "Saving the dataset (0/2 shards): 7%| | 2000/27200 [00:00<00:01, 16059.79 examp"
254
+ ]
255
+ },
256
+ {
257
+ "name": "stdout",
258
+ "output_type": "stream",
259
+ "text": [
260
+ "\r",
261
+ "Saving the dataset (0/2 shards): 15%|▏| 4000/27200 [00:00<00:01, 17165.62 examp"
262
+ ]
263
+ },
264
+ {
265
+ "name": "stdout",
266
+ "output_type": "stream",
267
+ "text": [
268
+ "\r",
269
+ "Saving the dataset (0/2 shards): 22%|▏| 6000/27200 [00:00<00:01, 17873.84 examp"
270
+ ]
271
+ },
272
+ {
273
+ "name": "stdout",
274
+ "output_type": "stream",
275
+ "text": [
276
+ "\r",
277
+ "Saving the dataset (0/2 shards): 29%|▎| 8000/27200 [00:00<00:01, 18442.04 examp"
278
+ ]
279
+ },
280
+ {
281
+ "name": "stdout",
282
+ "output_type": "stream",
283
+ "text": [
284
+ "\r",
285
+ "Saving the dataset (0/2 shards): 40%|▍| 11000/27200 [00:00<00:00, 19214.27 exam"
286
+ ]
287
+ },
288
+ {
289
+ "name": "stdout",
290
+ "output_type": "stream",
291
+ "text": [
292
+ "\r",
293
+ "Saving the dataset (0/2 shards): 50%|▌| 13600/27200 [00:00<00:00, 19584.96 exam\r",
294
+ "Saving the dataset (1/2 shards): 50%|▌| 13600/27200 [00:00<00:00, 19584.96 exam"
295
+ ]
296
+ },
297
+ {
298
+ "name": "stdout",
299
+ "output_type": "stream",
300
+ "text": [
301
+ "\r",
302
+ "Saving the dataset (1/2 shards): 65%|▋| 17600/27200 [00:00<00:00, 20456.63 exam"
303
+ ]
304
+ },
305
+ {
306
+ "name": "stdout",
307
+ "output_type": "stream",
308
+ "text": [
309
+ "\r",
310
+ "Saving the dataset (1/2 shards): 79%|▊| 21600/27200 [00:01<00:00, 21374.91 exam"
311
+ ]
312
+ },
313
+ {
314
+ "name": "stdout",
315
+ "output_type": "stream",
316
+ "text": [
317
+ "\r",
318
+ "Saving the dataset (1/2 shards): 94%|▉| 25600/27200 [00:01<00:00, 21799.50 exam"
319
+ ]
320
+ },
321
+ {
322
+ "name": "stdout",
323
+ "output_type": "stream",
324
+ "text": [
325
+ "\r",
326
+ "Saving the dataset (2/2 shards): 100%|█| 27200/27200 [00:01<00:00, 21799.50 exam\r",
327
+ "Saving the dataset (2/2 shards): 100%|█| 27200/27200 [00:01<00:00, 20461.58 exam\r\n"
328
+ ]
329
+ },
330
+ {
331
+ "name": "stdout",
332
+ "output_type": "stream",
333
+ "text": [
334
+ "\r",
335
+ "Saving the dataset (0/1 shards): 0%| | 0/109 [00:00<?, ? examples/s]\r",
336
+ "Saving the dataset (1/1 shards): 100%|█| 109/109 [00:00<00:00, 7981.06 examples/\r",
337
+ "Saving the dataset (1/1 shards): 100%|█| 109/109 [00:00<00:00, 7683.94 examples/\r\n"
338
+ ]
339
+ }
340
+ ],
341
+ "source": [
342
+ "# Let's preload the required datasets\n",
343
+ "!cd \"{TRAINER_DIR}\" && \\\n",
344
+ " python3 preload_datapath.py \"{NOTEBOOK_DIR}/enwiki-4k-part3.yaml\""
345
+ ]
346
+ },
347
+ {
348
+ "cell_type": "markdown",
349
+ "id": "5bf12f9e",
350
+ "metadata": {
351
+ "papermill": {
352
+ "duration": 0.00368,
353
+ "end_time": "2023-10-11T07:03:12.294651",
354
+ "exception": false,
355
+ "start_time": "2023-10-11T07:03:12.290971",
356
+ "status": "completed"
357
+ },
358
+ "tags": []
359
+ },
360
+ "source": [
361
+ "## Enwiki Stage 3 : Split-Baseline-A training"
362
+ ]
363
+ },
364
+ {
365
+ "cell_type": "code",
366
+ "execution_count": 5,
367
+ "id": "c1566af8",
368
+ "metadata": {
369
+ "execution": {
370
+ "iopub.execute_input": "2023-10-11T07:03:12.305523Z",
371
+ "iopub.status.busy": "2023-10-11T07:03:12.304583Z",
372
+ "iopub.status.idle": "2023-10-11T07:03:29.051517Z",
373
+ "shell.execute_reply": "2023-10-11T07:03:29.049989Z"
374
+ },
375
+ "papermill": {
376
+ "duration": 16.755429,
377
+ "end_time": "2023-10-11T07:03:29.053992",
378
+ "exception": false,
379
+ "start_time": "2023-10-11T07:03:12.298563",
380
+ "status": "completed"
381
+ },
382
+ "tags": []
383
+ },
384
+ "outputs": [
385
+ {
386
+ "name": "stdout",
387
+ "output_type": "stream",
388
+ "text": [
389
+ "[2023-10-11 07:03:16,508] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n"
390
+ ]
391
+ },
392
+ {
393
+ "name": "stdout",
394
+ "output_type": "stream",
395
+ "text": [
396
+ "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n"
397
+ ]
398
+ },
399
+ {
400
+ "name": "stdout",
401
+ "output_type": "stream",
402
+ "text": [
403
+ "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py:484: UserWarning: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/enwiki-4k-part3.yaml', '--trainer.logger.init_args.name=[Multi-size] v5-L6+6-D2048-E0.01 - layer-expansion A3 (train-ctx=4k, deepspeed_stage_2_offload)', '--trainer.strategy=deepspeed_stage_2_offload', '--trainer.devices=auto', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-L6+6-D2048-E0_01-layer-expansion-a3/', '--model.load_model=../model/v5-L6+6-D2048-E0_01-split-2a.pth', '--model.ctx_len=4096', '--model.bptt_learning_range=1'], args=['fit', '-c', '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/enwiki-4k-part3.yaml', '--trainer.logger.init_args.name=[Multi-size] v5-L6+6-D2048-E0.01 - layer-expansion A3 (train-ctx=4k, deepspeed_stage_2_offload)', '--trainer.strategy=deepspeed_stage_2_offload', '--trainer.devices=auto', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-L6+6-D2048-E0_01-layer-expansion-a3/', '--model.load_model=../model/v5-L6+6-D2048-E0_01-split-2a.pth', '--model.ctx_len=4096', '--model.bptt_learning_range=1'].\r\n",
404
+ " rank_zero_warn(\r\n"
405
+ ]
406
+ },
407
+ {
408
+ "name": "stdout",
409
+ "output_type": "stream",
410
+ "text": [
411
+ "/usr/local/lib/python3.10/dist-packages/lightning/fabric/utilities/seed.py:39: UserWarning: No seed found, seed set to 3015291597\r\n",
412
+ " rank_zero_warn(f\"No seed found, seed set to {seed}\")\r\n",
413
+ "Global seed set to 3015291597\r\n"
414
+ ]
415
+ },
416
+ {
417
+ "name": "stdout",
418
+ "output_type": "stream",
419
+ "text": [
420
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mpicocreator\u001b[0m (\u001b[33mrwkv-x-dev\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\r\n"
421
+ ]
422
+ },
423
+ {
424
+ "name": "stdout",
425
+ "output_type": "stream",
426
+ "text": [
427
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.15.12\r\n",
428
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m./wandb/run-20231011_070319-yi5f0p8v\u001b[0m\r\n",
429
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\r\n",
430
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33m[Multi-size] v5-L6+6-D2048-E0.01 - layer-expansion A3 (train-ctx=4k, deepspeed_stage_2_offload)\u001b[0m\r\n",
431
+ "\u001b[34m\u001b[1mwandb\u001b[0m: ⭐️ View project at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments\u001b[0m\r\n",
432
+ "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/runs/yi5f0p8v\u001b[0m\r\n"
433
+ ]
434
+ },
435
+ {
436
+ "name": "stdout",
437
+ "output_type": "stream",
438
+ "text": [
439
+ "Traceback (most recent call last):\r\n",
440
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 278, in <module>\r\n",
441
+ " cli_main()\r\n",
442
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 253, in cli_main\r\n",
443
+ " LightningCLI(\r\n",
444
+ " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 350, in __init__\r\n",
445
+ " self.instantiate_classes()\r\n",
446
+ " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 499, in instantiate_classes\r\n",
447
+ " self.config_init = self.parser.instantiate_classes(self.config)\r\n",
448
+ " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_deprecated.py\", line 139, in patched_instantiate_classes\r\n",
449
+ " cfg = self._unpatched_instantiate_classes(cfg, **kwargs)\r\n",
450
+ " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_core.py\", line 1130, in instantiate_classes\r\n",
451
+ " cfg[subcommand] = subparser.instantiate_classes(cfg[subcommand], instantiate_groups=instantiate_groups)\r\n",
452
+ " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_deprecated.py\", line 139, in patched_instantiate_classes\r\n",
453
+ " cfg = self._unpatched_instantiate_classes(cfg, **kwargs)\r\n",
454
+ " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_core.py\", line 1124, in instantiate_classes\r\n",
455
+ " component.instantiate_class(component, cfg)\r\n",
456
+ " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_signatures.py\", line 561, in group_instantiate_class\r\n",
457
+ " parent[key] = group.group_class(**value)\r\n",
458
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 566, in __init__\r\n",
459
+ " raise ValueError(f\"load_model file '{load_model}' does not exist\")\r\n",
460
+ "ValueError: load_model file '../model/v5-L6+6-D2048-E0_01-split-2a.pth' does not exist\r\n"
461
+ ]
462
+ },
463
+ {
464
+ "name": "stdout",
465
+ "output_type": "stream",
466
+ "text": [
467
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Waiting for W&B process to finish... \u001b[31m(failed 1).\u001b[0m Press Control-C to abort syncing.\r\n"
468
+ ]
469
+ },
470
+ {
471
+ "name": "stdout",
472
+ "output_type": "stream",
473
+ "text": [
474
+ "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run \u001b[33m[Multi-size] v5-L6+6-D2048-E0.01 - layer-expansion A3 (train-ctx=4k, deepspeed_stage_2_offload)\u001b[0m at: \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/runs/yi5f0p8v\u001b[0m\r\n",
475
+ "\u001b[34m\u001b[1mwandb\u001b[0m: ️⚡ View job at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjk0OTk4MDcy/version_details/v15\u001b[0m\r\n",
476
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Synced 5 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)\r\n",
477
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Find logs at: \u001b[35m\u001b[1m./wandb/run-20231011_070319-yi5f0p8v/logs\u001b[0m\r\n"
478
+ ]
479
+ }
480
+ ],
481
+ "source": [
482
+ "# Start the foundation model training\n",
483
+ "!cd \"{TRAINER_DIR}\" && \\\n",
484
+ " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n",
485
+ " python3 lightning_trainer.py fit \\\n",
486
+ " -c \"{NOTEBOOK_DIR}/enwiki-4k-part3.yaml\" \\\n",
487
+ " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - layer-expansion A3 (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n",
488
+ " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n",
489
+ " --trainer.devices=\"{GPU_DEVICES}\" \\\n",
490
+ " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-layer-expansion-a3/\" \\\n",
491
+ " --model.load_model=\"../model/{FILENAME_PREFIX}-split-2a.pth\" \\\n",
492
+ " --model.ctx_len=4096 \\\n",
493
+ " --model.bptt_learning_range=1"
494
+ ]
495
+ },
496
+ {
497
+ "cell_type": "code",
498
+ "execution_count": 6,
499
+ "id": "d56d6a30",
500
+ "metadata": {
501
+ "execution": {
502
+ "iopub.execute_input": "2023-10-11T07:03:29.066944Z",
503
+ "iopub.status.busy": "2023-10-11T07:03:29.065449Z",
504
+ "iopub.status.idle": "2023-10-11T07:03:32.793675Z",
505
+ "shell.execute_reply": "2023-10-11T07:03:32.792045Z"
506
+ },
507
+ "papermill": {
508
+ "duration": 3.737581,
509
+ "end_time": "2023-10-11T07:03:32.796266",
510
+ "exception": false,
511
+ "start_time": "2023-10-11T07:03:29.058685",
512
+ "status": "completed"
513
+ },
514
+ "tags": []
515
+ },
516
+ "outputs": [
517
+ {
518
+ "name": "stdout",
519
+ "output_type": "stream",
520
+ "text": [
521
+ "[2023-10-11 07:03:31,388] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n"
522
+ ]
523
+ },
524
+ {
525
+ "name": "stdout",
526
+ "output_type": "stream",
527
+ "text": [
528
+ "Traceback (most recent call last):\r\n",
529
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 651, in <module>\r\n",
530
+ " convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, output_file, save_dtype=args.dtype)\r\n",
531
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 542, in convert_zero_checkpoint_to_fp32_state_dict\r\n",
532
+ " state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)\r\n",
533
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 516, in get_fp32_state_dict_from_zero_checkpoint\r\n",
534
+ " raise ValueError(f\"Unable to find 'latest' file at {latest_path}\")\r\n",
535
+ "ValueError: Unable to find 'latest' file at ../checkpoint/v5-L6+6-D2048-E0_01-layer-expansion-a3/last.ckpt/latest\r\n"
536
+ ]
537
+ },
538
+ {
539
+ "name": "stdout",
540
+ "output_type": "stream",
541
+ "text": [
542
+ "ls: cannot access '../model/v5-L6+6-D2048-E0_01-layer-expansion-a3.pth': No such file or directory\r\n"
543
+ ]
544
+ }
545
+ ],
546
+ "source": [
547
+ "# Let's export the model from the checkpoint\n",
548
+ "!cd \"{TRAINER_DIR}\" && \\\n",
549
+ " python3 export_checkpoint.py \"../checkpoint/{FILENAME_PREFIX}-layer-expansion-a3/last.ckpt\" \"../model/{FILENAME_PREFIX}-layer-expansion-a3.pth\" \"bf16\"\n",
550
+ "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-layer-expansion-a3.pth\""
551
+ ]
552
+ },
553
+ {
554
+ "cell_type": "code",
555
+ "execution_count": 7,
556
+ "id": "f4c37d28",
557
+ "metadata": {
558
+ "execution": {
559
+ "iopub.execute_input": "2023-10-11T07:03:32.809297Z",
560
+ "iopub.status.busy": "2023-10-11T07:03:32.808263Z",
561
+ "iopub.status.idle": "2023-10-11T07:03:39.200791Z",
562
+ "shell.execute_reply": "2023-10-11T07:03:39.199726Z"
563
+ },
564
+ "papermill": {
565
+ "duration": 6.402526,
566
+ "end_time": "2023-10-11T07:03:39.203696",
567
+ "exception": false,
568
+ "start_time": "2023-10-11T07:03:32.801170",
569
+ "status": "completed"
570
+ },
571
+ "tags": []
572
+ },
573
+ "outputs": [
574
+ {
575
+ "name": "stdout",
576
+ "output_type": "stream",
577
+ "text": [
578
+ "[2023-10-11 07:03:37,133] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n"
579
+ ]
580
+ },
581
+ {
582
+ "name": "stdout",
583
+ "output_type": "stream",
584
+ "text": [
585
+ "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n",
586
+ "Traceback (most recent call last):\r\n",
587
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/dragon_test.py\", line 52, in <module>\r\n",
588
+ " model = SimpleRWKV(MODEL_PATH, device=DEVICE)\r\n",
589
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 1420, in __init__\r\n",
590
+ " self.model = RWKV(**model_config)\r\n",
591
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 566, in __init__\r\n",
592
+ " raise ValueError(f\"load_model file '{load_model}' does not exist\")\r\n",
593
+ "ValueError: load_model file '../model/v5-L6+6-D2048-E0_01-layer-expansion-a3.pth' does not exist\r\n"
594
+ ]
595
+ }
596
+ ],
597
+ "source": [
598
+ "# Let's do a quick dragon prompt validation\n",
599
+ "!cd \"{INFERENCE_DIR}\" && \\\n",
600
+ " python3 dragon_test.py \"../model/{FILENAME_PREFIX}-layer-expansion-a3.pth\" \"cuda fp32\""
601
+ ]
602
+ },
603
+ {
604
+ "cell_type": "markdown",
605
+ "id": "39b5c9e4",
606
+ "metadata": {
607
+ "papermill": {
608
+ "duration": 0.006891,
609
+ "end_time": "2023-10-11T07:03:39.217738",
610
+ "exception": false,
611
+ "start_time": "2023-10-11T07:03:39.210847",
612
+ "status": "completed"
613
+ },
614
+ "tags": []
615
+ },
616
+ "source": [
617
+ "## Enwiki Stage 3 : Split-Baseline-B training"
618
+ ]
619
+ },
620
+ {
621
+ "cell_type": "code",
622
+ "execution_count": 8,
623
+ "id": "6a2803dc",
624
+ "metadata": {
625
+ "execution": {
626
+ "iopub.execute_input": "2023-10-11T07:03:39.235541Z",
627
+ "iopub.status.busy": "2023-10-11T07:03:39.234393Z",
628
+ "iopub.status.idle": "2023-10-11T07:03:54.515077Z",
629
+ "shell.execute_reply": "2023-10-11T07:03:54.513988Z"
630
+ },
631
+ "papermill": {
632
+ "duration": 15.292458,
633
+ "end_time": "2023-10-11T07:03:54.517573",
634
+ "exception": false,
635
+ "start_time": "2023-10-11T07:03:39.225115",
636
+ "status": "completed"
637
+ },
638
+ "tags": []
639
+ },
640
+ "outputs": [
641
+ {
642
+ "name": "stdout",
643
+ "output_type": "stream",
644
+ "text": [
645
+ "[2023-10-11 07:03:43,703] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n"
646
+ ]
647
+ },
648
+ {
649
+ "name": "stdout",
650
+ "output_type": "stream",
651
+ "text": [
652
+ "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n"
653
+ ]
654
+ },
655
+ {
656
+ "name": "stdout",
657
+ "output_type": "stream",
658
+ "text": [
659
+ "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py:484: UserWarning: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/enwiki-4k-part3.yaml', '--trainer.logger.init_args.name=[Multi-size] v5-L6+6-D2048-E0.01 - layer-expansion B3 (train-ctx=4k, deepspeed_stage_2_offload)', '--trainer.strategy=deepspeed_stage_2_offload', '--trainer.devices=auto', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-L6+6-D2048-E0_01-layer-expansion-b3/', '--model.load_model=../model/v5-L6+6-D2048-E0_01-split-2b.pth', '--model.ctx_len=4096', '--model.bptt_learning_range=1'], args=['fit', '-c', '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/enwiki-4k-part3.yaml', '--trainer.logger.init_args.name=[Multi-size] v5-L6+6-D2048-E0.01 - layer-expansion B3 (train-ctx=4k, deepspeed_stage_2_offload)', '--trainer.strategy=deepspeed_stage_2_offload', '--trainer.devices=auto', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-L6+6-D2048-E0_01-layer-expansion-b3/', '--model.load_model=../model/v5-L6+6-D2048-E0_01-split-2b.pth', '--model.ctx_len=4096', '--model.bptt_learning_range=1'].\r\n",
660
+ " rank_zero_warn(\r\n"
661
+ ]
662
+ },
663
+ {
664
+ "name": "stdout",
665
+ "output_type": "stream",
666
+ "text": [
667
+ "/usr/local/lib/python3.10/dist-packages/lightning/fabric/utilities/seed.py:39: UserWarning: No seed found, seed set to 736269213\r\n",
668
+ " rank_zero_warn(f\"No seed found, seed set to {seed}\")\r\n",
669
+ "Global seed set to 736269213\r\n"
670
+ ]
671
+ },
672
+ {
673
+ "name": "stdout",
674
+ "output_type": "stream",
675
+ "text": [
676
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mpicocreator\u001b[0m (\u001b[33mrwkv-x-dev\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\r\n"
677
+ ]
678
+ },
679
+ {
680
+ "name": "stdout",
681
+ "output_type": "stream",
682
+ "text": [
683
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.15.12\r\n",
684
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m./wandb/run-20231011_070346-9urskvqu\u001b[0m\r\n",
685
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\r\n",
686
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33m[Multi-size] v5-L6+6-D2048-E0.01 - layer-expansion B3 (train-ctx=4k, deepspeed_stage_2_offload)\u001b[0m\r\n",
687
+ "\u001b[34m\u001b[1mwandb\u001b[0m: ⭐️ View project at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments\u001b[0m\r\n",
688
+ "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/runs/9urskvqu\u001b[0m\r\n"
689
+ ]
690
+ },
691
+ {
692
+ "name": "stdout",
693
+ "output_type": "stream",
694
+ "text": [
695
+ "Traceback (most recent call last):\r\n",
696
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 278, in <module>\r\n",
697
+ " cli_main()\r\n",
698
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 253, in cli_main\r\n",
699
+ " LightningCLI(\r\n",
700
+ " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 350, in __init__\r\n",
701
+ " self.instantiate_classes()\r\n",
702
+ " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 499, in instantiate_classes\r\n",
703
+ " self.config_init = self.parser.instantiate_classes(self.config)\r\n",
704
+ " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_deprecated.py\", line 139, in patched_instantiate_classes\r\n",
705
+ " cfg = self._unpatched_instantiate_classes(cfg, **kwargs)\r\n",
706
+ " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_core.py\", line 1130, in instantiate_classes\r\n",
707
+ " cfg[subcommand] = subparser.instantiate_classes(cfg[subcommand], instantiate_groups=instantiate_groups)\r\n",
708
+ " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_deprecated.py\", line 139, in patched_instantiate_classes\r\n",
709
+ " cfg = self._unpatched_instantiate_classes(cfg, **kwargs)\r\n",
710
+ " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_core.py\", line 1124, in instantiate_classes\r\n",
711
+ " component.instantiate_class(component, cfg)\r\n",
712
+ " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_signatures.py\", line 561, in group_instantiate_class\r\n",
713
+ " parent[key] = group.group_class(**value)\r\n",
714
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 566, in __init__\r\n",
715
+ " raise ValueError(f\"load_model file '{load_model}' does not exist\")\r\n",
716
+ "ValueError: load_model file '../model/v5-L6+6-D2048-E0_01-split-2b.pth' does not exist\r\n"
717
+ ]
718
+ },
719
+ {
720
+ "name": "stdout",
721
+ "output_type": "stream",
722
+ "text": [
723
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Waiting for W&B process to finish... \u001b[31m(failed 1).\u001b[0m Press Control-C to abort syncing.\r\n"
724
+ ]
725
+ },
726
+ {
727
+ "name": "stdout",
728
+ "output_type": "stream",
729
+ "text": [
730
+ "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run \u001b[33m[Multi-size] v5-L6+6-D2048-E0.01 - layer-expansion B3 (train-ctx=4k, deepspeed_stage_2_offload)\u001b[0m at: \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/runs/9urskvqu\u001b[0m\r\n",
731
+ "\u001b[34m\u001b[1mwandb\u001b[0m: ️⚡ View job at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjk0OTk4MDcy/version_details/v15\u001b[0m\r\n",
732
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)\r\n",
733
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Find logs at: \u001b[35m\u001b[1m./wandb/run-20231011_070346-9urskvqu/logs\u001b[0m\r\n"
734
+ ]
735
+ }
736
+ ],
737
+ "source": [
738
+ "# Start the foundation model training\n",
739
+ "!cd \"{TRAINER_DIR}\" && \\\n",
740
+ " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n",
741
+ " python3 lightning_trainer.py fit \\\n",
742
+ " -c \"{NOTEBOOK_DIR}/enwiki-4k-part3.yaml\" \\\n",
743
+ " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - layer-expansion B3 (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n",
744
+ " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n",
745
+ " --trainer.devices=\"{GPU_DEVICES}\" \\\n",
746
+ " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-layer-expansion-b3/\" \\\n",
747
+ " --model.load_model=\"../model/{FILENAME_PREFIX}-split-2b.pth\" \\\n",
748
+ " --model.ctx_len=4096 \\\n",
749
+ " --model.bptt_learning_range=1"
750
+ ]
751
+ },
752
+ {
753
+ "cell_type": "code",
754
+ "execution_count": 9,
755
+ "id": "9f50e589",
756
+ "metadata": {
757
+ "execution": {
758
+ "iopub.execute_input": "2023-10-11T07:03:54.533944Z",
759
+ "iopub.status.busy": "2023-10-11T07:03:54.532500Z",
760
+ "iopub.status.idle": "2023-10-11T07:03:58.532096Z",
761
+ "shell.execute_reply": "2023-10-11T07:03:58.530934Z"
762
+ },
763
+ "papermill": {
764
+ "duration": 4.010668,
765
+ "end_time": "2023-10-11T07:03:58.534788",
766
+ "exception": false,
767
+ "start_time": "2023-10-11T07:03:54.524120",
768
+ "status": "completed"
769
+ },
770
+ "tags": []
771
+ },
772
+ "outputs": [
773
+ {
774
+ "name": "stdout",
775
+ "output_type": "stream",
776
+ "text": [
777
+ "[2023-10-11 07:03:57,001] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n"
778
+ ]
779
+ },
780
+ {
781
+ "name": "stdout",
782
+ "output_type": "stream",
783
+ "text": [
784
+ "Traceback (most recent call last):\r\n",
785
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 651, in <module>\r\n",
786
+ " convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, output_file, save_dtype=args.dtype)\r\n",
787
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 542, in convert_zero_checkpoint_to_fp32_state_dict\r\n",
788
+ " state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)\r\n",
789
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 516, in get_fp32_state_dict_from_zero_checkpoint\r\n",
790
+ " raise ValueError(f\"Unable to find 'latest' file at {latest_path}\")\r\n",
791
+ "ValueError: Unable to find 'latest' file at ../checkpoint/v5-L6+6-D2048-E0_01-layer-expansion-b3/last.ckpt/latest\r\n"
792
+ ]
793
+ },
794
+ {
795
+ "name": "stdout",
796
+ "output_type": "stream",
797
+ "text": [
798
+ "ls: cannot access '../model/v5-L6+6-D2048-E0_01-layer-expansion-b3.pth': No such file or directory\r\n"
799
+ ]
800
+ }
801
+ ],
802
+ "source": [
803
+ "# Lets export the model from the checkpoint\n",
804
+ "!cd \"{TRAINER_DIR}\" && \\\n",
805
+ " python3 export_checkpoint.py \"../checkpoint/{FILENAME_PREFIX}-layer-expansion-b3/last.ckpt\" \"../model/{FILENAME_PREFIX}-layer-expansion-b3.pth\" \"bf16\"\n",
806
+ "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-layer-expansion-b3.pth\""
807
+ ]
808
+ },
809
+ {
810
+ "cell_type": "code",
811
+ "execution_count": 10,
812
+ "id": "777a63eb",
813
+ "metadata": {
814
+ "execution": {
815
+ "iopub.execute_input": "2023-10-11T07:03:58.550672Z",
816
+ "iopub.status.busy": "2023-10-11T07:03:58.550018Z",
817
+ "iopub.status.idle": "2023-10-11T07:04:05.053290Z",
818
+ "shell.execute_reply": "2023-10-11T07:04:05.052230Z"
819
+ },
820
+ "papermill": {
821
+ "duration": 6.514677,
822
+ "end_time": "2023-10-11T07:04:05.056026",
823
+ "exception": false,
824
+ "start_time": "2023-10-11T07:03:58.541349",
825
+ "status": "completed"
826
+ },
827
+ "tags": []
828
+ },
829
+ "outputs": [
830
+ {
831
+ "name": "stdout",
832
+ "output_type": "stream",
833
+ "text": [
834
+ "[2023-10-11 07:04:02,965] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n"
835
+ ]
836
+ },
837
+ {
838
+ "name": "stdout",
839
+ "output_type": "stream",
840
+ "text": [
841
+ "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n",
842
+ "Traceback (most recent call last):\r\n",
843
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/dragon_test.py\", line 52, in <module>\r\n",
844
+ " model = SimpleRWKV(MODEL_PATH, device=DEVICE)\r\n",
845
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 1420, in __init__\r\n",
846
+ " self.model = RWKV(**model_config)\r\n",
847
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 566, in __init__\r\n",
848
+ " raise ValueError(f\"load_model file '{load_model}' does not exist\")\r\n",
849
+ "ValueError: load_model file '../model/v5-L6+6-D2048-E0_01-layer-expansion-b3.pth' does not exist\r\n"
850
+ ]
851
+ }
852
+ ],
853
+ "source": [
854
+ "# # Lets do a quick dragon prompt validation\n",
855
+ "!cd \"{INFERENCE_DIR}\" && \\\n",
856
+ " python3 dragon_test.py \"../model/{FILENAME_PREFIX}-layer-expansion-b3.pth\" \"cuda fp32\""
857
+ ]
858
+ }
859
+ ],
860
+ "metadata": {
861
+ "kernelspec": {
862
+ "display_name": "Python 3 (ipykernel)",
863
+ "language": "python",
864
+ "name": "python3"
865
+ },
866
+ "language_info": {
867
+ "codemirror_mode": {
868
+ "name": "ipython",
869
+ "version": 3
870
+ },
871
+ "file_extension": ".py",
872
+ "mimetype": "text/x-python",
873
+ "name": "python",
874
+ "nbconvert_exporter": "python",
875
+ "pygments_lexer": "ipython3",
876
+ "version": "3.10.12"
877
+ },
878
+ "papermill": {
879
+ "default_parameters": {},
880
+ "duration": 66.115721,
881
+ "end_time": "2023-10-11T07:04:05.485131",
882
+ "environment_variables": {},
883
+ "exception": null,
884
+ "input_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/v5-L6+6-D2048-layer-baseline.ipynb",
885
+ "output_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/experiment/rwkv-x-exp/multi-size-train/v5-L6+6-D2048-layer-baseline.ipynb",
886
+ "parameters": {},
887
+ "start_time": "2023-10-11T07:02:59.369410",
888
+ "version": "2.4.0"
889
+ }
890
+ },
891
+ "nbformat": 4,
892
+ "nbformat_minor": 5
893
+ }