picocreator committed
Commit: f4feb3c
Parent: 82bda32

3f44fb0588a4b67ebbe52911d4f27fef311310d036a5530b0334f112e00e6b8e

.gitattributes CHANGED
@@ -99,3 +99,4 @@ experiment/rwkv-x-exp/v5-r3-memory/L6-D2560-E1e-1-ctx4k/stage5.ipynb filter=lfs
  experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/stage5.ipynb filter=lfs diff=lfs merge=lfs -text
  experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-part1.ipynb filter=lfs diff=lfs merge=lfs -text
  experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-baseline.ipynb filter=lfs diff=lfs merge=lfs -text
+ experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-part1.ipynb filter=lfs diff=lfs merge=lfs -text
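
The added .gitattributes rule is what routes the new notebook through Git LFS, which is also why the diff below shows the notebook's full JSON being removed: after this commit only an LFS pointer is kept in the regular git history. As a rough illustration (not part of the commit), here is a minimal Python sketch that checks whether a path is covered by a filter=lfs rule; the helper name lfs_tracked is hypothetical, and fnmatch only approximates real gitattributes pattern matching.

from fnmatch import fnmatch
from pathlib import Path

def lfs_tracked(path: str, gitattributes: str = ".gitattributes") -> bool:
    # Scan .gitattributes for lines whose attributes include filter=lfs and
    # whose pattern matches the given path. Real gitattributes matching follows
    # gitignore-style rules; fnmatch is only an approximation for this sketch.
    for line in Path(gitattributes).read_text().splitlines():
        parts = line.split()
        if len(parts) >= 2 and "filter=lfs" in parts[1:]:
            if path == parts[0] or fnmatch(path, parts[0]):
                return True
    return False

# The path added in this commit should now report as LFS-tracked.
print(lfs_tracked("experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-part1.ipynb"))
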
experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-part1.ipynb CHANGED
@@ -1,2222 +1,3 @@
1
- {
2
- "cells": [
3
- {
4
- "attachments": {},
5
- "cell_type": "markdown",
6
- "id": "3a71b6b3",
7
- "metadata": {
8
- "papermill": {
9
- "duration": 0.003347,
10
- "end_time": "2023-09-29T09:57:11.488052",
11
- "exception": false,
12
- "start_time": "2023-09-29T09:57:11.484705",
13
- "status": "completed"
14
- },
15
- "tags": []
16
- },
17
- "source": [
18
- "# RWKV v5 multi-size training experiment\n",
19
- "\n",
20
- "**Note:** This project assumes you have the rwkv-infctx conda env setup"
21
- ]
22
- },
23
- {
24
- "attachments": {},
25
- "cell_type": "markdown",
26
- "id": "73dce349",
27
- "metadata": {
28
- "papermill": {
29
- "duration": 0.002599,
30
- "end_time": "2023-09-29T09:57:11.495409",
31
- "exception": false,
32
- "start_time": "2023-09-29T09:57:11.492810",
33
- "status": "completed"
34
- },
35
- "tags": []
36
- },
37
- "source": [
38
- "# Basic Setup"
39
- ]
40
- },
41
- {
42
- "cell_type": "code",
43
- "execution_count": 1,
44
- "id": "2fa01ec7",
45
- "metadata": {
46
- "execution": {
47
- "iopub.execute_input": "2023-09-29T09:57:11.502573Z",
48
- "iopub.status.busy": "2023-09-29T09:57:11.502067Z",
49
- "iopub.status.idle": "2023-09-29T09:57:12.255533Z",
50
- "shell.execute_reply": "2023-09-29T09:57:12.254559Z"
51
- },
52
- "papermill": {
53
- "duration": 0.759642,
54
- "end_time": "2023-09-29T09:57:12.257872",
55
- "exception": false,
56
- "start_time": "2023-09-29T09:57:11.498230",
57
- "status": "completed"
58
- },
59
- "tags": []
60
- },
61
- "outputs": [],
62
- "source": [
63
- "# First lets setup the various directories, and init the model\n",
64
- "!mkdir -p ../../../../model/\n",
65
- "!mkdir -p ../../../../datapath/\n",
66
- "!mkdir -p ../../../../checkpoint/"
67
- ]
68
- },
69
- {
70
- "cell_type": "code",
71
- "execution_count": 2,
72
- "id": "39dd6623",
73
- "metadata": {
74
- "execution": {
75
- "iopub.execute_input": "2023-09-29T09:57:12.265541Z",
76
- "iopub.status.busy": "2023-09-29T09:57:12.264984Z",
77
- "iopub.status.idle": "2023-09-29T09:57:12.272639Z",
78
- "shell.execute_reply": "2023-09-29T09:57:12.271887Z"
79
- },
80
- "papermill": {
81
- "duration": 0.013629,
82
- "end_time": "2023-09-29T09:57:12.274503",
83
- "exception": false,
84
- "start_time": "2023-09-29T09:57:12.260874",
85
- "status": "completed"
86
- },
87
- "tags": []
88
- },
89
- "outputs": [
90
- {
91
- "name": "stdout",
92
- "output_type": "stream",
93
- "text": [
94
- "DEEPSPEED_STRAT: deepspeed_stage_1\n",
95
- "ENABLE_WANDB: True\n",
96
- "GPU_DEVICES: auto\n",
97
- "NOTEBOOK_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train\n",
98
- "INFERENCE_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5\n",
99
- "TRAINER_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5\n",
100
- "PROJECT_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer\n"
101
- ]
102
- }
103
- ],
104
- "source": [
105
- "DEEPSPEED_STRAT=\"deepspeed_stage_1\"\n",
106
- "GPU_DEVICES=\"auto\"\n",
107
- "ENABLE_WANDB=True\n",
108
- "\n",
109
- "EMBED_SCALE=0.01\n",
110
- "EMBED_SCALE_LABEL=str(EMBED_SCALE).replace(\".\", \"_\")\n",
111
- "\n",
112
- "LAYER_COUNT=12\n",
113
- "EMBED_SIZE=2048\n",
114
- "\n",
115
- "WANDB_PREFIX=f\"[Multi-size] v5-L{LAYER_COUNT}-D{EMBED_SIZE}-E{EMBED_SCALE}\"\n",
116
- "FILENAME_PREFIX=f\"v5-L{LAYER_COUNT}-D{EMBED_SIZE}-E{EMBED_SCALE_LABEL}\"\n",
117
- "\n",
118
- "print(\"DEEPSPEED_STRAT:\", DEEPSPEED_STRAT)\n",
119
- "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n",
120
- "print(\"GPU_DEVICES:\", GPU_DEVICES)\n",
121
- "\n",
122
- "if ENABLE_WANDB:\n",
123
- " WANDB_MODE=\"online\"\n",
124
- "else:\n",
125
- " WANDB_MODE=\"disabled\"\n",
126
- "\n",
127
- "# Computing the notebook, and various paths\n",
128
- "import os\n",
129
- "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n",
130
- "PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"../../../../\"))\n",
131
- "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n",
132
- "INFERENCE_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n",
133
- "\n",
134
- "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n",
135
- "print(\"INFERENCE_DIR:\", INFERENCE_DIR)\n",
136
- "print(\"TRAINER_DIR:\", TRAINER_DIR)\n",
137
- "print(\"PROJECT_DIR:\", PROJECT_DIR)"
138
- ]
139
- },
140
- {
141
- "cell_type": "code",
142
- "execution_count": 3,
143
- "id": "cf99b23f",
144
- "metadata": {
145
- "execution": {
146
- "iopub.execute_input": "2023-09-29T09:57:12.282369Z",
147
- "iopub.status.busy": "2023-09-29T09:57:12.281873Z",
148
- "iopub.status.idle": "2023-09-29T09:57:56.980304Z",
149
- "shell.execute_reply": "2023-09-29T09:57:56.979053Z"
150
- },
151
- "papermill": {
152
- "duration": 44.705166,
153
- "end_time": "2023-09-29T09:57:56.982856",
154
- "exception": false,
155
- "start_time": "2023-09-29T09:57:12.277690",
156
- "status": "completed"
157
- },
158
- "tags": []
159
- },
160
- "outputs": [
161
- {
162
- "name": "stdout",
163
- "output_type": "stream",
164
- "text": [
165
- "[2023-09-29 09:57:16,435] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n"
166
- ]
167
- },
168
- {
169
- "name": "stdout",
170
- "output_type": "stream",
171
- "text": [
172
- "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n",
173
- "---- Initializing model ----\r\n",
174
- "No of layers: 12\r\n",
175
- "Embedding size: 2048\r\n",
176
- "Output model path: ../model/v5-L12-D2048-E0_01-neox-v5base-init.pth\r\n",
177
- "Vocab size: 50277\r\n",
178
- "Emb scale: 0.01\r\n",
179
- "Note: this process takes a significant time (and ram) for large models\r\n",
180
- "---- ----- ----\r\n"
181
- ]
182
- },
183
- {
184
- "name": "stdout",
185
- "output_type": "stream",
186
- "text": [
187
- "50277 2048 -0.01 emb.weight\r\n"
188
- ]
189
- },
190
- {
191
- "name": "stdout",
192
- "output_type": "stream",
193
- "text": [
194
- "2048 2048 1.0 blocks.0.att.gate.weight\r\n"
195
- ]
196
- },
197
- {
198
- "name": "stdout",
199
- "output_type": "stream",
200
- "text": [
201
- "2048 2048 1.0 blocks.0.att.receptance.weight\r\n"
202
- ]
203
- },
204
- {
205
- "name": "stdout",
206
- "output_type": "stream",
207
- "text": [
208
- "2048 2048 1.0 blocks.0.att.key.weight\r\n"
209
- ]
210
- },
211
- {
212
- "name": "stdout",
213
- "output_type": "stream",
214
- "text": [
215
- "2048 2048 1.0 blocks.0.att.value.weight\r\n"
216
- ]
217
- },
218
- {
219
- "name": "stdout",
220
- "output_type": "stream",
221
- "text": [
222
- "2048 2048 0 blocks.0.att.output.weight\r\n",
223
- "7168 2048 1.0 blocks.0.ffn.key.weight\r\n"
224
- ]
225
- },
226
- {
227
- "name": "stdout",
228
- "output_type": "stream",
229
- "text": [
230
- "2048 2048 0 blocks.0.ffn.receptance.weight\r\n",
231
- "2048 7168 0 blocks.0.ffn.value.weight\r\n"
232
- ]
233
- },
234
- {
235
- "name": "stdout",
236
- "output_type": "stream",
237
- "text": [
238
- "2048 2048 1.0 blocks.1.att.gate.weight\r\n"
239
- ]
240
- },
241
- {
242
- "name": "stdout",
243
- "output_type": "stream",
244
- "text": [
245
- "2048 2048 1.0 blocks.1.att.receptance.weight\r\n"
246
- ]
247
- },
248
- {
249
- "name": "stdout",
250
- "output_type": "stream",
251
- "text": [
252
- "2048 2048 1.0 blocks.1.att.key.weight\r\n"
253
- ]
254
- },
255
- {
256
- "name": "stdout",
257
- "output_type": "stream",
258
- "text": [
259
- "2048 2048 1.0 blocks.1.att.value.weight\r\n"
260
- ]
261
- },
262
- {
263
- "name": "stdout",
264
- "output_type": "stream",
265
- "text": [
266
- "2048 2048 0 blocks.1.att.output.weight\r\n"
267
- ]
268
- },
269
- {
270
- "name": "stdout",
271
- "output_type": "stream",
272
- "text": [
273
- "7168 2048 1.0 blocks.1.ffn.key.weight\r\n"
274
- ]
275
- },
276
- {
277
- "name": "stdout",
278
- "output_type": "stream",
279
- "text": [
280
- "2048 2048 0 blocks.1.ffn.receptance.weight\r\n",
281
- "2048 7168 0 blocks.1.ffn.value.weight\r\n",
282
- "2048 2048 1.0 blocks.2.att.gate.weight\r\n"
283
- ]
284
- },
285
- {
286
- "name": "stdout",
287
- "output_type": "stream",
288
- "text": [
289
- "2048 2048 1.0 blocks.2.att.receptance.weight\r\n"
290
- ]
291
- },
292
- {
293
- "name": "stdout",
294
- "output_type": "stream",
295
- "text": [
296
- "2048 2048 1.0 blocks.2.att.key.weight\r\n"
297
- ]
298
- },
299
- {
300
- "name": "stdout",
301
- "output_type": "stream",
302
- "text": [
303
- "2048 2048 1.0 blocks.2.att.value.weight\r\n"
304
- ]
305
- },
306
- {
307
- "name": "stdout",
308
- "output_type": "stream",
309
- "text": [
310
- "2048 2048 0 blocks.2.att.output.weight\r\n",
311
- "7168 2048 1.0 blocks.2.ffn.key.weight\r\n"
312
- ]
313
- },
314
- {
315
- "name": "stdout",
316
- "output_type": "stream",
317
- "text": [
318
- "2048 2048 0 blocks.2.ffn.receptance.weight\r\n",
319
- "2048 7168 0 blocks.2.ffn.value.weight\r\n"
320
- ]
321
- },
322
- {
323
- "name": "stdout",
324
- "output_type": "stream",
325
- "text": [
326
- "2048 2048 1.0 blocks.3.att.gate.weight\r\n"
327
- ]
328
- },
329
- {
330
- "name": "stdout",
331
- "output_type": "stream",
332
- "text": [
333
- "2048 2048 1.0 blocks.3.att.receptance.weight\r\n"
334
- ]
335
- },
336
- {
337
- "name": "stdout",
338
- "output_type": "stream",
339
- "text": [
340
- "2048 2048 1.0 blocks.3.att.key.weight\r\n"
341
- ]
342
- },
343
- {
344
- "name": "stdout",
345
- "output_type": "stream",
346
- "text": [
347
- "2048 2048 1.0 blocks.3.att.value.weight\r\n"
348
- ]
349
- },
350
- {
351
- "name": "stdout",
352
- "output_type": "stream",
353
- "text": [
354
- "2048 2048 0 blocks.3.att.output.weight\r\n"
355
- ]
356
- },
357
- {
358
- "name": "stdout",
359
- "output_type": "stream",
360
- "text": [
361
- "7168 2048 1.0 blocks.3.ffn.key.weight\r\n"
362
- ]
363
- },
364
- {
365
- "name": "stdout",
366
- "output_type": "stream",
367
- "text": [
368
- "2048 2048 0 blocks.3.ffn.receptance.weight\r\n",
369
- "2048 7168 0 blocks.3.ffn.value.weight\r\n"
370
- ]
371
- },
372
- {
373
- "name": "stdout",
374
- "output_type": "stream",
375
- "text": [
376
- "2048 2048 1.0 blocks.4.att.gate.weight\r\n"
377
- ]
378
- },
379
- {
380
- "name": "stdout",
381
- "output_type": "stream",
382
- "text": [
383
- "2048 2048 1.0 blocks.4.att.receptance.weight\r\n"
384
- ]
385
- },
386
- {
387
- "name": "stdout",
388
- "output_type": "stream",
389
- "text": [
390
- "2048 2048 1.0 blocks.4.att.key.weight\r\n"
391
- ]
392
- },
393
- {
394
- "name": "stdout",
395
- "output_type": "stream",
396
- "text": [
397
- "2048 2048 1.0 blocks.4.att.value.weight\r\n"
398
- ]
399
- },
400
- {
401
- "name": "stdout",
402
- "output_type": "stream",
403
- "text": [
404
- "2048 2048 0 blocks.4.att.output.weight\r\n"
405
- ]
406
- },
407
- {
408
- "name": "stdout",
409
- "output_type": "stream",
410
- "text": [
411
- "7168 2048 1.0 blocks.4.ffn.key.weight\r\n"
412
- ]
413
- },
414
- {
415
- "name": "stdout",
416
- "output_type": "stream",
417
- "text": [
418
- "2048 2048 0 blocks.4.ffn.receptance.weight\r\n",
419
- "2048 7168 0 blocks.4.ffn.value.weight\r\n"
420
- ]
421
- },
422
- {
423
- "name": "stdout",
424
- "output_type": "stream",
425
- "text": [
426
- "2048 2048 1.0 blocks.5.att.gate.weight\r\n"
427
- ]
428
- },
429
- {
430
- "name": "stdout",
431
- "output_type": "stream",
432
- "text": [
433
- "2048 2048 1.0 blocks.5.att.receptance.weight\r\n"
434
- ]
435
- },
436
- {
437
- "name": "stdout",
438
- "output_type": "stream",
439
- "text": [
440
- "2048 2048 1.0 blocks.5.att.key.weight\r\n"
441
- ]
442
- },
443
- {
444
- "name": "stdout",
445
- "output_type": "stream",
446
- "text": [
447
- "2048 2048 1.0 blocks.5.att.value.weight\r\n"
448
- ]
449
- },
450
- {
451
- "name": "stdout",
452
- "output_type": "stream",
453
- "text": [
454
- "2048 2048 0 blocks.5.att.output.weight\r\n",
455
- "7168 2048 1.0 blocks.5.ffn.key.weight\r\n"
456
- ]
457
- },
458
- {
459
- "name": "stdout",
460
- "output_type": "stream",
461
- "text": [
462
- "2048 2048 0 blocks.5.ffn.receptance.weight\r\n",
463
- "2048 7168 0 blocks.5.ffn.value.weight\r\n"
464
- ]
465
- },
466
- {
467
- "name": "stdout",
468
- "output_type": "stream",
469
- "text": [
470
- "2048 2048 1.0 blocks.6.att.gate.weight\r\n"
471
- ]
472
- },
473
- {
474
- "name": "stdout",
475
- "output_type": "stream",
476
- "text": [
477
- "2048 2048 1.0 blocks.6.att.receptance.weight\r\n"
478
- ]
479
- },
480
- {
481
- "name": "stdout",
482
- "output_type": "stream",
483
- "text": [
484
- "2048 2048 1.0 blocks.6.att.key.weight\r\n"
485
- ]
486
- },
487
- {
488
- "name": "stdout",
489
- "output_type": "stream",
490
- "text": [
491
- "2048 2048 1.0 blocks.6.att.value.weight\r\n"
492
- ]
493
- },
494
- {
495
- "name": "stdout",
496
- "output_type": "stream",
497
- "text": [
498
- "2048 2048 0 blocks.6.att.output.weight\r\n",
499
- "7168 2048 1.0 blocks.6.ffn.key.weight\r\n"
500
- ]
501
- },
502
- {
503
- "name": "stdout",
504
- "output_type": "stream",
505
- "text": [
506
- "2048 2048 0 blocks.6.ffn.receptance.weight\r\n",
507
- "2048 7168 0 blocks.6.ffn.value.weight\r\n"
508
- ]
509
- },
510
- {
511
- "name": "stdout",
512
- "output_type": "stream",
513
- "text": [
514
- "2048 2048 1.0 blocks.7.att.gate.weight\r\n"
515
- ]
516
- },
517
- {
518
- "name": "stdout",
519
- "output_type": "stream",
520
- "text": [
521
- "2048 2048 1.0 blocks.7.att.receptance.weight\r\n"
522
- ]
523
- },
524
- {
525
- "name": "stdout",
526
- "output_type": "stream",
527
- "text": [
528
- "2048 2048 1.0 blocks.7.att.key.weight\r\n"
529
- ]
530
- },
531
- {
532
- "name": "stdout",
533
- "output_type": "stream",
534
- "text": [
535
- "2048 2048 1.0 blocks.7.att.value.weight\r\n"
536
- ]
537
- },
538
- {
539
- "name": "stdout",
540
- "output_type": "stream",
541
- "text": [
542
- "2048 2048 0 blocks.7.att.output.weight\r\n",
543
- "7168 2048 1.0 blocks.7.ffn.key.weight\r\n"
544
- ]
545
- },
546
- {
547
- "name": "stdout",
548
- "output_type": "stream",
549
- "text": [
550
- "2048 2048 0 blocks.7.ffn.receptance.weight\r\n",
551
- "2048 7168 0 blocks.7.ffn.value.weight\r\n",
552
- "2048 2048 1.0 blocks.8.att.gate.weight\r\n"
553
- ]
554
- },
555
- {
556
- "name": "stdout",
557
- "output_type": "stream",
558
- "text": [
559
- "2048 2048 1.0 blocks.8.att.receptance.weight\r\n"
560
- ]
561
- },
562
- {
563
- "name": "stdout",
564
- "output_type": "stream",
565
- "text": [
566
- "2048 2048 1.0 blocks.8.att.key.weight\r\n"
567
- ]
568
- },
569
- {
570
- "name": "stdout",
571
- "output_type": "stream",
572
- "text": [
573
- "2048 2048 1.0 blocks.8.att.value.weight\r\n"
574
- ]
575
- },
576
- {
577
- "name": "stdout",
578
- "output_type": "stream",
579
- "text": [
580
- "2048 2048 0 blocks.8.att.output.weight\r\n",
581
- "7168 2048 1.0 blocks.8.ffn.key.weight\r\n"
582
- ]
583
- },
584
- {
585
- "name": "stdout",
586
- "output_type": "stream",
587
- "text": [
588
- "2048 2048 0 blocks.8.ffn.receptance.weight\r\n",
589
- "2048 7168 0 blocks.8.ffn.value.weight\r\n"
590
- ]
591
- },
592
- {
593
- "name": "stdout",
594
- "output_type": "stream",
595
- "text": [
596
- "2048 2048 1.0 blocks.9.att.gate.weight\r\n"
597
- ]
598
- },
599
- {
600
- "name": "stdout",
601
- "output_type": "stream",
602
- "text": [
603
- "2048 2048 1.0 blocks.9.att.receptance.weight\r\n"
604
- ]
605
- },
606
- {
607
- "name": "stdout",
608
- "output_type": "stream",
609
- "text": [
610
- "2048 2048 1.0 blocks.9.att.key.weight\r\n"
611
- ]
612
- },
613
- {
614
- "name": "stdout",
615
- "output_type": "stream",
616
- "text": [
617
- "2048 2048 1.0 blocks.9.att.value.weight\r\n"
618
- ]
619
- },
620
- {
621
- "name": "stdout",
622
- "output_type": "stream",
623
- "text": [
624
- "2048 2048 0 blocks.9.att.output.weight\r\n",
625
- "7168 2048 1.0 blocks.9.ffn.key.weight\r\n"
626
- ]
627
- },
628
- {
629
- "name": "stdout",
630
- "output_type": "stream",
631
- "text": [
632
- "2048 2048 0 blocks.9.ffn.receptance.weight\r\n",
633
- "2048 7168 0 blocks.9.ffn.value.weight\r\n"
634
- ]
635
- },
636
- {
637
- "name": "stdout",
638
- "output_type": "stream",
639
- "text": [
640
- "2048 2048 1.0 blocks.10.att.gate.weight\r\n"
641
- ]
642
- },
643
- {
644
- "name": "stdout",
645
- "output_type": "stream",
646
- "text": [
647
- "2048 2048 1.0 blocks.10.att.receptance.weight\r\n"
648
- ]
649
- },
650
- {
651
- "name": "stdout",
652
- "output_type": "stream",
653
- "text": [
654
- "2048 2048 1.0 blocks.10.att.key.weight\r\n"
655
- ]
656
- },
657
- {
658
- "name": "stdout",
659
- "output_type": "stream",
660
- "text": [
661
- "2048 2048 1.0 blocks.10.att.value.weight\r\n"
662
- ]
663
- },
664
- {
665
- "name": "stdout",
666
- "output_type": "stream",
667
- "text": [
668
- "2048 2048 0 blocks.10.att.output.weight\r\n",
669
- "7168 2048 1.0 blocks.10.ffn.key.weight\r\n"
670
- ]
671
- },
672
- {
673
- "name": "stdout",
674
- "output_type": "stream",
675
- "text": [
676
- "2048 2048 0 blocks.10.ffn.receptance.weight\r\n",
677
- "2048 7168 0 blocks.10.ffn.value.weight\r\n"
678
- ]
679
- },
680
- {
681
- "name": "stdout",
682
- "output_type": "stream",
683
- "text": [
684
- "2048 2048 1.0 blocks.11.att.gate.weight\r\n"
685
- ]
686
- },
687
- {
688
- "name": "stdout",
689
- "output_type": "stream",
690
- "text": [
691
- "2048 2048 1.0 blocks.11.att.receptance.weight\r\n"
692
- ]
693
- },
694
- {
695
- "name": "stdout",
696
- "output_type": "stream",
697
- "text": [
698
- "2048 2048 1.0 blocks.11.att.key.weight\r\n"
699
- ]
700
- },
701
- {
702
- "name": "stdout",
703
- "output_type": "stream",
704
- "text": [
705
- "2048 2048 1.0 blocks.11.att.value.weight\r\n"
706
- ]
707
- },
708
- {
709
- "name": "stdout",
710
- "output_type": "stream",
711
- "text": [
712
- "2048 2048 0 blocks.11.att.output.weight\r\n",
713
- "7168 2048 1.0 blocks.11.ffn.key.weight\r\n"
714
- ]
715
- },
716
- {
717
- "name": "stdout",
718
- "output_type": "stream",
719
- "text": [
720
- "2048 2048 0 blocks.11.ffn.receptance.weight\r\n",
721
- "2048 7168 0 blocks.11.ffn.value.weight\r\n"
722
- ]
723
- },
724
- {
725
- "name": "stdout",
726
- "output_type": "stream",
727
- "text": [
728
- "50277 2048 0.5 head.weight\r\n"
729
- ]
730
- }
731
- ],
732
- "source": [
733
- "# Init the model\n",
734
- "!cd \"{TRAINER_DIR}\" && \\\n",
735
- " python3 ./init_model.py \\\n",
736
- " --n_layer {LAYER_COUNT} --n_embd {EMBED_SIZE} \\\n",
737
- " --emb-scale \"{EMBED_SCALE}\" \\\n",
738
- " --vocab_size neox --skip-if-exists \\\n",
739
- " \"../model/{FILENAME_PREFIX}-neox-v5base-init.pth\""
740
- ]
741
- },
742
- {
743
- "cell_type": "markdown",
744
- "id": "0c176d9f",
745
- "metadata": {
746
- "papermill": {
747
- "duration": 0.008403,
748
- "end_time": "2023-09-29T09:57:57.000220",
749
- "exception": false,
750
- "start_time": "2023-09-29T09:57:56.991817",
751
- "status": "completed"
752
- },
753
- "tags": []
754
- },
755
- "source": [
756
- "## Enwiki Stage 1 : Foundation 4k model training"
757
- ]
758
- },
759
- {
760
- "cell_type": "code",
761
- "execution_count": 4,
762
- "id": "bd55a062",
763
- "metadata": {
764
- "execution": {
765
- "iopub.execute_input": "2023-09-29T09:57:57.020044Z",
766
- "iopub.status.busy": "2023-09-29T09:57:57.019632Z",
767
- "iopub.status.idle": "2023-09-29T09:58:08.660786Z",
768
- "shell.execute_reply": "2023-09-29T09:58:08.660057Z"
769
- },
770
- "papermill": {
771
- "duration": 11.65489,
772
- "end_time": "2023-09-29T09:58:08.664002",
773
- "exception": false,
774
- "start_time": "2023-09-29T09:57:57.009112",
775
- "status": "completed"
776
- },
777
- "tags": []
778
- },
779
- "outputs": [
780
- {
781
- "name": "stdout",
782
- "output_type": "stream",
783
- "text": [
784
- "\r",
785
- "Saving the dataset (0/3 shards): 0%| | 0/54401 [00:00<?, ? examples/s]"
786
- ]
787
- },
788
- {
789
- "name": "stdout",
790
- "output_type": "stream",
791
- "text": [
792
- "\r",
793
- "Saving the dataset (0/3 shards): 4%| | 2000/54401 [00:00<00:03, 15197.33 examp"
794
- ]
795
- },
796
- {
797
- "name": "stdout",
798
- "output_type": "stream",
799
- "text": [
800
- "\r",
801
- "Saving the dataset (0/3 shards): 7%| | 4000/54401 [00:00<00:03, 15929.46 examp"
802
- ]
803
- },
804
- {
805
- "name": "stdout",
806
- "output_type": "stream",
807
- "text": [
808
- "\r",
809
- "Saving the dataset (0/3 shards): 11%| | 6000/54401 [00:00<00:02, 16418.37 examp"
810
- ]
811
- },
812
- {
813
- "name": "stdout",
814
- "output_type": "stream",
815
- "text": [
816
- "\r",
817
- "Saving the dataset (0/3 shards): 15%|▏| 8000/54401 [00:00<00:02, 16923.89 examp"
818
- ]
819
- },
820
- {
821
- "name": "stdout",
822
- "output_type": "stream",
823
- "text": [
824
- "\r",
825
- "Saving the dataset (0/3 shards): 18%|▏| 10000/54401 [00:00<00:02, 17273.31 exam"
826
- ]
827
- },
828
- {
829
- "name": "stdout",
830
- "output_type": "stream",
831
- "text": [
832
- "\r",
833
- "Saving the dataset (0/3 shards): 22%|▏| 12000/54401 [00:00<00:02, 17662.61 exam"
834
- ]
835
- },
836
- {
837
- "name": "stdout",
838
- "output_type": "stream",
839
- "text": [
840
- "\r",
841
- "Saving the dataset (0/3 shards): 26%|▎| 14000/54401 [00:00<00:02, 17923.49 exam"
842
- ]
843
- },
844
- {
845
- "name": "stdout",
846
- "output_type": "stream",
847
- "text": [
848
- "\r",
849
- "Saving the dataset (0/3 shards): 29%|▎| 16000/54401 [00:00<00:02, 18184.27 exam"
850
- ]
851
- },
852
- {
853
- "name": "stdout",
854
- "output_type": "stream",
855
- "text": [
856
- "\r",
857
- "Saving the dataset (0/3 shards): 33%|▎| 18000/54401 [00:01<00:01, 18438.75 exam\r",
858
- "Saving the dataset (1/3 shards): 33%|▎| 18134/54401 [00:01<00:01, 18438.75 exam"
859
- ]
860
- },
861
- {
862
- "name": "stdout",
863
- "output_type": "stream",
864
- "text": [
865
- "\r",
866
- "Saving the dataset (1/3 shards): 37%|▎| 20134/54401 [00:01<00:01, 17356.03 exam"
867
- ]
868
- },
869
- {
870
- "name": "stdout",
871
- "output_type": "stream",
872
- "text": [
873
- "\r",
874
- "Saving the dataset (1/3 shards): 41%|▍| 22134/54401 [00:01<00:01, 17970.31 exam"
875
- ]
876
- },
877
- {
878
- "name": "stdout",
879
- "output_type": "stream",
880
- "text": [
881
- "\r",
882
- "Saving the dataset (1/3 shards): 44%|▍| 24134/54401 [00:01<00:01, 18401.36 exam"
883
- ]
884
- },
885
- {
886
- "name": "stdout",
887
- "output_type": "stream",
888
- "text": [
889
- "\r",
890
- "Saving the dataset (1/3 shards): 48%|▍| 26134/54401 [00:01<00:01, 18772.52 exam"
891
- ]
892
- },
893
- {
894
- "name": "stdout",
895
- "output_type": "stream",
896
- "text": [
897
- "\r",
898
- "Saving the dataset (1/3 shards): 52%|▌| 28134/54401 [00:01<00:01, 19015.25 exam"
899
- ]
900
- },
901
- {
902
- "name": "stdout",
903
- "output_type": "stream",
904
- "text": [
905
- "\r",
906
- "Saving the dataset (1/3 shards): 55%|▌| 30134/54401 [00:01<00:01, 19175.86 exam"
907
- ]
908
- },
909
- {
910
- "name": "stdout",
911
- "output_type": "stream",
912
- "text": [
913
- "\r",
914
- "Saving the dataset (1/3 shards): 59%|▌| 32134/54401 [00:01<00:01, 19340.44 exam"
915
- ]
916
- },
917
- {
918
- "name": "stdout",
919
- "output_type": "stream",
920
- "text": [
921
- "\r",
922
- "Saving the dataset (1/3 shards): 63%|▋| 34134/54401 [00:01<00:01, 19458.62 exam"
923
- ]
924
- },
925
- {
926
- "name": "stdout",
927
- "output_type": "stream",
928
- "text": [
929
- "\r",
930
- "Saving the dataset (1/3 shards): 67%|▋| 36268/54401 [00:01<00:00, 19480.21 exam\r",
931
- "Saving the dataset (2/3 shards): 67%|▋| 36268/54401 [00:01<00:00, 19480.21 exam"
932
- ]
933
- },
934
- {
935
- "name": "stdout",
936
- "output_type": "stream",
937
- "text": [
938
- "\r",
939
- "Saving the dataset (2/3 shards): 72%|▋| 39268/54401 [00:02<00:00, 19488.59 exam"
940
- ]
941
- },
942
- {
943
- "name": "stdout",
944
- "output_type": "stream",
945
- "text": [
946
- "\r",
947
- "Saving the dataset (2/3 shards): 80%|▊| 43268/54401 [00:02<00:00, 19830.22 exam"
948
- ]
949
- },
950
- {
951
- "name": "stdout",
952
- "output_type": "stream",
953
- "text": [
954
- "\r",
955
- "Saving the dataset (2/3 shards): 87%|▊| 47268/54401 [00:02<00:00, 20058.57 exam"
956
- ]
957
- },
958
- {
959
- "name": "stdout",
960
- "output_type": "stream",
961
- "text": [
962
- "\r",
963
- "Saving the dataset (2/3 shards): 94%|▉| 51268/54401 [00:02<00:00, 20178.13 exam"
964
- ]
965
- },
966
- {
967
- "name": "stdout",
968
- "output_type": "stream",
969
- "text": [
970
- "\r",
971
- "Saving the dataset (2/3 shards): 100%|█| 54401/54401 [00:02<00:00, 20197.74 exam\r",
972
- "Saving the dataset (3/3 shards): 100%|█| 54401/54401 [00:02<00:00, 20197.74 exam\r",
973
- "Saving the dataset (3/3 shards): 100%|█| 54401/54401 [00:02<00:00, 18877.90 exam\r\n",
974
- "\r",
975
- "Saving the dataset (0/1 shards): 0%| | 0/109 [00:00<?, ? examples/s]"
976
- ]
977
- },
978
- {
979
- "name": "stdout",
980
- "output_type": "stream",
981
- "text": [
982
- "\r",
983
- "Saving the dataset (1/1 shards): 100%|█| 109/109 [00:00<00:00, 7330.11 examples/\r",
984
- "Saving the dataset (1/1 shards): 100%|█| 109/109 [00:00<00:00, 7058.50 examples/\r\n"
985
- ]
986
- }
987
- ],
988
- "source": [
989
- "# Lets preload the requried dataset \n",
990
- "!cd \"{TRAINER_DIR}\" && \\\n",
991
- " python3 preload_datapath.py \"{NOTEBOOK_DIR}/enwiki-4k-part1.yaml\""
992
- ]
993
- },
994
- {
995
- "cell_type": "code",
996
- "execution_count": 5,
997
- "id": "5e1ede96",
998
- "metadata": {
999
- "execution": {
1000
- "iopub.execute_input": "2023-09-29T09:58:08.702501Z",
1001
- "iopub.status.busy": "2023-09-29T09:58:08.701875Z",
1002
- "iopub.status.idle": "2023-09-29T10:00:21.515956Z",
1003
- "shell.execute_reply": "2023-09-29T10:00:21.514599Z"
1004
- },
1005
- "papermill": {
1006
- "duration": 132.843495,
1007
- "end_time": "2023-09-29T10:00:21.518814",
1008
- "exception": false,
1009
- "start_time": "2023-09-29T09:58:08.675319",
1010
- "status": "completed"
1011
- },
1012
- "tags": []
1013
- },
1014
- "outputs": [
1015
- {
1016
- "name": "stdout",
1017
- "output_type": "stream",
1018
- "text": [
1019
- "[2023-09-29 09:58:12,868] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n"
1020
- ]
1021
- },
1022
- {
1023
- "name": "stdout",
1024
- "output_type": "stream",
1025
- "text": [
1026
- "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n"
1027
- ]
1028
- },
1029
- {
1030
- "name": "stdout",
1031
- "output_type": "stream",
1032
- "text": [
1033
- "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py:484: UserWarning: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/enwiki-4k-part1.yaml', '--trainer.logger.init_args.name=[Multi-size] v5-L12-D2048-E0.01 - Enwiki-4k Part 1 (train-ctx=4k, deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.devices=auto', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-L12-D2048-E0_01-enwiki-4k-p1/', '--model.load_model=../model/v5-L12-D2048-E0_01-neox-v5base-init.pth', '--model.ctx_len=4096', '--model.bptt_learning_range=1'], args=['fit', '-c', '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/enwiki-4k-part1.yaml', '--trainer.logger.init_args.name=[Multi-size] v5-L12-D2048-E0.01 - Enwiki-4k Part 1 (train-ctx=4k, deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.devices=auto', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-L12-D2048-E0_01-enwiki-4k-p1/', '--model.load_model=../model/v5-L12-D2048-E0_01-neox-v5base-init.pth', '--model.ctx_len=4096', '--model.bptt_learning_range=1'].\r\n",
1034
- " rank_zero_warn(\r\n"
1035
- ]
1036
- },
1037
- {
1038
- "name": "stdout",
1039
- "output_type": "stream",
1040
- "text": [
1041
- "/usr/local/lib/python3.10/dist-packages/lightning/fabric/utilities/seed.py:39: UserWarning: No seed found, seed set to 207026176\r\n",
1042
- " rank_zero_warn(f\"No seed found, seed set to {seed}\")\r\n",
1043
- "Global seed set to 207026176\r\n"
1044
- ]
1045
- },
1046
- {
1047
- "name": "stdout",
1048
- "output_type": "stream",
1049
- "text": [
1050
- "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mpicocreator\u001b[0m (\u001b[33mrwkv-x-dev\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\r\n"
1051
- ]
1052
- },
1053
- {
1054
- "name": "stdout",
1055
- "output_type": "stream",
1056
- "text": [
1057
- "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.15.11\r\n",
1058
- "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m./wandb/run-20230929_095815-3rwyj6ei\u001b[0m\r\n",
1059
- "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\r\n",
1060
- "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33m[Multi-size] v5-L12-D2048-E0.01 - Enwiki-4k Part 1 (train-ctx=4k, deepspeed_stage_1)\u001b[0m\r\n",
1061
- "\u001b[34m\u001b[1mwandb\u001b[0m: ⭐️ View project at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments\u001b[0m\r\n",
1062
- "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/runs/3rwyj6ei\u001b[0m\r\n"
1063
- ]
1064
- },
1065
- {
1066
- "name": "stdout",
1067
- "output_type": "stream",
1068
- "text": [
1069
- "GPU available: True (cuda), used: True\r\n",
1070
- "TPU available: False, using: 0 TPU cores\r\n",
1071
- "IPU available: False, using: 0 IPUs\r\n",
1072
- "HPU available: False, using: 0 HPUs\r\n",
1073
- "\r\n",
1074
- "\r\n",
1075
- "[RWKV.Trainer] Applying 'target_batch_size' with the following:\r\n",
1076
- " - target_batch_size: 32\r\n",
1077
- " - num_nodes: 1\r\n",
1078
- " - num_devices: 1\r\n",
1079
- " - accumulate_grad_batches: 32\r\n",
1080
- " - effective_batch_size: 32\r\n",
1081
- "\r\n"
1082
- ]
1083
- },
1084
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- ... (tqdm frames: "Saving the dataset" 0/3 -> 3/3 shards, 54401/54401 examples, then 1/1 shards, 109/109 examples) ...
- ]
- },
1272
- {
1273
- "name": "stdout",
1274
- "output_type": "stream",
1275
- "text": [
1276
- "[rank: 0] Global seed set to 207026176\r\n",
1277
- "initializing deepspeed distributed: GLOBAL_RANK: 0, MEMBER: 1/1\r\n",
1278
- "[2023-09-29 09:58:33,172] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented\r\n"
1279
- ]
1280
- },
1281
- {
1282
- "name": "stdout",
1283
- "output_type": "stream",
1284
- "text": [
1285
- "Enabling DeepSpeed BF16.\r\n"
1286
- ]
1287
- },
1288
- {
1289
- "name": "stdout",
1290
- "output_type": "stream",
1291
- "text": [
1292
- "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\r\n",
1293
- "#\r\n",
1294
- "# RWKV lighting_trainer.py important notes \r\n",
1295
- "# https://github.com/RWKV/RWKV-infctx-trainer \r\n",
1296
- "#\r\n",
1297
- "# - Ensure your host is not running cuda 12.0 (use either 11.8, or >=12.1), as this is known to have freeze issues\r\n",
1298
- "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\r\n",
1299
- "# - When resuming from checkpoint, the estimated time is inaccurate\r\n",
1300
- "#\r\n",
1301
- "\r\n",
1302
- "[RWKV.model] Configuring optimizer with\r\n",
1303
- " - lr_init: 6.000e-04 (0.0006)\r\n",
1304
- " - lr_final: 5.000e-04 (0.0005)\r\n",
1305
- "\r\n",
1306
- "Using /root/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...\r\n"
1307
- ]
1308
- },
1309
- {
1310
- "name": "stdout",
1311
- "output_type": "stream",
1312
- "text": [
1313
- "Detected CUDA files, patching ldflags\r\n",
1314
- "Emitting ninja build file /root/.cache/torch_extensions/py310_cu118/fused_adam/build.ninja...\r\n",
1315
- "Building extension module fused_adam...\r\n",
1316
- "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\r\n"
1317
- ]
1318
- },
1319
- {
1320
- "name": "stdout",
1321
- "output_type": "stream",
1322
- "text": [
1323
- "ninja: no work to do.\r\n",
1324
- "Loading extension module fused_adam...\r\n",
1325
- "Time to load fused_adam op: 0.07915163040161133 seconds\r\n",
1326
- "Loading `train_dataloader` to estimate number of stepping batches.\r\n"
1327
- ]
1328
- },
1329
- {
1330
- "name": "stdout",
1331
- "output_type": "stream",
1332
- "text": [
1333
- "Rank: 0 partition count [1, 1] and sizes[(860549120, False), (768, False)] \r\n"
1334
- ]
1335
- },
1336
- {
1337
- "name": "stdout",
1338
- "output_type": "stream",
1339
- "text": [
1340
- "\r\n",
1341
- " | Name | Type | Params\r\n",
1342
- "--------------------------------------\r\n",
1343
- "0 | emb | Embedding | 102 M \r\n",
1344
- "1 | blocks | ModuleList | 654 M \r\n",
1345
- "2 | ln_out | LayerNorm | 4.1 K \r\n",
1346
- "3 | head | Linear | 102 M \r\n",
1347
- "--------------------------------------\r\n",
1348
- "860 M Trainable params\r\n",
1349
- "0 Non-trainable params\r\n",
1350
- "860 M Total params\r\n",
1351
- "3,442.200 Total estimated model params size (MB)\r\n"
1352
- ]
1353
- },
1354
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Training: 0it [00:00, ?it/s]\r",
- ... (Epoch 0 progress-bar frames, steps 1-63 of 54401, settling to ~1.4 s/it, train/loss falling from ~11.0 to ~9.4) ...
- ]
- },
1945
- {
1946
- "name": "stdout",
1947
- "output_type": "stream",
1948
- "text": [
1949
- "Traceback (most recent call last):\r\n",
1950
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 278, in <module>\r\n",
1951
- " cli_main()\r\n",
1952
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 253, in cli_main\r\n",
1953
- " LightningCLI(\r\n",
1954
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 353, in __init__\r\n",
1955
- " self._run_subcommand(self.subcommand)\r\n",
1956
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 642, in _run_subcommand\r\n",
1957
- " fn(**fn_kwargs)\r\n",
1958
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py\", line 529, in fit\r\n",
1959
- " call._call_and_handle_interrupt(\r\n",
1960
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/call.py\", line 41, in _call_and_handle_interrupt\r\n",
1961
- " return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)\r\n",
1962
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/strategies/launchers/subprocess_script.py\", line 91, in launch\r\n",
1963
- " return function(*args, **kwargs)\r\n",
1964
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py\", line 568, in _fit_impl\r\n",
1965
- " self._run(model, ckpt_path=ckpt_path)\r\n",
1966
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py\", line 973, in _run\r\n",
1967
- " results = self._run_stage()\r\n",
1968
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py\", line 1016, in _run_stage\r\n",
1969
- " self.fit_loop.run()\r\n",
1970
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/fit_loop.py\", line 201, in run\r\n",
1971
- " self.advance()\r\n",
1972
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/fit_loop.py\", line 354, in advance\r\n",
1973
- " self.epoch_loop.run(self._data_fetcher)\r\n",
1974
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/training_epoch_loop.py\", line 133, in run\r\n",
1975
- " self.advance(data_fetcher)\r\n",
1976
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/training_epoch_loop.py\", line 218, in advance\r\n",
1977
- " batch_output = self.automatic_optimization.run(trainer.optimizers[0], kwargs)\r\n",
1978
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/optimization/automatic.py\", line 185, in run\r\n",
1979
- " self._optimizer_step(kwargs.get(\"batch_idx\", 0), closure)\r\n",
1980
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/optimization/automatic.py\", line 260, in _optimizer_step\r\n",
1981
- " call._call_lightning_module_hook(\r\n",
1982
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/call.py\", line 144, in _call_lightning_module_hook\r\n",
1983
- " output = fn(*args, **kwargs)\r\n",
1984
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/core/module.py\", line 1256, in optimizer_step\r\n",
1985
- " optimizer.step(closure=optimizer_closure)\r\n",
1986
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/core/optimizer.py\", line 155, in step\r\n",
1987
- " step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)\r\n",
1988
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/strategies/ddp.py\", line 256, in optimizer_step\r\n",
1989
- " optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs)\r\n",
1990
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/strategies/strategy.py\", line 225, in optimizer_step\r\n",
1991
- " return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)\r\n",
1992
- " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/plugins/precision/deepspeed.py\", line 102, in optimizer_step\r\n",
1993
- " return deepspeed_engine.step(**kwargs)\r\n",
1994
- " File \"/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py\", line 2087, in step\r\n",
1995
- " self._take_model_step(lr_kwargs)\r\n",
1996
- " File \"/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py\", line 1994, in _take_model_step\r\n",
1997
- " self.optimizer.step()\r\n",
1998
- " File \"/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage_1_and_2.py\", line 1715, in step\r\n",
1999
- " int(self.partition_size[i])).to(self.single_partition_of_fp32_groups[i].dtype)\r\n",
2000
- "torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.21 GiB (GPU 0; 22.19 GiB total capacity; 14.81 GiB already allocated; 2.39 GiB free; 18.54 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF\r\n",
2001
- "\u001b[34m\u001b[1mwandb\u001b[0m: Waiting for W&B process to finish... \u001b[31m(failed 1).\u001b[0m Press Control-C to abort syncing.\r\n"
2002
- ]
2003
- },
2004
- {
2005
- "name": "stdout",
2006
- "output_type": "stream",
2007
- "text": [
2008
- "\u001b[34m\u001b[1mwandb\u001b[0m: - 0.005 MB of 0.005 MB uploaded (0.000 MB deduped)\r"
2009
- ]
2010
- },
2011
- {
2012
- "name": "stdout",
2013
- "output_type": "stream",
2014
- "text": [
2015
- "\u001b[34m\u001b[1mwandb\u001b[0m: \\ 0.005 MB of 0.005 MB uploaded (0.000 MB deduped)\r"
2016
- ]
2017
- },
2018
- {
2019
- "name": "stdout",
2020
- "output_type": "stream",
2021
- "text": [
2022
- "\u001b[34m\u001b[1mwandb\u001b[0m: | 0.005 MB of 0.005 MB uploaded (0.000 MB deduped)\r"
2023
- ]
2024
- },
2025
- {
2026
- "name": "stdout",
2027
- "output_type": "stream",
2028
- "text": [
2029
- "\u001b[34m\u001b[1mwandb\u001b[0m: / 0.005 MB of 0.005 MB uploaded (0.000 MB deduped)\r"
2030
- ]
2031
- },
2032
- {
2033
- "name": "stdout",
2034
- "output_type": "stream",
2035
- "text": [
2036
- "\u001b[34m\u001b[1mwandb\u001b[0m: \r\n",
2037
- "\u001b[34m\u001b[1mwandb\u001b[0m: Run history:\r\n",
2038
- "\u001b[34m\u001b[1mwandb\u001b[0m: batchidx ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███\r\n",
2039
- "\u001b[34m\u001b[1mwandb\u001b[0m: global_rank ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁\r\n",
2040
- "\u001b[34m\u001b[1mwandb\u001b[0m: perf/tokens_per_sec.gpu.0 ▁▁▃▃▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇███████████████\r\n",
2041
- "\u001b[34m\u001b[1mwandb\u001b[0m: perf/tokens_total.gpu.0 ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███\r\n",
2042
- "\u001b[34m\u001b[1mwandb\u001b[0m: real_ctx_len ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁\r\n",
2043
- "\u001b[34m\u001b[1mwandb\u001b[0m: substep ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███\r\n",
2044
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/loss ████████████████████▂▁▃▂▂▂▃▂▂▂▂▂▃▂▂▂▃▂▃▃\r\n",
2045
- "\u001b[34m\u001b[1mwandb\u001b[0m: trainer/global_step ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁████████████████████\r\n",
2046
- "\u001b[34m\u001b[1mwandb\u001b[0m: trainer/learning_rate ████████████████████▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁\r\n",
2047
- "\u001b[34m\u001b[1mwandb\u001b[0m: \r\n",
2048
- "\u001b[34m\u001b[1mwandb\u001b[0m: Run summary:\r\n",
2049
- "\u001b[34m\u001b[1mwandb\u001b[0m: batchidx 63\r\n",
2050
- "\u001b[34m\u001b[1mwandb\u001b[0m: global_rank 0\r\n",
2051
- "\u001b[34m\u001b[1mwandb\u001b[0m: perf/tokens_per_sec.gpu.0 3052.94065\r\n",
2052
- "\u001b[34m\u001b[1mwandb\u001b[0m: perf/tokens_total.gpu.0 262080\r\n",
2053
- "\u001b[34m\u001b[1mwandb\u001b[0m: real_ctx_len 4095\r\n",
2054
- "\u001b[34m\u001b[1mwandb\u001b[0m: substep 63\r\n",
2055
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/loss 9.625\r\n",
2056
- "\u001b[34m\u001b[1mwandb\u001b[0m: trainer/global_step 1\r\n",
2057
- "\u001b[34m\u001b[1mwandb\u001b[0m: trainer/learning_rate 0.0006\r\n",
2058
- "\u001b[34m\u001b[1mwandb\u001b[0m: \r\n",
2059
- "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run \u001b[33m[Multi-size] v5-L12-D2048-E0.01 - Enwiki-4k Part 1 (train-ctx=4k, deepspeed_stage_1)\u001b[0m at: \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/runs/3rwyj6ei\u001b[0m\r\n",
2060
- "\u001b[34m\u001b[1mwandb\u001b[0m: ️⚡ View job at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjk0OTk4MDcy/version_details/v4\u001b[0m\r\n",
2061
- "\u001b[34m\u001b[1mwandb\u001b[0m: Synced 5 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)\r\n",
2062
- "\u001b[34m\u001b[1mwandb\u001b[0m: Find logs at: \u001b[35m\u001b[1m./wandb/run-20230929_095815-3rwyj6ei/logs\u001b[0m\r\n"
2063
- ]
2064
- }
2065
- ],
2066
- "source": [
2067
- "# Start the foundation model training\n",
2068
- "!cd \"{TRAINER_DIR}\" && \\\n",
2069
- " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n",
2070
- " python3 lightning_trainer.py fit \\\n",
2071
- " -c \"{NOTEBOOK_DIR}/enwiki-4k-part1.yaml\" \\\n",
2072
- " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Enwiki-4k Part 1 (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n",
2073
- " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n",
2074
- " --trainer.devices=\"{GPU_DEVICES}\" \\\n",
2075
- " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-enwiki-4k-p1/\" \\\n",
2076
- " --model.load_model=\"../model/{FILENAME_PREFIX}-neox-v5base-init.pth\" \\\n",
2077
- " --model.ctx_len=4096 \\\n",
2078
- " --model.bptt_learning_range=1"
2079
- ]
2080
- },
2081
- {
2082
- "cell_type": "code",
2083
- "execution_count": 6,
2084
- "id": "73f2dbdc",
2085
- "metadata": {
2086
- "execution": {
2087
- "iopub.execute_input": "2023-09-29T10:00:21.556396Z",
2088
- "iopub.status.busy": "2023-09-29T10:00:21.555187Z",
2089
- "iopub.status.idle": "2023-09-29T10:00:25.260439Z",
2090
- "shell.execute_reply": "2023-09-29T10:00:25.259001Z"
2091
- },
2092
- "papermill": {
2093
- "duration": 3.72759,
2094
- "end_time": "2023-09-29T10:00:25.263203",
2095
- "exception": false,
2096
- "start_time": "2023-09-29T10:00:21.535613",
2097
- "status": "completed"
2098
- },
2099
- "tags": []
2100
- },
2101
- "outputs": [
2102
- {
2103
- "name": "stdout",
2104
- "output_type": "stream",
2105
- "text": [
2106
- "[2023-09-29 10:00:23,854] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n"
2107
- ]
2108
- },
2109
- {
2110
- "name": "stdout",
2111
- "output_type": "stream",
2112
- "text": [
2113
- "Traceback (most recent call last):\r\n",
2114
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 651, in <module>\r\n",
2115
- " convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, output_file, save_dtype=args.dtype)\r\n",
2116
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 542, in convert_zero_checkpoint_to_fp32_state_dict\r\n",
2117
- " state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)\r\n",
2118
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 516, in get_fp32_state_dict_from_zero_checkpoint\r\n",
2119
- " raise ValueError(f\"Unable to find 'latest' file at {latest_path}\")\r\n",
2120
- "ValueError: Unable to find 'latest' file at ../checkpoint/v5-L12-D2048-E0_01-enwiki-4k-p1/last.ckpt/latest\r\n"
2121
- ]
2122
- },
2123
- {
2124
- "name": "stdout",
2125
- "output_type": "stream",
2126
- "text": [
2127
- "ls: cannot access '../model/v5-L12-D2048-E0_01-enwiki-4k-p1.pth': No such file or directory\r\n"
2128
- ]
2129
- }
2130
- ],
2131
- "source": [
2132
-    "# Let's export the model from the checkpoint\n",
2133
- "!cd \"{TRAINER_DIR}\" && \\\n",
2134
- " python3 export_checkpoint.py \"../checkpoint/{FILENAME_PREFIX}-enwiki-4k-p1/last.ckpt\" \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\" \"bf16\"\n",
2135
- "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\""
2136
- ]
2137
- },
2138
- {
2139
- "cell_type": "code",
2140
- "execution_count": 7,
2141
- "id": "9b1932b1",
2142
- "metadata": {
2143
- "execution": {
2144
- "iopub.execute_input": "2023-09-29T10:00:25.302083Z",
2145
- "iopub.status.busy": "2023-09-29T10:00:25.300897Z",
2146
- "iopub.status.idle": "2023-09-29T10:00:31.273775Z",
2147
- "shell.execute_reply": "2023-09-29T10:00:31.272586Z"
2148
- },
2149
- "papermill": {
2150
- "duration": 5.996558,
2151
- "end_time": "2023-09-29T10:00:31.277049",
2152
- "exception": false,
2153
- "start_time": "2023-09-29T10:00:25.280491",
2154
- "status": "completed"
2155
- },
2156
- "tags": []
2157
- },
2158
- "outputs": [
2159
- {
2160
- "name": "stdout",
2161
- "output_type": "stream",
2162
- "text": [
2163
- "[2023-09-29 10:00:29,417] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n"
2164
- ]
2165
- },
2166
- {
2167
- "name": "stdout",
2168
- "output_type": "stream",
2169
- "text": [
2170
- "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n",
2171
- "Traceback (most recent call last):\r\n",
2172
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/dragon_test.py\", line 52, in <module>\r\n",
2173
- " model = SimpleRWKV(MODEL_PATH, device=DEVICE)\r\n",
2174
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 1420, in __init__\r\n",
2175
- " self.model = RWKV(**model_config)\r\n",
2176
- " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 566, in __init__\r\n",
2177
- " raise ValueError(f\"load_model file '{load_model}' does not exist\")\r\n",
2178
- "ValueError: load_model file '../model/v5-L12-D2048-E0_01-enwiki-4k-p1.pth' does not exist\r\n"
2179
- ]
2180
- }
2181
- ],
2182
- "source": [
2183
-    "# Let's do a quick dragon prompt validation\n",
2184
- "!cd \"{INFERENCE_DIR}\" && \\\n",
2185
- " python3 dragon_test.py \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\" \"cuda fp32\""
2186
- ]
2187
- }
2188
- ],
2189
- "metadata": {
2190
- "kernelspec": {
2191
- "display_name": "Python 3 (ipykernel)",
2192
- "language": "python",
2193
- "name": "python3"
2194
- },
2195
- "language_info": {
2196
- "codemirror_mode": {
2197
- "name": "ipython",
2198
- "version": 3
2199
- },
2200
- "file_extension": ".py",
2201
- "mimetype": "text/x-python",
2202
- "name": "python",
2203
- "nbconvert_exporter": "python",
2204
- "pygments_lexer": "ipython3",
2205
- "version": "3.10.12"
2206
- },
2207
- "papermill": {
2208
- "default_parameters": {},
2209
- "duration": 201.43354,
2210
- "end_time": "2023-09-29T10:00:31.714898",
2211
- "environment_variables": {},
2212
- "exception": null,
2213
- "input_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-part1.ipynb",
2214
- "output_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-part1.ipynb",
2215
- "parameters": {},
2216
- "start_time": "2023-09-29T09:57:10.281358",
2217
- "version": "2.4.0"
2218
- }
2219
- },
2220
- "nbformat": 4,
2221
- "nbformat_minor": 5
2222
- }
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b53c27ed2c20b9f1f690647a83c0fbe2ce09594518b9ec557f515a4f8b548f2b
3
+ size 15941299
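
Note on the added lines: the three "+" lines above are a Git LFS pointer, which replaces the notebook's raw JSON (the removed hunk above, ending in the truncated training logs, CUDA out-of-memory traceback, and failed checkpoint-export output) with a reference to an object identified by its SHA-256 digest and byte size. As an illustrative sketch only (the helper name and local path are assumptions, not part of this commit), the pointer format can be read in Python like this:

    # Minimal sketch: parse a Git LFS pointer file of the form
    #   version <spec-url>
    #   oid sha256:<hex digest>
    #   size <bytes>
    from pathlib import Path

    def read_lfs_pointer(path: str) -> dict:
        # Each non-empty line is "key value"; split on the first space only.
        fields = dict(line.split(" ", 1) for line in Path(path).read_text().splitlines() if line)
        return {
            "version": fields["version"],
            "oid": fields["oid"].removeprefix("sha256:"),
            "size_bytes": int(fields["size"]),
        }

    # Hypothetical usage against the file added by this commit:
    print(read_lfs_pointer("experiment/rwkv-x-exp/multi-size-train/v5-L12-D2048-part1.ipynb"))
    # e.g. {'version': 'https://git-lfs.github.com/spec/v1', 'oid': 'b53c27ed...', 'size_bytes': 15941299}

The notebook content itself is no longer stored as plain text in the repository; it would typically be fetched through git-lfs (for example, "git lfs pull --include=<path>", assuming git-lfs is installed).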