[GHA] experiment/rwkv-x-exp/v5-headsize2x/v5-L6-D2048-E1e-1-ctx4k-part1.ipynb result notebooks

#25
by picocreator - opened
experiment/rwkv-x-exp/v5-headsize2x/v5-L6-D2048-E1e-1-ctx4k-part1.ipynb ADDED
@@ -0,0 +1,747 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "attachments": {},
5
+ "cell_type": "markdown",
6
+ "id": "4c73afb6",
7
+ "metadata": {
8
+ "papermill": {
9
+ "duration": 0.003926,
10
+ "end_time": "2023-08-25T16:07:12.117562",
11
+ "exception": false,
12
+ "start_time": "2023-08-25T16:07:12.113636",
13
+ "status": "completed"
14
+ },
15
+ "tags": []
16
+ },
17
+ "source": [
18
+ "# RWKV v5-headsize2x / embedding init-range 1e-01 / 4k\n",
19
+ "\n",
20
+ "- 6 layers\n",
21
+ "- 4096 embedding size\n",
22
+ "\n",
23
+ "Going through the modified memory training for v5 models, across various initial embedding model weights\n",
24
+ "\n",
25
+ "**Note:** This project assumes you have the rwkv-infctx conda env set up"
26
+ ]
27
+ },
28
+ {
29
+ "attachments": {},
30
+ "cell_type": "markdown",
31
+ "id": "393e4299",
32
+ "metadata": {
33
+ "papermill": {
34
+ "duration": 0.002494,
35
+ "end_time": "2023-08-25T16:07:12.122625",
36
+ "exception": false,
37
+ "start_time": "2023-08-25T16:07:12.120131",
38
+ "status": "completed"
39
+ },
40
+ "tags": []
41
+ },
42
+ "source": [
43
+ "# Basic Setup"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "code",
48
+ "execution_count": 1,
49
+ "id": "e8229f09",
50
+ "metadata": {
51
+ "execution": {
52
+ "iopub.execute_input": "2023-08-25T16:07:12.129239Z",
53
+ "iopub.status.busy": "2023-08-25T16:07:12.128559Z",
54
+ "iopub.status.idle": "2023-08-25T16:07:12.847564Z",
55
+ "shell.execute_reply": "2023-08-25T16:07:12.846564Z"
56
+ },
57
+ "papermill": {
58
+ "duration": 0.724082,
59
+ "end_time": "2023-08-25T16:07:12.849402",
60
+ "exception": false,
61
+ "start_time": "2023-08-25T16:07:12.125320",
62
+ "status": "completed"
63
+ },
64
+ "tags": []
65
+ },
66
+ "outputs": [],
67
+ "source": [
68
+ "# First, let's set up the various directories, and init the model\n",
69
+ "!mkdir -p ../../../../model/\n",
70
+ "!mkdir -p ../../../../datapath/\n",
71
+ "!mkdir -p ../../../../checkpoint/"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": 2,
77
+ "id": "c7a42dc0",
78
+ "metadata": {
79
+ "execution": {
80
+ "iopub.execute_input": "2023-08-25T16:07:12.855853Z",
81
+ "iopub.status.busy": "2023-08-25T16:07:12.855653Z",
82
+ "iopub.status.idle": "2023-08-25T16:07:15.711591Z",
83
+ "shell.execute_reply": "2023-08-25T16:07:15.710680Z"
84
+ },
85
+ "papermill": {
86
+ "duration": 2.861479,
87
+ "end_time": "2023-08-25T16:07:15.713370",
88
+ "exception": false,
89
+ "start_time": "2023-08-25T16:07:12.851891",
90
+ "status": "completed"
91
+ },
92
+ "tags": []
93
+ },
94
+ "outputs": [
95
+ {
96
+ "name": "stdout",
97
+ "output_type": "stream",
98
+ "text": [
99
+ "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\r\n",
100
+ "\u001b[0m"
101
+ ]
102
+ },
103
+ {
104
+ "name": "stdout",
105
+ "output_type": "stream",
106
+ "text": [
107
+ "\r\n",
108
+ "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.1.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\r\n",
109
+ "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\r\n"
110
+ ]
111
+ }
112
+ ],
113
+ "source": [
114
+ "# Additional dependencies for eval stuff\n",
115
+ "!pip install -q aiocsv aiofiles"
116
+ ]
117
+ },
118
+ {
119
+ "cell_type": "code",
120
+ "execution_count": 3,
121
+ "id": "6cab8aca",
122
+ "metadata": {
123
+ "execution": {
124
+ "iopub.execute_input": "2023-08-25T16:07:15.720224Z",
125
+ "iopub.status.busy": "2023-08-25T16:07:15.720023Z",
126
+ "iopub.status.idle": "2023-08-25T16:07:15.726216Z",
127
+ "shell.execute_reply": "2023-08-25T16:07:15.725750Z"
128
+ },
129
+ "papermill": {
130
+ "duration": 0.011194,
131
+ "end_time": "2023-08-25T16:07:15.727505",
132
+ "exception": false,
133
+ "start_time": "2023-08-25T16:07:15.716311",
134
+ "status": "completed"
135
+ },
136
+ "tags": []
137
+ },
138
+ "outputs": [
139
+ {
140
+ "name": "stdout",
141
+ "output_type": "stream",
142
+ "text": [
143
+ "DEEPSPEED_STRAT: deepspeed_stage_1\n",
144
+ "ENABLE_WANDB: True\n",
145
+ "GPU_DEVICES: auto\n",
146
+ "NOTEBOOK_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-headsize2x\n",
147
+ "INFERENCE_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5headsize2x\n",
148
+ "TRAINER_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5headsize2x\n",
149
+ "PROJECT_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer\n"
150
+ ]
151
+ }
152
+ ],
153
+ "source": [
154
+ "DEEPSPEED_STRAT=\"deepspeed_stage_1\"\n",
155
+ "GPU_DEVICES=\"auto\"\n",
156
+ "ENABLE_WANDB=True\n",
157
+ "\n",
158
+ "RWKV_WAVENET_LAYERS=1\n",
159
+ "\n",
160
+ "EMBED_SCALE=0.1\n",
161
+ "EMBED_SCALE_LABEL=str(EMBED_SCALE).replace(\".\", \"_\")\n",
162
+ "\n",
163
+ "LAYER_COUNT=6\n",
164
+ "EMBED_DIM=2048\n",
165
+ "\n",
166
+ "WANDB_PREFIX=f\"v5-hs2x-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE}\"\n",
167
+ "FILENAME_PREFIX=f\"v5-hs2x-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE_LABEL}\"\n",
168
+ "\n",
169
+ "print(\"DEEPSPEED_STRAT:\", DEEPSPEED_STRAT)\n",
170
+ "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n",
171
+ "print(\"GPU_DEVICES:\", GPU_DEVICES)\n",
172
+ "\n",
173
+ "if ENABLE_WANDB:\n",
174
+ " WANDB_MODE=\"online\"\n",
175
+ "else:\n",
176
+ " WANDB_MODE=\"disabled\"\n",
177
+ "\n",
178
+ "# Computing the notebook, and various paths\n",
179
+ "import os\n",
180
+ "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n",
181
+ "PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"../../../../\"))\n",
182
+ "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5headsize2x/\"))\n",
183
+ "INFERENCE_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5headsize2x/\"))\n",
184
+ "\n",
185
+ "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n",
186
+ "print(\"INFERENCE_DIR:\", INFERENCE_DIR)\n",
187
+ "print(\"TRAINER_DIR:\", TRAINER_DIR)\n",
188
+ "print(\"PROJECT_DIR:\", PROJECT_DIR)"
189
+ ]
190
+ },
191
+ {
192
+ "cell_type": "code",
193
+ "execution_count": 4,
194
+ "id": "fed724db",
195
+ "metadata": {
196
+ "execution": {
197
+ "iopub.execute_input": "2023-08-25T16:07:15.733547Z",
198
+ "iopub.status.busy": "2023-08-25T16:07:15.733381Z"
199
+ },
200
+ "papermill": {
201
+ "duration": null,
202
+ "end_time": null,
203
+ "exception": false,
204
+ "start_time": "2023-08-25T16:07:15.730189",
205
+ "status": "running"
206
+ },
207
+ "tags": []
208
+ },
209
+ "outputs": [
210
+ {
211
+ "name": "stdout",
212
+ "output_type": "stream",
213
+ "text": [
214
+ "Setting ds_accelerator to cuda (auto detect)\r\n"
215
+ ]
216
+ },
217
+ {
218
+ "name": "stdout",
219
+ "output_type": "stream",
220
+ "text": [
221
+ "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n",
222
+ "---- Initializing model ----\r\n",
223
+ "No of layers: 6\r\n",
224
+ "Embedding size: 4096\r\n",
225
+ "Output model path: ../model/L6-D2048-E0_1-neox-v5base-init.pth\r\n",
226
+ "Vocab size: 50277\r\n",
227
+ "Emb scale: 0.1\r\n",
228
+ "Note: this process takes a significant time (and ram) for large models\r\n",
229
+ "---- ----- ----\r\n"
230
+ ]
231
+ },
232
+ {
233
+ "name": "stdout",
234
+ "output_type": "stream",
235
+ "text": [
236
+ "50277 4096 -0.1 emb.weight\r\n"
237
+ ]
238
+ },
239
+ {
240
+ "name": "stdout",
241
+ "output_type": "stream",
242
+ "text": [
243
+ "4096 4096 1.0 blocks.0.att.receptance.weight\r\n"
244
+ ]
245
+ },
246
+ {
247
+ "name": "stdout",
248
+ "output_type": "stream",
249
+ "text": [
250
+ "4096 4096 1.0 blocks.0.att.key.weight\r\n"
251
+ ]
252
+ },
253
+ {
254
+ "name": "stdout",
255
+ "output_type": "stream",
256
+ "text": [
257
+ "4096 4096 1.0 blocks.0.att.value.weight\r\n"
258
+ ]
259
+ },
260
+ {
261
+ "name": "stdout",
262
+ "output_type": "stream",
263
+ "text": [
264
+ "4096 4096 0 blocks.0.att.output.weight\r\n",
265
+ "16384 4096 1.0 blocks.0.ffn.key.weight\r\n"
266
+ ]
267
+ },
268
+ {
269
+ "name": "stdout",
270
+ "output_type": "stream",
271
+ "text": [
272
+ "4096 4096 0 blocks.0.ffn.receptance.weight\r\n",
273
+ "4096 16384 0 blocks.0.ffn.value.weight\r\n"
274
+ ]
275
+ },
276
+ {
277
+ "name": "stdout",
278
+ "output_type": "stream",
279
+ "text": [
280
+ "4096 4096 1.0 blocks.1.att.receptance.weight\r\n"
281
+ ]
282
+ },
283
+ {
284
+ "name": "stdout",
285
+ "output_type": "stream",
286
+ "text": [
287
+ "4096 4096 1.0 blocks.1.att.key.weight\r\n"
288
+ ]
289
+ },
290
+ {
291
+ "name": "stdout",
292
+ "output_type": "stream",
293
+ "text": [
294
+ "4096 4096 1.0 blocks.1.att.value.weight\r\n"
295
+ ]
296
+ },
297
+ {
298
+ "name": "stdout",
299
+ "output_type": "stream",
300
+ "text": [
301
+ "4096 4096 0 blocks.1.att.output.weight\r\n"
302
+ ]
303
+ },
304
+ {
305
+ "name": "stdout",
306
+ "output_type": "stream",
307
+ "text": [
308
+ "16384 4096 1.0 blocks.1.ffn.key.weight\r\n"
309
+ ]
310
+ },
311
+ {
312
+ "name": "stdout",
313
+ "output_type": "stream",
314
+ "text": [
315
+ "4096 4096 0 blocks.1.ffn.receptance.weight\r\n"
316
+ ]
317
+ },
318
+ {
319
+ "name": "stdout",
320
+ "output_type": "stream",
321
+ "text": [
322
+ "4096 16384 0 blocks.1.ffn.value.weight\r\n"
323
+ ]
324
+ },
325
+ {
326
+ "name": "stdout",
327
+ "output_type": "stream",
328
+ "text": [
329
+ "4096 4096 1.0 blocks.2.att.receptance.weight\r\n"
330
+ ]
331
+ },
332
+ {
333
+ "name": "stdout",
334
+ "output_type": "stream",
335
+ "text": [
336
+ "4096 4096 1.0 blocks.2.att.key.weight\r\n"
337
+ ]
338
+ },
339
+ {
340
+ "name": "stdout",
341
+ "output_type": "stream",
342
+ "text": [
343
+ "4096 4096 1.0 blocks.2.att.value.weight\r\n"
344
+ ]
345
+ },
346
+ {
347
+ "name": "stdout",
348
+ "output_type": "stream",
349
+ "text": [
350
+ "4096 4096 0 blocks.2.att.output.weight\r\n",
351
+ "16384 4096 1.0 blocks.2.ffn.key.weight\r\n"
352
+ ]
353
+ },
354
+ {
355
+ "name": "stdout",
356
+ "output_type": "stream",
357
+ "text": [
358
+ "4096 4096 0 blocks.2.ffn.receptance.weight\r\n"
359
+ ]
360
+ },
361
+ {
362
+ "name": "stdout",
363
+ "output_type": "stream",
364
+ "text": [
365
+ "4096 16384 0 blocks.2.ffn.value.weight\r\n"
366
+ ]
367
+ },
368
+ {
369
+ "name": "stdout",
370
+ "output_type": "stream",
371
+ "text": [
372
+ "4096 4096 1.0 blocks.3.att.receptance.weight\r\n"
373
+ ]
374
+ },
375
+ {
376
+ "name": "stdout",
377
+ "output_type": "stream",
378
+ "text": [
379
+ "4096 4096 1.0 blocks.3.att.key.weight\r\n"
380
+ ]
381
+ },
382
+ {
383
+ "name": "stdout",
384
+ "output_type": "stream",
385
+ "text": [
386
+ "4096 4096 1.0 blocks.3.att.value.weight\r\n"
387
+ ]
388
+ },
389
+ {
390
+ "name": "stdout",
391
+ "output_type": "stream",
392
+ "text": [
393
+ "4096 4096 0 blocks.3.att.output.weight\r\n",
394
+ "16384 4096 1.0 blocks.3.ffn.key.weight\r\n"
395
+ ]
396
+ },
397
+ {
398
+ "name": "stdout",
399
+ "output_type": "stream",
400
+ "text": [
401
+ "4096 4096 0 blocks.3.ffn.receptance.weight\r\n"
402
+ ]
403
+ },
404
+ {
405
+ "name": "stdout",
406
+ "output_type": "stream",
407
+ "text": [
408
+ "4096 16384 0 blocks.3.ffn.value.weight\r\n"
409
+ ]
410
+ },
411
+ {
412
+ "name": "stdout",
413
+ "output_type": "stream",
414
+ "text": [
415
+ "4096 4096 1.0 blocks.4.att.receptance.weight\r\n"
416
+ ]
417
+ },
418
+ {
419
+ "name": "stdout",
420
+ "output_type": "stream",
421
+ "text": [
422
+ "4096 4096 1.0 blocks.4.att.key.weight\r\n"
423
+ ]
424
+ },
425
+ {
426
+ "name": "stdout",
427
+ "output_type": "stream",
428
+ "text": [
429
+ "4096 4096 1.0 blocks.4.att.value.weight\r\n"
430
+ ]
431
+ }
432
+ ],
433
+ "source": [
434
+ "# Init the model\n",
435
+ "!cd \"{TRAINER_DIR}\" && \\\n",
436
+ " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n",
437
+ " python3 ./init_model.py \\\n",
438
+ " --n_layer 6 --n_embd 4096 \\\n",
439
+ " --emb-scale \"{EMBED_SCALE}\" \\\n",
440
+ " --vocab_size neox --skip-if-exists \\\n",
441
+ " \"../model/L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE_LABEL}-neox-v5base-init.pth\""
442
+ ]
443
+ },
444
+ {
445
+ "cell_type": "markdown",
446
+ "id": "b17f7961",
447
+ "metadata": {
448
+ "papermill": {
449
+ "duration": null,
450
+ "end_time": null,
451
+ "exception": null,
452
+ "start_time": null,
453
+ "status": "pending"
454
+ },
455
+ "tags": []
456
+ },
457
+ "source": [
458
+ "## Enwiki Stage 1 : Foundation 4k model training"
459
+ ]
460
+ },
461
+ {
462
+ "cell_type": "code",
463
+ "execution_count": null,
464
+ "id": "0caf9040",
465
+ "metadata": {
466
+ "papermill": {
467
+ "duration": null,
468
+ "end_time": null,
469
+ "exception": null,
470
+ "start_time": null,
471
+ "status": "pending"
472
+ },
473
+ "tags": []
474
+ },
475
+ "outputs": [],
476
+ "source": [
477
+ "# Let's preload the required dataset \n",
478
+ "!cd \"{TRAINER_DIR}\" && \\\n",
479
+ " python3 preload_datapath.py \"{NOTEBOOK_DIR}/v5base-enwiki-4k.yaml\""
480
+ ]
481
+ },
482
+ {
483
+ "cell_type": "code",
484
+ "execution_count": null,
485
+ "id": "77604691",
486
+ "metadata": {
487
+ "papermill": {
488
+ "duration": null,
489
+ "end_time": null,
490
+ "exception": null,
491
+ "start_time": null,
492
+ "status": "pending"
493
+ },
494
+ "tags": []
495
+ },
496
+ "outputs": [],
497
+ "source": [
498
+ "# Start the foundation model training\n",
499
+ "!cd \"{TRAINER_DIR}\" && \\\n",
500
+ " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n",
501
+ " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n",
502
+ " python lightning_trainer.py fit \\\n",
503
+ " -c \"{NOTEBOOK_DIR}/v5base-enwiki-4k.yaml\" \\\n",
504
+ " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Enwiki-4k Foundation (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n",
505
+ " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n",
506
+ " --trainer.devices=\"{GPU_DEVICES}\" \\\n",
507
+ " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-enwiki-4k/\" \\\n",
508
+ " --model.load_model=\"../model/L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE_LABEL}-neox-v5base-init.pth\" \\\n",
509
+ " --model.ctx_len=4096 \\\n",
510
+ " --model.bptt_learning_range=1"
511
+ ]
512
+ },
513
+ {
514
+ "cell_type": "code",
515
+ "execution_count": null,
516
+ "id": "41df74c7",
517
+ "metadata": {
518
+ "papermill": {
519
+ "duration": null,
520
+ "end_time": null,
521
+ "exception": null,
522
+ "start_time": null,
523
+ "status": "pending"
524
+ },
525
+ "tags": []
526
+ },
527
+ "outputs": [],
528
+ "source": [
529
+ "# Let's export the model from the checkpoint\n",
530
+ "!cd \"{TRAINER_DIR}\" && \\\n",
531
+ " python export_checkpoint.py \"../checkpoint/{FILENAME_PREFIX}-enwiki-4k/last.ckpt\" \"../model/{FILENAME_PREFIX}-enwiki-4k.pth\" \"bf16\"\n",
532
+ "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-enwiki-4k.pth\""
533
+ ]
534
+ },
535
+ {
536
+ "cell_type": "code",
537
+ "execution_count": null,
538
+ "id": "29c9df8a",
539
+ "metadata": {
540
+ "papermill": {
541
+ "duration": null,
542
+ "end_time": null,
543
+ "exception": null,
544
+ "start_time": null,
545
+ "status": "pending"
546
+ },
547
+ "tags": []
548
+ },
549
+ "outputs": [],
550
+ "source": [
551
+ "# Let's do a quick dragon prompt validation\n",
552
+ "!cd \"{INFERENCE_DIR}\" && \\\n",
553
+ " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n",
554
+ " python3 dragon_test.py \"../model/{FILENAME_PREFIX}-enwiki-4k.pth\" \"cuda fp32\""
555
+ ]
556
+ },
557
+ {
558
+ "cell_type": "code",
559
+ "execution_count": null,
560
+ "id": "442c0b35",
561
+ "metadata": {
562
+ "papermill": {
563
+ "duration": null,
564
+ "end_time": null,
565
+ "exception": null,
566
+ "start_time": null,
567
+ "status": "pending"
568
+ },
569
+ "tags": []
570
+ },
571
+ "outputs": [],
572
+ "source": [
573
+ "# Let's do a quick memory test\n",
574
+ "!export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n",
575
+ " python3 ../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-enwiki-4k.pth\""
576
+ ]
577
+ },
578
+ {
579
+ "attachments": {},
580
+ "cell_type": "markdown",
581
+ "id": "43fe425a",
582
+ "metadata": {
583
+ "papermill": {
584
+ "duration": null,
585
+ "end_time": null,
586
+ "exception": null,
587
+ "start_time": null,
588
+ "status": "pending"
589
+ },
590
+ "tags": []
591
+ },
592
+ "source": [
593
+ "# Enwiki Stage 2 : Basic Instruct Tuning"
594
+ ]
595
+ },
596
+ {
597
+ "cell_type": "code",
598
+ "execution_count": null,
599
+ "id": "ad7fabe0",
600
+ "metadata": {
601
+ "papermill": {
602
+ "duration": null,
603
+ "end_time": null,
604
+ "exception": null,
605
+ "start_time": null,
606
+ "status": "pending"
607
+ },
608
+ "tags": []
609
+ },
610
+ "outputs": [],
611
+ "source": [
612
+ "# Let's preload the required dataset\n",
613
+ "!cd \"{TRAINER_DIR}\" && \\\n",
614
+ " python3 preload_datapath.py \"{NOTEBOOK_DIR}/v5base-enwiki-instruct.yaml\""
615
+ ]
616
+ },
617
+ {
618
+ "cell_type": "code",
619
+ "execution_count": null,
620
+ "id": "a219c41a",
621
+ "metadata": {
622
+ "papermill": {
623
+ "duration": null,
624
+ "end_time": null,
625
+ "exception": null,
626
+ "start_time": null,
627
+ "status": "pending"
628
+ },
629
+ "tags": []
630
+ },
631
+ "outputs": [],
632
+ "source": [
633
+ "# Start the instruct finetuning\n",
634
+ "!cd \"{TRAINER_DIR}\" && \\\n",
635
+ " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n",
636
+ " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n",
637
+ " python lightning_trainer.py fit \\\n",
638
+ " -c \"{NOTEBOOK_DIR}/v5base-enwiki-instruct.yaml\" \\\n",
639
+ " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Enwiki-Instruct (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n",
640
+ " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n",
641
+ " --trainer.devices=\"{GPU_DEVICES}\" \\\n",
642
+ " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-enwiki-instruct/\" \\\n",
643
+ " --model.load_model=\"../model/{FILENAME_PREFIX}-enwiki-4k.pth\" \\\n",
644
+ " --model.ctx_len=4096 \\\n",
645
+ " --model.bptt_learning_range=1"
646
+ ]
647
+ },
648
+ {
649
+ "cell_type": "code",
650
+ "execution_count": null,
651
+ "id": "0142da06",
652
+ "metadata": {
653
+ "papermill": {
654
+ "duration": null,
655
+ "end_time": null,
656
+ "exception": null,
657
+ "start_time": null,
658
+ "status": "pending"
659
+ },
660
+ "tags": []
661
+ },
662
+ "outputs": [],
663
+ "source": [
664
+ "# Let's export the model from the checkpoint\n",
665
+ "!cd \"{TRAINER_DIR}\" && \\\n",
666
+ " python export_checkpoint.py \"../checkpoint/{FILENAME_PREFIX}-enwiki-instruct/last.ckpt\" \"../model/{FILENAME_PREFIX}-enwiki-instruct.pth\" \"bf16\"\n",
667
+ "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-enwiki-instruct.pth\""
668
+ ]
669
+ },
670
+ {
671
+ "cell_type": "code",
672
+ "execution_count": null,
673
+ "id": "b6c4cda2",
674
+ "metadata": {
675
+ "papermill": {
676
+ "duration": null,
677
+ "end_time": null,
678
+ "exception": null,
679
+ "start_time": null,
680
+ "status": "pending"
681
+ },
682
+ "tags": []
683
+ },
684
+ "outputs": [],
685
+ "source": [
686
+ "# Let's do a quick dragon prompt validation\n",
687
+ "!cd \"{INFERENCE_DIR}\" && \\\n",
688
+ " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n",
689
+ " python3 dragon_test.py \"../model/{FILENAME_PREFIX}-enwiki-instruct.pth\" \"cuda fp32\""
690
+ ]
691
+ },
692
+ {
693
+ "cell_type": "code",
694
+ "execution_count": null,
695
+ "id": "7f278017",
696
+ "metadata": {
697
+ "papermill": {
698
+ "duration": null,
699
+ "end_time": null,
700
+ "exception": null,
701
+ "start_time": null,
702
+ "status": "pending"
703
+ },
704
+ "tags": []
705
+ },
706
+ "outputs": [],
707
+ "source": [
708
+ "# Let's do a quick memory test\n",
709
+ "!export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n",
710
+ " python3 ../memory_script/eval_v5headsize2x_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-enwiki-instruct.pth\""
711
+ ]
712
+ }
713
+ ],
714
+ "metadata": {
715
+ "kernelspec": {
716
+ "display_name": "Python 3 (ipykernel)",
717
+ "language": "python",
718
+ "name": "python3"
719
+ },
720
+ "language_info": {
721
+ "codemirror_mode": {
722
+ "name": "ipython",
723
+ "version": 3
724
+ },
725
+ "file_extension": ".py",
726
+ "mimetype": "text/x-python",
727
+ "name": "python",
728
+ "nbconvert_exporter": "python",
729
+ "pygments_lexer": "ipython3",
730
+ "version": "3.11.4"
731
+ },
732
+ "papermill": {
733
+ "default_parameters": {},
734
+ "duration": null,
735
+ "end_time": null,
736
+ "environment_variables": {},
737
+ "exception": null,
738
+ "input_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-headsize2x/v5-L6-D2048-E1e-1-ctx4k-part1.ipynb",
739
+ "output_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/experiment/rwkv-x-exp/v5-headsize2x/v5-L6-D2048-E1e-1-ctx4k-part1.ipynb",
740
+ "parameters": {},
741
+ "start_time": "2023-08-25T16:07:11.154329",
742
+ "version": "2.4.0"
743
+ }
744
+ },
745
+ "nbformat": 4,
746
+ "nbformat_minor": 5
747
+ }