picocreator committed on
Commit
c7c964d
1 Parent(s): f2129ca

6102943ac872aa8ec5b577aa5a90fad18430f47ffa31a9462fa35c87c5dc25ed

Browse files
experiment/rwkv-x-exp/multi-size-train/v5-L6+6-D2048-layer-baseline.ipynb ADDED
@@ -0,0 +1,893 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "attachments": {},
5
+ "cell_type": "markdown",
6
+ "id": "c4287c8b",
7
+ "metadata": {
8
+ "papermill": {
9
+ "duration": 0.004479,
10
+ "end_time": "2023-10-11T07:03:00.576797",
11
+ "exception": false,
12
+ "start_time": "2023-10-11T07:03:00.572318",
13
+ "status": "completed"
14
+ },
15
+ "tags": []
16
+ },
17
+ "source": [
18
+ "# RWKV v5 multi-size training experiment\n",
19
+ "\n",
20
+ "**Note:** This project assumes you have the rwkv-infctx conda env set up"
21
+ ]
22
+ },
23
+ {
24
+ "attachments": {},
25
+ "cell_type": "markdown",
26
+ "id": "5202003b",
27
+ "metadata": {
28
+ "papermill": {
29
+ "duration": 0.002433,
30
+ "end_time": "2023-10-11T07:03:00.582179",
31
+ "exception": false,
32
+ "start_time": "2023-10-11T07:03:00.579746",
33
+ "status": "completed"
34
+ },
35
+ "tags": []
36
+ },
37
+ "source": [
38
+ "# Basic Setup"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": 1,
44
+ "id": "fc416bc2",
45
+ "metadata": {
46
+ "execution": {
47
+ "iopub.execute_input": "2023-10-11T07:03:00.589288Z",
48
+ "iopub.status.busy": "2023-10-11T07:03:00.588784Z",
49
+ "iopub.status.idle": "2023-10-11T07:03:01.346279Z",
50
+ "shell.execute_reply": "2023-10-11T07:03:01.345465Z"
51
+ },
52
+ "papermill": {
53
+ "duration": 0.763505,
54
+ "end_time": "2023-10-11T07:03:01.348385",
55
+ "exception": false,
56
+ "start_time": "2023-10-11T07:03:00.584880",
57
+ "status": "completed"
58
+ },
59
+ "tags": []
60
+ },
61
+ "outputs": [],
62
+ "source": [
63
+ "# First, let's set up the various directories, and init the model\n",
64
+ "!mkdir -p ../../../../model/\n",
65
+ "!mkdir -p ../../../../datapath/\n",
66
+ "!mkdir -p ../../../../checkpoint/"
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "code",
71
+ "execution_count": 2,
72
+ "id": "a9dea07f",
73
+ "metadata": {
74
+ "execution": {
75
+ "iopub.execute_input": "2023-10-11T07:03:01.355823Z",
76
+ "iopub.status.busy": "2023-10-11T07:03:01.355275Z",
77
+ "iopub.status.idle": "2023-10-11T07:03:01.363208Z",
78
+ "shell.execute_reply": "2023-10-11T07:03:01.362389Z"
79
+ },
80
+ "papermill": {
81
+ "duration": 0.013626,
82
+ "end_time": "2023-10-11T07:03:01.364896",
83
+ "exception": false,
84
+ "start_time": "2023-10-11T07:03:01.351270",
85
+ "status": "completed"
86
+ },
87
+ "tags": []
88
+ },
89
+ "outputs": [
90
+ {
91
+ "name": "stdout",
92
+ "output_type": "stream",
93
+ "text": [
94
+ "DEEPSPEED_STRAT: deepspeed_stage_2_offload\n",
95
+ "ENABLE_WANDB: True\n",
96
+ "GPU_DEVICES: auto\n",
97
+ "NOTEBOOK_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train\n",
98
+ "INFERENCE_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5\n",
99
+ "TRAINER_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5\n",
100
+ "PROJECT_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer\n"
101
+ ]
102
+ }
103
+ ],
104
+ "source": [
105
+ "DEEPSPEED_STRAT=\"deepspeed_stage_2_offload\"\n",
106
+ "GPU_DEVICES=\"auto\"\n",
107
+ "ENABLE_WANDB=True\n",
108
+ "\n",
109
+ "EMBED_SCALE=0.01\n",
110
+ "EMBED_SCALE_LABEL=str(EMBED_SCALE).replace(\".\", \"_\")\n",
111
+ "\n",
112
+ "EMBED_SIZE=2048\n",
113
+ "\n",
114
+ "WANDB_PREFIX=f\"[Multi-size] v5-L6+6-D{EMBED_SIZE}-E{EMBED_SCALE}\"\n",
115
+ "FILENAME_PREFIX=f\"v5-L6+6-D{EMBED_SIZE}-E{EMBED_SCALE_LABEL}\"\n",
116
+ "\n",
117
+ "print(\"DEEPSPEED_STRAT:\", DEEPSPEED_STRAT)\n",
118
+ "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n",
119
+ "print(\"GPU_DEVICES:\", GPU_DEVICES)\n",
120
+ "\n",
121
+ "if ENABLE_WANDB:\n",
122
+ " WANDB_MODE=\"online\"\n",
123
+ "else:\n",
124
+ " WANDB_MODE=\"disabled\"\n",
125
+ "\n",
126
+ "# Computing the notebook, and various paths\n",
127
+ "import os\n",
128
+ "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n",
129
+ "PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"../../../../\"))\n",
130
+ "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n",
131
+ "INFERENCE_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n",
132
+ "\n",
133
+ "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n",
134
+ "print(\"INFERENCE_DIR:\", INFERENCE_DIR)\n",
135
+ "print(\"TRAINER_DIR:\", TRAINER_DIR)\n",
136
+ "print(\"PROJECT_DIR:\", PROJECT_DIR)"
137
+ ]
138
+ },
139
+ {
140
+ "cell_type": "code",
141
+ "execution_count": 3,
142
+ "id": "bcb68665",
143
+ "metadata": {
144
+ "execution": {
145
+ "iopub.execute_input": "2023-10-11T07:03:01.372738Z",
146
+ "iopub.status.busy": "2023-10-11T07:03:01.371949Z",
147
+ "iopub.status.idle": "2023-10-11T07:03:02.581837Z",
148
+ "shell.execute_reply": "2023-10-11T07:03:02.580259Z"
149
+ },
150
+ "papermill": {
151
+ "duration": 1.216253,
152
+ "end_time": "2023-10-11T07:03:02.584117",
153
+ "exception": false,
154
+ "start_time": "2023-10-11T07:03:01.367864",
155
+ "status": "completed"
156
+ },
157
+ "tags": []
158
+ },
159
+ "outputs": [
160
+ {
161
+ "name": "stdout",
162
+ "output_type": "stream",
163
+ "text": [
164
+ "--2023-10-11 07:03:01-- https://huggingface.co/rwkv-x-dev/rwkv-x-playground/blob/main/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-split-2a.pth\r\n",
165
+ "Resolving huggingface.co (huggingface.co)... 18.154.227.7, 18.154.227.87, 18.154.227.69, ...\r\n",
166
+ "Connecting to huggingface.co (huggingface.co)|18.154.227.7|:443... connected.\r\n",
167
+ "HTTP request sent, awaiting response... "
168
+ ]
169
+ },
170
+ {
171
+ "name": "stdout",
172
+ "output_type": "stream",
173
+ "text": [
174
+ "200 OK\r\n",
175
+ "Length: 44360 (43K) [text/html]\r\n",
176
+ "Saving to: ‘v5-L6-D2048-E0_01-split-2a.pth’\r\n",
177
+ "\r\n",
178
+ "\r",
179
+ " v5-L6-D20 0%[ ] 0 --.-KB/s \r",
180
+ "v5-L6-D2048-E0_01-s 100%[===================>] 43.32K --.-KB/s in 0.001s \r\n",
181
+ "\r\n",
182
+ "2023-10-11 07:03:01 (46.3 MB/s) - ‘v5-L6-D2048-E0_01-split-2a.pth’ saved [44360/44360]\r\n",
183
+ "\r\n"
184
+ ]
185
+ },
186
+ {
187
+ "name": "stdout",
188
+ "output_type": "stream",
189
+ "text": [
190
+ "--2023-10-11 07:03:02-- https://huggingface.co/rwkv-x-dev/rwkv-x-playground/blob/main/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-split-2b.pth\r\n",
191
+ "Resolving huggingface.co (huggingface.co)... 18.154.227.67, 18.154.227.7, 18.154.227.69, ...\r\n",
192
+ "Connecting to huggingface.co (huggingface.co)|18.154.227.67|:443... connected.\r\n",
193
+ "HTTP request sent, awaiting response... "
194
+ ]
195
+ },
196
+ {
197
+ "name": "stdout",
198
+ "output_type": "stream",
199
+ "text": [
200
+ "200 OK\r\n",
201
+ "Length: 44360 (43K) [text/html]\r\n",
202
+ "Saving to: ‘v5-L6-D2048-E0_01-split-2b.pth’\r\n",
203
+ "\r\n",
204
+ "\r",
205
+ " v5-L6-D20 0%[ ] 0 --.-KB/s \r",
206
+ "v5-L6-D2048-E0_01-s 100%[===================>] 43.32K --.-KB/s in 0s \r\n",
207
+ "\r\n",
208
+ "2023-10-11 07:03:02 (215 MB/s) - ‘v5-L6-D2048-E0_01-split-2b.pth’ saved [44360/44360]\r\n",
209
+ "\r\n"
210
+ ]
211
+ }
212
+ ],
213
+ "source": [
214
+ "# Get the init split model, and finetune from there\n",
215
+ "!cd \"{PROJECT_DIR}/model/\" && wget -nc \"https://huggingface.co/rwkv-x-dev/rwkv-x-playground/blob/main/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-split-2a.pth\"\n",
216
+ "!cd \"{PROJECT_DIR}/model/\" && wget -nc \"https://huggingface.co/rwkv-x-dev/rwkv-x-playground/blob/main/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-split-2b.pth\""
217
+ ]
218
+ },
219
+ {
220
+ "cell_type": "code",
221
+ "execution_count": 4,
222
+ "id": "81e32bce",
223
+ "metadata": {
224
+ "execution": {
225
+ "iopub.execute_input": "2023-10-11T07:03:02.594064Z",
226
+ "iopub.status.busy": "2023-10-11T07:03:02.592869Z",
227
+ "iopub.status.idle": "2023-10-11T07:03:12.284528Z",
228
+ "shell.execute_reply": "2023-10-11T07:03:12.283195Z"
229
+ },
230
+ "papermill": {
231
+ "duration": 9.699478,
232
+ "end_time": "2023-10-11T07:03:12.286978",
233
+ "exception": false,
234
+ "start_time": "2023-10-11T07:03:02.587500",
235
+ "status": "completed"
236
+ },
237
+ "tags": []
238
+ },
239
+ "outputs": [
240
+ {
241
+ "name": "stdout",
242
+ "output_type": "stream",
243
+ "text": [
244
+ "\r",
245
+ "Saving the dataset (0/2 shards): 0%| | 0/27200 [00:00<?, ? examples/s]"
246
+ ]
247
+ },
248
+ {
249
+ "name": "stdout",
250
+ "output_type": "stream",
251
+ "text": [
252
+ "\r",
253
+ "Saving the dataset (0/2 shards): 7%| | 2000/27200 [00:00<00:01, 16059.79 examp"
254
+ ]
255
+ },
256
+ {
257
+ "name": "stdout",
258
+ "output_type": "stream",
259
+ "text": [
260
+ "\r",
261
+ "Saving the dataset (0/2 shards): 15%|▏| 4000/27200 [00:00<00:01, 17165.62 examp"
262
+ ]
263
+ },
264
+ {
265
+ "name": "stdout",
266
+ "output_type": "stream",
267
+ "text": [
268
+ "\r",
269
+ "Saving the dataset (0/2 shards): 22%|▏| 6000/27200 [00:00<00:01, 17873.84 examp"
270
+ ]
271
+ },
272
+ {
273
+ "name": "stdout",
274
+ "output_type": "stream",
275
+ "text": [
276
+ "\r",
277
+ "Saving the dataset (0/2 shards): 29%|▎| 8000/27200 [00:00<00:01, 18442.04 examp"
278
+ ]
279
+ },
280
+ {
281
+ "name": "stdout",
282
+ "output_type": "stream",
283
+ "text": [
284
+ "\r",
285
+ "Saving the dataset (0/2 shards): 40%|▍| 11000/27200 [00:00<00:00, 19214.27 exam"
286
+ ]
287
+ },
288
+ {
289
+ "name": "stdout",
290
+ "output_type": "stream",
291
+ "text": [
292
+ "\r",
293
+ "Saving the dataset (0/2 shards): 50%|▌| 13600/27200 [00:00<00:00, 19584.96 exam\r",
294
+ "Saving the dataset (1/2 shards): 50%|▌| 13600/27200 [00:00<00:00, 19584.96 exam"
295
+ ]
296
+ },
297
+ {
298
+ "name": "stdout",
299
+ "output_type": "stream",
300
+ "text": [
301
+ "\r",
302
+ "Saving the dataset (1/2 shards): 65%|▋| 17600/27200 [00:00<00:00, 20456.63 exam"
303
+ ]
304
+ },
305
+ {
306
+ "name": "stdout",
307
+ "output_type": "stream",
308
+ "text": [
309
+ "\r",
310
+ "Saving the dataset (1/2 shards): 79%|▊| 21600/27200 [00:01<00:00, 21374.91 exam"
311
+ ]
312
+ },
313
+ {
314
+ "name": "stdout",
315
+ "output_type": "stream",
316
+ "text": [
317
+ "\r",
318
+ "Saving the dataset (1/2 shards): 94%|▉| 25600/27200 [00:01<00:00, 21799.50 exam"
319
+ ]
320
+ },
321
+ {
322
+ "name": "stdout",
323
+ "output_type": "stream",
324
+ "text": [
325
+ "\r",
326
+ "Saving the dataset (2/2 shards): 100%|█| 27200/27200 [00:01<00:00, 21799.50 exam\r",
327
+ "Saving the dataset (2/2 shards): 100%|█| 27200/27200 [00:01<00:00, 20461.58 exam\r\n"
328
+ ]
329
+ },
330
+ {
331
+ "name": "stdout",
332
+ "output_type": "stream",
333
+ "text": [
334
+ "\r",
335
+ "Saving the dataset (0/1 shards): 0%| | 0/109 [00:00<?, ? examples/s]\r",
336
+ "Saving the dataset (1/1 shards): 100%|█| 109/109 [00:00<00:00, 7981.06 examples/\r",
337
+ "Saving the dataset (1/1 shards): 100%|█| 109/109 [00:00<00:00, 7683.94 examples/\r\n"
338
+ ]
339
+ }
340
+ ],
341
+ "source": [
342
+ "# Let's preload the required datasets\n",
343
+ "!cd \"{TRAINER_DIR}\" && \\\n",
344
+ " python3 preload_datapath.py \"{NOTEBOOK_DIR}/enwiki-4k-part3.yaml\""
345
+ ]
346
+ },
347
+ {
348
+ "cell_type": "markdown",
349
+ "id": "5bf12f9e",
350
+ "metadata": {
351
+ "papermill": {
352
+ "duration": 0.00368,
353
+ "end_time": "2023-10-11T07:03:12.294651",
354
+ "exception": false,
355
+ "start_time": "2023-10-11T07:03:12.290971",
356
+ "status": "completed"
357
+ },
358
+ "tags": []
359
+ },
360
+ "source": [
361
+ "## Enwiki Stage 3 : Split-Baseline-A training"
362
+ ]
363
+ },
364
+ {
365
+ "cell_type": "code",
366
+ "execution_count": 5,
367
+ "id": "c1566af8",
368
+ "metadata": {
369
+ "execution": {
370
+ "iopub.execute_input": "2023-10-11T07:03:12.305523Z",
371
+ "iopub.status.busy": "2023-10-11T07:03:12.304583Z",
372
+ "iopub.status.idle": "2023-10-11T07:03:29.051517Z",
373
+ "shell.execute_reply": "2023-10-11T07:03:29.049989Z"
374
+ },
375
+ "papermill": {
376
+ "duration": 16.755429,
377
+ "end_time": "2023-10-11T07:03:29.053992",
378
+ "exception": false,
379
+ "start_time": "2023-10-11T07:03:12.298563",
380
+ "status": "completed"
381
+ },
382
+ "tags": []
383
+ },
384
+ "outputs": [
385
+ {
386
+ "name": "stdout",
387
+ "output_type": "stream",
388
+ "text": [
389
+ "[2023-10-11 07:03:16,508] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n"
390
+ ]
391
+ },
392
+ {
393
+ "name": "stdout",
394
+ "output_type": "stream",
395
+ "text": [
396
+ "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n"
397
+ ]
398
+ },
399
+ {
400
+ "name": "stdout",
401
+ "output_type": "stream",
402
+ "text": [
403
+ "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py:484: UserWarning: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/enwiki-4k-part3.yaml', '--trainer.logger.init_args.name=[Multi-size] v5-L6+6-D2048-E0.01 - layer-expansion A3 (train-ctx=4k, deepspeed_stage_2_offload)', '--trainer.strategy=deepspeed_stage_2_offload', '--trainer.devices=auto', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-L6+6-D2048-E0_01-layer-expansion-a3/', '--model.load_model=../model/v5-L6+6-D2048-E0_01-split-2a.pth', '--model.ctx_len=4096', '--model.bptt_learning_range=1'], args=['fit', '-c', '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/enwiki-4k-part3.yaml', '--trainer.logger.init_args.name=[Multi-size] v5-L6+6-D2048-E0.01 - layer-expansion A3 (train-ctx=4k, deepspeed_stage_2_offload)', '--trainer.strategy=deepspeed_stage_2_offload', '--trainer.devices=auto', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-L6+6-D2048-E0_01-layer-expansion-a3/', '--model.load_model=../model/v5-L6+6-D2048-E0_01-split-2a.pth', '--model.ctx_len=4096', '--model.bptt_learning_range=1'].\r\n",
404
+ " rank_zero_warn(\r\n"
405
+ ]
406
+ },
407
+ {
408
+ "name": "stdout",
409
+ "output_type": "stream",
410
+ "text": [
411
+ "/usr/local/lib/python3.10/dist-packages/lightning/fabric/utilities/seed.py:39: UserWarning: No seed found, seed set to 3015291597\r\n",
412
+ " rank_zero_warn(f\"No seed found, seed set to {seed}\")\r\n",
413
+ "Global seed set to 3015291597\r\n"
414
+ ]
415
+ },
416
+ {
417
+ "name": "stdout",
418
+ "output_type": "stream",
419
+ "text": [
420
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mpicocreator\u001b[0m (\u001b[33mrwkv-x-dev\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\r\n"
421
+ ]
422
+ },
423
+ {
424
+ "name": "stdout",
425
+ "output_type": "stream",
426
+ "text": [
427
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.15.12\r\n",
428
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m./wandb/run-20231011_070319-yi5f0p8v\u001b[0m\r\n",
429
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\r\n",
430
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33m[Multi-size] v5-L6+6-D2048-E0.01 - layer-expansion A3 (train-ctx=4k, deepspeed_stage_2_offload)\u001b[0m\r\n",
431
+ "\u001b[34m\u001b[1mwandb\u001b[0m: ⭐️ View project at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments\u001b[0m\r\n",
432
+ "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/runs/yi5f0p8v\u001b[0m\r\n"
433
+ ]
434
+ },
435
+ {
436
+ "name": "stdout",
437
+ "output_type": "stream",
438
+ "text": [
439
+ "Traceback (most recent call last):\r\n",
440
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 278, in <module>\r\n",
441
+ " cli_main()\r\n",
442
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 253, in cli_main\r\n",
443
+ " LightningCLI(\r\n",
444
+ " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 350, in __init__\r\n",
445
+ " self.instantiate_classes()\r\n",
446
+ " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 499, in instantiate_classes\r\n",
447
+ " self.config_init = self.parser.instantiate_classes(self.config)\r\n",
448
+ " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_deprecated.py\", line 139, in patched_instantiate_classes\r\n",
449
+ " cfg = self._unpatched_instantiate_classes(cfg, **kwargs)\r\n",
450
+ " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_core.py\", line 1130, in instantiate_classes\r\n",
451
+ " cfg[subcommand] = subparser.instantiate_classes(cfg[subcommand], instantiate_groups=instantiate_groups)\r\n",
452
+ " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_deprecated.py\", line 139, in patched_instantiate_classes\r\n",
453
+ " cfg = self._unpatched_instantiate_classes(cfg, **kwargs)\r\n",
454
+ " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_core.py\", line 1124, in instantiate_classes\r\n",
455
+ " component.instantiate_class(component, cfg)\r\n",
456
+ " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_signatures.py\", line 561, in group_instantiate_class\r\n",
457
+ " parent[key] = group.group_class(**value)\r\n",
458
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 566, in __init__\r\n",
459
+ " raise ValueError(f\"load_model file '{load_model}' does not exist\")\r\n",
460
+ "ValueError: load_model file '../model/v5-L6+6-D2048-E0_01-split-2a.pth' does not exist\r\n"
461
+ ]
462
+ },
463
+ {
464
+ "name": "stdout",
465
+ "output_type": "stream",
466
+ "text": [
467
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Waiting for W&B process to finish... \u001b[31m(failed 1).\u001b[0m Press Control-C to abort syncing.\r\n"
468
+ ]
469
+ },
470
+ {
471
+ "name": "stdout",
472
+ "output_type": "stream",
473
+ "text": [
474
+ "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run \u001b[33m[Multi-size] v5-L6+6-D2048-E0.01 - layer-expansion A3 (train-ctx=4k, deepspeed_stage_2_offload)\u001b[0m at: \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/runs/yi5f0p8v\u001b[0m\r\n",
475
+ "\u001b[34m\u001b[1mwandb\u001b[0m: ️⚡ View job at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjk0OTk4MDcy/version_details/v15\u001b[0m\r\n",
476
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Synced 5 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)\r\n",
477
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Find logs at: \u001b[35m\u001b[1m./wandb/run-20231011_070319-yi5f0p8v/logs\u001b[0m\r\n"
478
+ ]
479
+ }
480
+ ],
481
+ "source": [
482
+ "# Start the foundation model training\n",
483
+ "!cd \"{TRAINER_DIR}\" && \\\n",
484
+ " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n",
485
+ " python3 lightning_trainer.py fit \\\n",
486
+ " -c \"{NOTEBOOK_DIR}/enwiki-4k-part3.yaml\" \\\n",
487
+ " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - layer-expansion A3 (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n",
488
+ " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n",
489
+ " --trainer.devices=\"{GPU_DEVICES}\" \\\n",
490
+ " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-layer-expansion-a3/\" \\\n",
491
+ " --model.load_model=\"../model/{FILENAME_PREFIX}-split-2a.pth\" \\\n",
492
+ " --model.ctx_len=4096 \\\n",
493
+ " --model.bptt_learning_range=1"
494
+ ]
495
+ },
496
+ {
497
+ "cell_type": "code",
498
+ "execution_count": 6,
499
+ "id": "d56d6a30",
500
+ "metadata": {
501
+ "execution": {
502
+ "iopub.execute_input": "2023-10-11T07:03:29.066944Z",
503
+ "iopub.status.busy": "2023-10-11T07:03:29.065449Z",
504
+ "iopub.status.idle": "2023-10-11T07:03:32.793675Z",
505
+ "shell.execute_reply": "2023-10-11T07:03:32.792045Z"
506
+ },
507
+ "papermill": {
508
+ "duration": 3.737581,
509
+ "end_time": "2023-10-11T07:03:32.796266",
510
+ "exception": false,
511
+ "start_time": "2023-10-11T07:03:29.058685",
512
+ "status": "completed"
513
+ },
514
+ "tags": []
515
+ },
516
+ "outputs": [
517
+ {
518
+ "name": "stdout",
519
+ "output_type": "stream",
520
+ "text": [
521
+ "[2023-10-11 07:03:31,388] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n"
522
+ ]
523
+ },
524
+ {
525
+ "name": "stdout",
526
+ "output_type": "stream",
527
+ "text": [
528
+ "Traceback (most recent call last):\r\n",
529
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 651, in <module>\r\n",
530
+ " convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, output_file, save_dtype=args.dtype)\r\n",
531
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 542, in convert_zero_checkpoint_to_fp32_state_dict\r\n",
532
+ " state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)\r\n",
533
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 516, in get_fp32_state_dict_from_zero_checkpoint\r\n",
534
+ " raise ValueError(f\"Unable to find 'latest' file at {latest_path}\")\r\n",
535
+ "ValueError: Unable to find 'latest' file at ../checkpoint/v5-L6+6-D2048-E0_01-layer-expansion-a3/last.ckpt/latest\r\n"
536
+ ]
537
+ },
538
+ {
539
+ "name": "stdout",
540
+ "output_type": "stream",
541
+ "text": [
542
+ "ls: cannot access '../model/v5-L6+6-D2048-E0_01-layer-expansion-a3.pth': No such file or directory\r\n"
543
+ ]
544
+ }
545
+ ],
546
+ "source": [
547
+ "# Let's export the model from the checkpoint\n",
548
+ "!cd \"{TRAINER_DIR}\" && \\\n",
549
+ " python3 export_checkpoint.py \"../checkpoint/{FILENAME_PREFIX}-layer-expansion-a3/last.ckpt\" \"../model/{FILENAME_PREFIX}-layer-expansion-a3.pth\" \"bf16\"\n",
550
+ "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-layer-expansion-a3.pth\""
551
+ ]
552
+ },
553
+ {
554
+ "cell_type": "code",
555
+ "execution_count": 7,
556
+ "id": "f4c37d28",
557
+ "metadata": {
558
+ "execution": {
559
+ "iopub.execute_input": "2023-10-11T07:03:32.809297Z",
560
+ "iopub.status.busy": "2023-10-11T07:03:32.808263Z",
561
+ "iopub.status.idle": "2023-10-11T07:03:39.200791Z",
562
+ "shell.execute_reply": "2023-10-11T07:03:39.199726Z"
563
+ },
564
+ "papermill": {
565
+ "duration": 6.402526,
566
+ "end_time": "2023-10-11T07:03:39.203696",
567
+ "exception": false,
568
+ "start_time": "2023-10-11T07:03:32.801170",
569
+ "status": "completed"
570
+ },
571
+ "tags": []
572
+ },
573
+ "outputs": [
574
+ {
575
+ "name": "stdout",
576
+ "output_type": "stream",
577
+ "text": [
578
+ "[2023-10-11 07:03:37,133] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n"
579
+ ]
580
+ },
581
+ {
582
+ "name": "stdout",
583
+ "output_type": "stream",
584
+ "text": [
585
+ "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n",
586
+ "Traceback (most recent call last):\r\n",
587
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/dragon_test.py\", line 52, in <module>\r\n",
588
+ " model = SimpleRWKV(MODEL_PATH, device=DEVICE)\r\n",
589
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 1420, in __init__\r\n",
590
+ " self.model = RWKV(**model_config)\r\n",
591
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 566, in __init__\r\n",
592
+ " raise ValueError(f\"load_model file '{load_model}' does not exist\")\r\n",
593
+ "ValueError: load_model file '../model/v5-L6+6-D2048-E0_01-layer-expansion-a3.pth' does not exist\r\n"
594
+ ]
595
+ }
596
+ ],
597
+ "source": [
598
+ "# Let's do a quick dragon prompt validation\n",
599
+ "!cd \"{INFERENCE_DIR}\" && \\\n",
600
+ " python3 dragon_test.py \"../model/{FILENAME_PREFIX}-layer-expansion-a3.pth\" \"cuda fp32\""
601
+ ]
602
+ },
603
+ {
604
+ "cell_type": "markdown",
605
+ "id": "39b5c9e4",
606
+ "metadata": {
607
+ "papermill": {
608
+ "duration": 0.006891,
609
+ "end_time": "2023-10-11T07:03:39.217738",
610
+ "exception": false,
611
+ "start_time": "2023-10-11T07:03:39.210847",
612
+ "status": "completed"
613
+ },
614
+ "tags": []
615
+ },
616
+ "source": [
617
+ "## Enwiki Stage 3 : Split-Baseline-B training"
618
+ ]
619
+ },
620
+ {
621
+ "cell_type": "code",
622
+ "execution_count": 8,
623
+ "id": "6a2803dc",
624
+ "metadata": {
625
+ "execution": {
626
+ "iopub.execute_input": "2023-10-11T07:03:39.235541Z",
627
+ "iopub.status.busy": "2023-10-11T07:03:39.234393Z",
628
+ "iopub.status.idle": "2023-10-11T07:03:54.515077Z",
629
+ "shell.execute_reply": "2023-10-11T07:03:54.513988Z"
630
+ },
631
+ "papermill": {
632
+ "duration": 15.292458,
633
+ "end_time": "2023-10-11T07:03:54.517573",
634
+ "exception": false,
635
+ "start_time": "2023-10-11T07:03:39.225115",
636
+ "status": "completed"
637
+ },
638
+ "tags": []
639
+ },
640
+ "outputs": [
641
+ {
642
+ "name": "stdout",
643
+ "output_type": "stream",
644
+ "text": [
645
+ "[2023-10-11 07:03:43,703] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n"
646
+ ]
647
+ },
648
+ {
649
+ "name": "stdout",
650
+ "output_type": "stream",
651
+ "text": [
652
+ "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n"
653
+ ]
654
+ },
655
+ {
656
+ "name": "stdout",
657
+ "output_type": "stream",
658
+ "text": [
659
+ "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py:484: UserWarning: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/enwiki-4k-part3.yaml', '--trainer.logger.init_args.name=[Multi-size] v5-L6+6-D2048-E0.01 - layer-expansion B3 (train-ctx=4k, deepspeed_stage_2_offload)', '--trainer.strategy=deepspeed_stage_2_offload', '--trainer.devices=auto', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-L6+6-D2048-E0_01-layer-expansion-b3/', '--model.load_model=../model/v5-L6+6-D2048-E0_01-split-2b.pth', '--model.ctx_len=4096', '--model.bptt_learning_range=1'], args=['fit', '-c', '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/enwiki-4k-part3.yaml', '--trainer.logger.init_args.name=[Multi-size] v5-L6+6-D2048-E0.01 - layer-expansion B3 (train-ctx=4k, deepspeed_stage_2_offload)', '--trainer.strategy=deepspeed_stage_2_offload', '--trainer.devices=auto', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-L6+6-D2048-E0_01-layer-expansion-b3/', '--model.load_model=../model/v5-L6+6-D2048-E0_01-split-2b.pth', '--model.ctx_len=4096', '--model.bptt_learning_range=1'].\r\n",
660
+ " rank_zero_warn(\r\n"
661
+ ]
662
+ },
663
+ {
664
+ "name": "stdout",
665
+ "output_type": "stream",
666
+ "text": [
667
+ "/usr/local/lib/python3.10/dist-packages/lightning/fabric/utilities/seed.py:39: UserWarning: No seed found, seed set to 736269213\r\n",
668
+ " rank_zero_warn(f\"No seed found, seed set to {seed}\")\r\n",
669
+ "Global seed set to 736269213\r\n"
670
+ ]
671
+ },
672
+ {
673
+ "name": "stdout",
674
+ "output_type": "stream",
675
+ "text": [
676
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mpicocreator\u001b[0m (\u001b[33mrwkv-x-dev\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\r\n"
677
+ ]
678
+ },
679
+ {
680
+ "name": "stdout",
681
+ "output_type": "stream",
682
+ "text": [
683
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.15.12\r\n",
684
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m./wandb/run-20231011_070346-9urskvqu\u001b[0m\r\n",
685
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\r\n",
686
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33m[Multi-size] v5-L6+6-D2048-E0.01 - layer-expansion B3 (train-ctx=4k, deepspeed_stage_2_offload)\u001b[0m\r\n",
687
+ "\u001b[34m\u001b[1mwandb\u001b[0m: ⭐️ View project at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments\u001b[0m\r\n",
688
+ "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/runs/9urskvqu\u001b[0m\r\n"
689
+ ]
690
+ },
691
+ {
692
+ "name": "stdout",
693
+ "output_type": "stream",
694
+ "text": [
695
+ "Traceback (most recent call last):\r\n",
696
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 278, in <module>\r\n",
697
+ " cli_main()\r\n",
698
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 253, in cli_main\r\n",
699
+ " LightningCLI(\r\n",
700
+ " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 350, in __init__\r\n",
701
+ " self.instantiate_classes()\r\n",
702
+ " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 499, in instantiate_classes\r\n",
703
+ " self.config_init = self.parser.instantiate_classes(self.config)\r\n",
704
+ " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_deprecated.py\", line 139, in patched_instantiate_classes\r\n",
705
+ " cfg = self._unpatched_instantiate_classes(cfg, **kwargs)\r\n",
706
+ " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_core.py\", line 1130, in instantiate_classes\r\n",
707
+ " cfg[subcommand] = subparser.instantiate_classes(cfg[subcommand], instantiate_groups=instantiate_groups)\r\n",
708
+ " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_deprecated.py\", line 139, in patched_instantiate_classes\r\n",
709
+ " cfg = self._unpatched_instantiate_classes(cfg, **kwargs)\r\n",
710
+ " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_core.py\", line 1124, in instantiate_classes\r\n",
711
+ " component.instantiate_class(component, cfg)\r\n",
712
+ " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_signatures.py\", line 561, in group_instantiate_class\r\n",
713
+ " parent[key] = group.group_class(**value)\r\n",
714
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 566, in __init__\r\n",
715
+ " raise ValueError(f\"load_model file '{load_model}' does not exist\")\r\n",
716
+ "ValueError: load_model file '../model/v5-L6+6-D2048-E0_01-split-2b.pth' does not exist\r\n"
717
+ ]
718
+ },
719
+ {
720
+ "name": "stdout",
721
+ "output_type": "stream",
722
+ "text": [
723
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Waiting for W&B process to finish... \u001b[31m(failed 1).\u001b[0m Press Control-C to abort syncing.\r\n"
724
+ ]
725
+ },
726
+ {
727
+ "name": "stdout",
728
+ "output_type": "stream",
729
+ "text": [
730
+ "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run \u001b[33m[Multi-size] v5-L6+6-D2048-E0.01 - layer-expansion B3 (train-ctx=4k, deepspeed_stage_2_offload)\u001b[0m at: \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/runs/9urskvqu\u001b[0m\r\n",
731
+ "\u001b[34m\u001b[1mwandb\u001b[0m: ️⚡ View job at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjk0OTk4MDcy/version_details/v15\u001b[0m\r\n",
732
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)\r\n",
733
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Find logs at: \u001b[35m\u001b[1m./wandb/run-20231011_070346-9urskvqu/logs\u001b[0m\r\n"
734
+ ]
735
+ }
736
+ ],
737
+ "source": [
738
+ "# Start the foundation model training\n",
739
+ "!cd \"{TRAINER_DIR}\" && \\\n",
740
+ " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n",
741
+ " python3 lightning_trainer.py fit \\\n",
742
+ " -c \"{NOTEBOOK_DIR}/enwiki-4k-part3.yaml\" \\\n",
743
+ " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - layer-expansion B3 (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n",
744
+ " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n",
745
+ " --trainer.devices=\"{GPU_DEVICES}\" \\\n",
746
+ " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-layer-expansion-b3/\" \\\n",
747
+ " --model.load_model=\"../model/{FILENAME_PREFIX}-split-2b.pth\" \\\n",
748
+ " --model.ctx_len=4096 \\\n",
749
+ " --model.bptt_learning_range=1"
750
+ ]
751
+ },
752
+ {
753
+ "cell_type": "code",
754
+ "execution_count": 9,
755
+ "id": "9f50e589",
756
+ "metadata": {
757
+ "execution": {
758
+ "iopub.execute_input": "2023-10-11T07:03:54.533944Z",
759
+ "iopub.status.busy": "2023-10-11T07:03:54.532500Z",
760
+ "iopub.status.idle": "2023-10-11T07:03:58.532096Z",
761
+ "shell.execute_reply": "2023-10-11T07:03:58.530934Z"
762
+ },
763
+ "papermill": {
764
+ "duration": 4.010668,
765
+ "end_time": "2023-10-11T07:03:58.534788",
766
+ "exception": false,
767
+ "start_time": "2023-10-11T07:03:54.524120",
768
+ "status": "completed"
769
+ },
770
+ "tags": []
771
+ },
772
+ "outputs": [
773
+ {
774
+ "name": "stdout",
775
+ "output_type": "stream",
776
+ "text": [
777
+ "[2023-10-11 07:03:57,001] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n"
778
+ ]
779
+ },
780
+ {
781
+ "name": "stdout",
782
+ "output_type": "stream",
783
+ "text": [
784
+ "Traceback (most recent call last):\r\n",
785
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 651, in <module>\r\n",
786
+ " convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, output_file, save_dtype=args.dtype)\r\n",
787
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 542, in convert_zero_checkpoint_to_fp32_state_dict\r\n",
788
+ " state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)\r\n",
789
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 516, in get_fp32_state_dict_from_zero_checkpoint\r\n",
790
+ " raise ValueError(f\"Unable to find 'latest' file at {latest_path}\")\r\n",
791
+ "ValueError: Unable to find 'latest' file at ../checkpoint/v5-L6+6-D2048-E0_01-layer-expansion-b3/last.ckpt/latest\r\n"
792
+ ]
793
+ },
794
+ {
795
+ "name": "stdout",
796
+ "output_type": "stream",
797
+ "text": [
798
+ "ls: cannot access '../model/v5-L6+6-D2048-E0_01-layer-expansion-b3.pth': No such file or directory\r\n"
799
+ ]
800
+ }
801
+ ],
802
+ "source": [
803
+ "# Let's export the model from the checkpoint\n",
804
+ "!cd \"{TRAINER_DIR}\" && \\\n",
805
+ " python3 export_checkpoint.py \"../checkpoint/{FILENAME_PREFIX}-layer-expansion-b3/last.ckpt\" \"../model/{FILENAME_PREFIX}-layer-expansion-b3.pth\" \"bf16\"\n",
806
+ "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-layer-expansion-b3.pth\""
807
+ ]
808
+ },
809
+ {
810
+ "cell_type": "code",
811
+ "execution_count": 10,
812
+ "id": "777a63eb",
813
+ "metadata": {
814
+ "execution": {
815
+ "iopub.execute_input": "2023-10-11T07:03:58.550672Z",
816
+ "iopub.status.busy": "2023-10-11T07:03:58.550018Z",
817
+ "iopub.status.idle": "2023-10-11T07:04:05.053290Z",
818
+ "shell.execute_reply": "2023-10-11T07:04:05.052230Z"
819
+ },
820
+ "papermill": {
821
+ "duration": 6.514677,
822
+ "end_time": "2023-10-11T07:04:05.056026",
823
+ "exception": false,
824
+ "start_time": "2023-10-11T07:03:58.541349",
825
+ "status": "completed"
826
+ },
827
+ "tags": []
828
+ },
829
+ "outputs": [
830
+ {
831
+ "name": "stdout",
832
+ "output_type": "stream",
833
+ "text": [
834
+ "[2023-10-11 07:04:02,965] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n"
835
+ ]
836
+ },
837
+ {
838
+ "name": "stdout",
839
+ "output_type": "stream",
840
+ "text": [
841
+ "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n",
842
+ "Traceback (most recent call last):\r\n",
843
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/dragon_test.py\", line 52, in <module>\r\n",
844
+ " model = SimpleRWKV(MODEL_PATH, device=DEVICE)\r\n",
845
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 1420, in __init__\r\n",
846
+ " self.model = RWKV(**model_config)\r\n",
847
+ " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 566, in __init__\r\n",
848
+ " raise ValueError(f\"load_model file '{load_model}' does not exist\")\r\n",
849
+ "ValueError: load_model file '../model/v5-L6+6-D2048-E0_01-layer-expansion-b3.pth' does not exist\r\n"
850
+ ]
851
+ }
852
+ ],
853
+ "source": [
854
+ "# Let's do a quick dragon prompt validation\n",
855
+ "!cd \"{INFERENCE_DIR}\" && \\\n",
856
+ " python3 dragon_test.py \"../model/{FILENAME_PREFIX}-layer-expansion-b3.pth\" \"cuda fp32\""
857
+ ]
858
+ }
859
+ ],
860
+ "metadata": {
861
+ "kernelspec": {
862
+ "display_name": "Python 3 (ipykernel)",
863
+ "language": "python",
864
+ "name": "python3"
865
+ },
866
+ "language_info": {
867
+ "codemirror_mode": {
868
+ "name": "ipython",
869
+ "version": 3
870
+ },
871
+ "file_extension": ".py",
872
+ "mimetype": "text/x-python",
873
+ "name": "python",
874
+ "nbconvert_exporter": "python",
875
+ "pygments_lexer": "ipython3",
876
+ "version": "3.10.12"
877
+ },
878
+ "papermill": {
879
+ "default_parameters": {},
880
+ "duration": 66.115721,
881
+ "end_time": "2023-10-11T07:04:05.485131",
882
+ "environment_variables": {},
883
+ "exception": null,
884
+ "input_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/v5-L6+6-D2048-layer-baseline.ipynb",
885
+ "output_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/experiment/rwkv-x-exp/multi-size-train/v5-L6+6-D2048-layer-baseline.ipynb",
886
+ "parameters": {},
887
+ "start_time": "2023-10-11T07:02:59.369410",
888
+ "version": "2.4.0"
889
+ }
890
+ },
891
+ "nbformat": 4,
892
+ "nbformat_minor": 5
893
+ }