fasterinnerlooper committed on
Commit
4845b9c
·
verified ·
1 Parent(s): 4945291

Upload folder using huggingface_hub

Browse files
config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "Salesforce/codet5p-220m",
3
+ "architectures": [
4
+ "T5ForConditionalGeneration"
5
+ ],
6
+ "bos_token_id": 1,
7
+ "classifier_dropout": 0.0,
8
+ "d_ff": 3072,
9
+ "d_kv": 64,
10
+ "d_model": 768,
11
+ "decoder_start_token_id": 0,
12
+ "dense_act_fn": "relu",
13
+ "device_map": "auto",
14
+ "dropout_rate": 0.1,
15
+ "eos_token_id": 2,
16
+ "feed_forward_proj": "relu",
17
+ "initializer_factor": 1.0,
18
+ "is_encoder_decoder": true,
19
+ "is_gated_act": false,
20
+ "layer_norm_epsilon": 1e-06,
21
+ "model_type": "t5",
22
+ "n_positions": 512,
23
+ "num_decoder_layers": 12,
24
+ "num_heads": 12,
25
+ "num_layers": 12,
26
+ "output_past": true,
27
+ "pad_token_id": 0,
28
+ "relative_attention_max_distance": 128,
29
+ "relative_attention_num_buckets": 32,
30
+ "torch_dtype": "float32",
31
+ "transformers_version": "4.38.2",
32
+ "use_cache": false,
33
+ "vocab_size": 32100
34
+ }
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "decoder_start_token_id": 0,
5
+ "eos_token_id": 2,
6
+ "pad_token_id": 0,
7
+ "transformers_version": "4.38.2",
8
+ "use_cache": false
9
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be9d15935068c9e1c1bbf639b8d926fa3ddc169e21c701f0f232d1211007a9ef
3
+ size 891558696
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea4e9db6603d9237f15556c79dfa8010d5ebd5e09900be89ee5f8be19ebd96c7
3
+ size 447691160
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a31d7f648f6de0e22201444447661f9868e75ea6c8fcfdddaf19b8d926606c13
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e765c138857d0512f08fccf3f80d6b56577aa6ed602bf5213da46cd1932b86bc
3
+ size 1064
special_tokens_map.json ADDED
@@ -0,0 +1,753 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<extra_id_99>",
5
+ "lstrip": true,
6
+ "normalized": true,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "<extra_id_98>",
12
+ "lstrip": true,
13
+ "normalized": true,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
+ {
18
+ "content": "<extra_id_97>",
19
+ "lstrip": true,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ {
25
+ "content": "<extra_id_96>",
26
+ "lstrip": true,
27
+ "normalized": true,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ },
31
+ {
32
+ "content": "<extra_id_95>",
33
+ "lstrip": true,
34
+ "normalized": true,
35
+ "rstrip": false,
36
+ "single_word": false
37
+ },
38
+ {
39
+ "content": "<extra_id_94>",
40
+ "lstrip": true,
41
+ "normalized": true,
42
+ "rstrip": false,
43
+ "single_word": false
44
+ },
45
+ {
46
+ "content": "<extra_id_93>",
47
+ "lstrip": true,
48
+ "normalized": true,
49
+ "rstrip": false,
50
+ "single_word": false
51
+ },
52
+ {
53
+ "content": "<extra_id_92>",
54
+ "lstrip": true,
55
+ "normalized": true,
56
+ "rstrip": false,
57
+ "single_word": false
58
+ },
59
+ {
60
+ "content": "<extra_id_91>",
61
+ "lstrip": true,
62
+ "normalized": true,
63
+ "rstrip": false,
64
+ "single_word": false
65
+ },
66
+ {
67
+ "content": "<extra_id_90>",
68
+ "lstrip": true,
69
+ "normalized": true,
70
+ "rstrip": false,
71
+ "single_word": false
72
+ },
73
+ {
74
+ "content": "<extra_id_89>",
75
+ "lstrip": true,
76
+ "normalized": true,
77
+ "rstrip": false,
78
+ "single_word": false
79
+ },
80
+ {
81
+ "content": "<extra_id_88>",
82
+ "lstrip": true,
83
+ "normalized": true,
84
+ "rstrip": false,
85
+ "single_word": false
86
+ },
87
+ {
88
+ "content": "<extra_id_87>",
89
+ "lstrip": true,
90
+ "normalized": true,
91
+ "rstrip": false,
92
+ "single_word": false
93
+ },
94
+ {
95
+ "content": "<extra_id_86>",
96
+ "lstrip": true,
97
+ "normalized": true,
98
+ "rstrip": false,
99
+ "single_word": false
100
+ },
101
+ {
102
+ "content": "<extra_id_85>",
103
+ "lstrip": true,
104
+ "normalized": true,
105
+ "rstrip": false,
106
+ "single_word": false
107
+ },
108
+ {
109
+ "content": "<extra_id_84>",
110
+ "lstrip": true,
111
+ "normalized": true,
112
+ "rstrip": false,
113
+ "single_word": false
114
+ },
115
+ {
116
+ "content": "<extra_id_83>",
117
+ "lstrip": true,
118
+ "normalized": true,
119
+ "rstrip": false,
120
+ "single_word": false
121
+ },
122
+ {
123
+ "content": "<extra_id_82>",
124
+ "lstrip": true,
125
+ "normalized": true,
126
+ "rstrip": false,
127
+ "single_word": false
128
+ },
129
+ {
130
+ "content": "<extra_id_81>",
131
+ "lstrip": true,
132
+ "normalized": true,
133
+ "rstrip": false,
134
+ "single_word": false
135
+ },
136
+ {
137
+ "content": "<extra_id_80>",
138
+ "lstrip": true,
139
+ "normalized": true,
140
+ "rstrip": false,
141
+ "single_word": false
142
+ },
143
+ {
144
+ "content": "<extra_id_79>",
145
+ "lstrip": true,
146
+ "normalized": true,
147
+ "rstrip": false,
148
+ "single_word": false
149
+ },
150
+ {
151
+ "content": "<extra_id_78>",
152
+ "lstrip": true,
153
+ "normalized": true,
154
+ "rstrip": false,
155
+ "single_word": false
156
+ },
157
+ {
158
+ "content": "<extra_id_77>",
159
+ "lstrip": true,
160
+ "normalized": true,
161
+ "rstrip": false,
162
+ "single_word": false
163
+ },
164
+ {
165
+ "content": "<extra_id_76>",
166
+ "lstrip": true,
167
+ "normalized": true,
168
+ "rstrip": false,
169
+ "single_word": false
170
+ },
171
+ {
172
+ "content": "<extra_id_75>",
173
+ "lstrip": true,
174
+ "normalized": true,
175
+ "rstrip": false,
176
+ "single_word": false
177
+ },
178
+ {
179
+ "content": "<extra_id_74>",
180
+ "lstrip": true,
181
+ "normalized": true,
182
+ "rstrip": false,
183
+ "single_word": false
184
+ },
185
+ {
186
+ "content": "<extra_id_73>",
187
+ "lstrip": true,
188
+ "normalized": true,
189
+ "rstrip": false,
190
+ "single_word": false
191
+ },
192
+ {
193
+ "content": "<extra_id_72>",
194
+ "lstrip": true,
195
+ "normalized": true,
196
+ "rstrip": false,
197
+ "single_word": false
198
+ },
199
+ {
200
+ "content": "<extra_id_71>",
201
+ "lstrip": true,
202
+ "normalized": true,
203
+ "rstrip": false,
204
+ "single_word": false
205
+ },
206
+ {
207
+ "content": "<extra_id_70>",
208
+ "lstrip": true,
209
+ "normalized": true,
210
+ "rstrip": false,
211
+ "single_word": false
212
+ },
213
+ {
214
+ "content": "<extra_id_69>",
215
+ "lstrip": true,
216
+ "normalized": true,
217
+ "rstrip": false,
218
+ "single_word": false
219
+ },
220
+ {
221
+ "content": "<extra_id_68>",
222
+ "lstrip": true,
223
+ "normalized": true,
224
+ "rstrip": false,
225
+ "single_word": false
226
+ },
227
+ {
228
+ "content": "<extra_id_67>",
229
+ "lstrip": true,
230
+ "normalized": true,
231
+ "rstrip": false,
232
+ "single_word": false
233
+ },
234
+ {
235
+ "content": "<extra_id_66>",
236
+ "lstrip": true,
237
+ "normalized": true,
238
+ "rstrip": false,
239
+ "single_word": false
240
+ },
241
+ {
242
+ "content": "<extra_id_65>",
243
+ "lstrip": true,
244
+ "normalized": true,
245
+ "rstrip": false,
246
+ "single_word": false
247
+ },
248
+ {
249
+ "content": "<extra_id_64>",
250
+ "lstrip": true,
251
+ "normalized": true,
252
+ "rstrip": false,
253
+ "single_word": false
254
+ },
255
+ {
256
+ "content": "<extra_id_63>",
257
+ "lstrip": true,
258
+ "normalized": true,
259
+ "rstrip": false,
260
+ "single_word": false
261
+ },
262
+ {
263
+ "content": "<extra_id_62>",
264
+ "lstrip": true,
265
+ "normalized": true,
266
+ "rstrip": false,
267
+ "single_word": false
268
+ },
269
+ {
270
+ "content": "<extra_id_61>",
271
+ "lstrip": true,
272
+ "normalized": true,
273
+ "rstrip": false,
274
+ "single_word": false
275
+ },
276
+ {
277
+ "content": "<extra_id_60>",
278
+ "lstrip": true,
279
+ "normalized": true,
280
+ "rstrip": false,
281
+ "single_word": false
282
+ },
283
+ {
284
+ "content": "<extra_id_59>",
285
+ "lstrip": true,
286
+ "normalized": true,
287
+ "rstrip": false,
288
+ "single_word": false
289
+ },
290
+ {
291
+ "content": "<extra_id_58>",
292
+ "lstrip": true,
293
+ "normalized": true,
294
+ "rstrip": false,
295
+ "single_word": false
296
+ },
297
+ {
298
+ "content": "<extra_id_57>",
299
+ "lstrip": true,
300
+ "normalized": true,
301
+ "rstrip": false,
302
+ "single_word": false
303
+ },
304
+ {
305
+ "content": "<extra_id_56>",
306
+ "lstrip": true,
307
+ "normalized": true,
308
+ "rstrip": false,
309
+ "single_word": false
310
+ },
311
+ {
312
+ "content": "<extra_id_55>",
313
+ "lstrip": true,
314
+ "normalized": true,
315
+ "rstrip": false,
316
+ "single_word": false
317
+ },
318
+ {
319
+ "content": "<extra_id_54>",
320
+ "lstrip": true,
321
+ "normalized": true,
322
+ "rstrip": false,
323
+ "single_word": false
324
+ },
325
+ {
326
+ "content": "<extra_id_53>",
327
+ "lstrip": true,
328
+ "normalized": true,
329
+ "rstrip": false,
330
+ "single_word": false
331
+ },
332
+ {
333
+ "content": "<extra_id_52>",
334
+ "lstrip": true,
335
+ "normalized": true,
336
+ "rstrip": false,
337
+ "single_word": false
338
+ },
339
+ {
340
+ "content": "<extra_id_51>",
341
+ "lstrip": true,
342
+ "normalized": true,
343
+ "rstrip": false,
344
+ "single_word": false
345
+ },
346
+ {
347
+ "content": "<extra_id_50>",
348
+ "lstrip": true,
349
+ "normalized": true,
350
+ "rstrip": false,
351
+ "single_word": false
352
+ },
353
+ {
354
+ "content": "<extra_id_49>",
355
+ "lstrip": true,
356
+ "normalized": true,
357
+ "rstrip": false,
358
+ "single_word": false
359
+ },
360
+ {
361
+ "content": "<extra_id_48>",
362
+ "lstrip": true,
363
+ "normalized": true,
364
+ "rstrip": false,
365
+ "single_word": false
366
+ },
367
+ {
368
+ "content": "<extra_id_47>",
369
+ "lstrip": true,
370
+ "normalized": true,
371
+ "rstrip": false,
372
+ "single_word": false
373
+ },
374
+ {
375
+ "content": "<extra_id_46>",
376
+ "lstrip": true,
377
+ "normalized": true,
378
+ "rstrip": false,
379
+ "single_word": false
380
+ },
381
+ {
382
+ "content": "<extra_id_45>",
383
+ "lstrip": true,
384
+ "normalized": true,
385
+ "rstrip": false,
386
+ "single_word": false
387
+ },
388
+ {
389
+ "content": "<extra_id_44>",
390
+ "lstrip": true,
391
+ "normalized": true,
392
+ "rstrip": false,
393
+ "single_word": false
394
+ },
395
+ {
396
+ "content": "<extra_id_43>",
397
+ "lstrip": true,
398
+ "normalized": true,
399
+ "rstrip": false,
400
+ "single_word": false
401
+ },
402
+ {
403
+ "content": "<extra_id_42>",
404
+ "lstrip": true,
405
+ "normalized": true,
406
+ "rstrip": false,
407
+ "single_word": false
408
+ },
409
+ {
410
+ "content": "<extra_id_41>",
411
+ "lstrip": true,
412
+ "normalized": true,
413
+ "rstrip": false,
414
+ "single_word": false
415
+ },
416
+ {
417
+ "content": "<extra_id_40>",
418
+ "lstrip": true,
419
+ "normalized": true,
420
+ "rstrip": false,
421
+ "single_word": false
422
+ },
423
+ {
424
+ "content": "<extra_id_39>",
425
+ "lstrip": true,
426
+ "normalized": true,
427
+ "rstrip": false,
428
+ "single_word": false
429
+ },
430
+ {
431
+ "content": "<extra_id_38>",
432
+ "lstrip": true,
433
+ "normalized": true,
434
+ "rstrip": false,
435
+ "single_word": false
436
+ },
437
+ {
438
+ "content": "<extra_id_37>",
439
+ "lstrip": true,
440
+ "normalized": true,
441
+ "rstrip": false,
442
+ "single_word": false
443
+ },
444
+ {
445
+ "content": "<extra_id_36>",
446
+ "lstrip": true,
447
+ "normalized": true,
448
+ "rstrip": false,
449
+ "single_word": false
450
+ },
451
+ {
452
+ "content": "<extra_id_35>",
453
+ "lstrip": true,
454
+ "normalized": true,
455
+ "rstrip": false,
456
+ "single_word": false
457
+ },
458
+ {
459
+ "content": "<extra_id_34>",
460
+ "lstrip": true,
461
+ "normalized": true,
462
+ "rstrip": false,
463
+ "single_word": false
464
+ },
465
+ {
466
+ "content": "<extra_id_33>",
467
+ "lstrip": true,
468
+ "normalized": true,
469
+ "rstrip": false,
470
+ "single_word": false
471
+ },
472
+ {
473
+ "content": "<extra_id_32>",
474
+ "lstrip": true,
475
+ "normalized": true,
476
+ "rstrip": false,
477
+ "single_word": false
478
+ },
479
+ {
480
+ "content": "<extra_id_31>",
481
+ "lstrip": true,
482
+ "normalized": true,
483
+ "rstrip": false,
484
+ "single_word": false
485
+ },
486
+ {
487
+ "content": "<extra_id_30>",
488
+ "lstrip": true,
489
+ "normalized": true,
490
+ "rstrip": false,
491
+ "single_word": false
492
+ },
493
+ {
494
+ "content": "<extra_id_29>",
495
+ "lstrip": true,
496
+ "normalized": true,
497
+ "rstrip": false,
498
+ "single_word": false
499
+ },
500
+ {
501
+ "content": "<extra_id_28>",
502
+ "lstrip": true,
503
+ "normalized": true,
504
+ "rstrip": false,
505
+ "single_word": false
506
+ },
507
+ {
508
+ "content": "<extra_id_27>",
509
+ "lstrip": true,
510
+ "normalized": true,
511
+ "rstrip": false,
512
+ "single_word": false
513
+ },
514
+ {
515
+ "content": "<extra_id_26>",
516
+ "lstrip": true,
517
+ "normalized": true,
518
+ "rstrip": false,
519
+ "single_word": false
520
+ },
521
+ {
522
+ "content": "<extra_id_25>",
523
+ "lstrip": true,
524
+ "normalized": true,
525
+ "rstrip": false,
526
+ "single_word": false
527
+ },
528
+ {
529
+ "content": "<extra_id_24>",
530
+ "lstrip": true,
531
+ "normalized": true,
532
+ "rstrip": false,
533
+ "single_word": false
534
+ },
535
+ {
536
+ "content": "<extra_id_23>",
537
+ "lstrip": true,
538
+ "normalized": true,
539
+ "rstrip": false,
540
+ "single_word": false
541
+ },
542
+ {
543
+ "content": "<extra_id_22>",
544
+ "lstrip": true,
545
+ "normalized": true,
546
+ "rstrip": false,
547
+ "single_word": false
548
+ },
549
+ {
550
+ "content": "<extra_id_21>",
551
+ "lstrip": true,
552
+ "normalized": true,
553
+ "rstrip": false,
554
+ "single_word": false
555
+ },
556
+ {
557
+ "content": "<extra_id_20>",
558
+ "lstrip": true,
559
+ "normalized": true,
560
+ "rstrip": false,
561
+ "single_word": false
562
+ },
563
+ {
564
+ "content": "<extra_id_19>",
565
+ "lstrip": true,
566
+ "normalized": true,
567
+ "rstrip": false,
568
+ "single_word": false
569
+ },
570
+ {
571
+ "content": "<extra_id_18>",
572
+ "lstrip": true,
573
+ "normalized": true,
574
+ "rstrip": false,
575
+ "single_word": false
576
+ },
577
+ {
578
+ "content": "<extra_id_17>",
579
+ "lstrip": true,
580
+ "normalized": true,
581
+ "rstrip": false,
582
+ "single_word": false
583
+ },
584
+ {
585
+ "content": "<extra_id_16>",
586
+ "lstrip": true,
587
+ "normalized": true,
588
+ "rstrip": false,
589
+ "single_word": false
590
+ },
591
+ {
592
+ "content": "<extra_id_15>",
593
+ "lstrip": true,
594
+ "normalized": true,
595
+ "rstrip": false,
596
+ "single_word": false
597
+ },
598
+ {
599
+ "content": "<extra_id_14>",
600
+ "lstrip": true,
601
+ "normalized": true,
602
+ "rstrip": false,
603
+ "single_word": false
604
+ },
605
+ {
606
+ "content": "<extra_id_13>",
607
+ "lstrip": true,
608
+ "normalized": true,
609
+ "rstrip": false,
610
+ "single_word": false
611
+ },
612
+ {
613
+ "content": "<extra_id_12>",
614
+ "lstrip": true,
615
+ "normalized": true,
616
+ "rstrip": false,
617
+ "single_word": false
618
+ },
619
+ {
620
+ "content": "<extra_id_11>",
621
+ "lstrip": true,
622
+ "normalized": true,
623
+ "rstrip": false,
624
+ "single_word": false
625
+ },
626
+ {
627
+ "content": "<extra_id_10>",
628
+ "lstrip": true,
629
+ "normalized": true,
630
+ "rstrip": false,
631
+ "single_word": false
632
+ },
633
+ {
634
+ "content": "<extra_id_9>",
635
+ "lstrip": true,
636
+ "normalized": true,
637
+ "rstrip": false,
638
+ "single_word": false
639
+ },
640
+ {
641
+ "content": "<extra_id_8>",
642
+ "lstrip": true,
643
+ "normalized": true,
644
+ "rstrip": false,
645
+ "single_word": false
646
+ },
647
+ {
648
+ "content": "<extra_id_7>",
649
+ "lstrip": true,
650
+ "normalized": true,
651
+ "rstrip": false,
652
+ "single_word": false
653
+ },
654
+ {
655
+ "content": "<extra_id_6>",
656
+ "lstrip": true,
657
+ "normalized": true,
658
+ "rstrip": false,
659
+ "single_word": false
660
+ },
661
+ {
662
+ "content": "<extra_id_5>",
663
+ "lstrip": true,
664
+ "normalized": true,
665
+ "rstrip": false,
666
+ "single_word": false
667
+ },
668
+ {
669
+ "content": "<extra_id_4>",
670
+ "lstrip": true,
671
+ "normalized": true,
672
+ "rstrip": false,
673
+ "single_word": false
674
+ },
675
+ {
676
+ "content": "<extra_id_3>",
677
+ "lstrip": true,
678
+ "normalized": true,
679
+ "rstrip": false,
680
+ "single_word": false
681
+ },
682
+ {
683
+ "content": "<extra_id_2>",
684
+ "lstrip": true,
685
+ "normalized": true,
686
+ "rstrip": false,
687
+ "single_word": false
688
+ },
689
+ {
690
+ "content": "<extra_id_1>",
691
+ "lstrip": true,
692
+ "normalized": true,
693
+ "rstrip": false,
694
+ "single_word": false
695
+ },
696
+ {
697
+ "content": "<extra_id_0>",
698
+ "lstrip": true,
699
+ "normalized": true,
700
+ "rstrip": false,
701
+ "single_word": false
702
+ }
703
+ ],
704
+ "bos_token": {
705
+ "content": "<s>",
706
+ "lstrip": false,
707
+ "normalized": true,
708
+ "rstrip": false,
709
+ "single_word": false
710
+ },
711
+ "cls_token": {
712
+ "content": "<s>",
713
+ "lstrip": false,
714
+ "normalized": true,
715
+ "rstrip": false,
716
+ "single_word": false
717
+ },
718
+ "eos_token": {
719
+ "content": "</s>",
720
+ "lstrip": false,
721
+ "normalized": true,
722
+ "rstrip": false,
723
+ "single_word": false
724
+ },
725
+ "mask_token": {
726
+ "content": "<mask>",
727
+ "lstrip": true,
728
+ "normalized": true,
729
+ "rstrip": false,
730
+ "single_word": false
731
+ },
732
+ "pad_token": {
733
+ "content": "<pad>",
734
+ "lstrip": false,
735
+ "normalized": true,
736
+ "rstrip": false,
737
+ "single_word": false
738
+ },
739
+ "sep_token": {
740
+ "content": "</s>",
741
+ "lstrip": false,
742
+ "normalized": true,
743
+ "rstrip": false,
744
+ "single_word": false
745
+ },
746
+ "unk_token": {
747
+ "content": "<unk>",
748
+ "lstrip": false,
749
+ "normalized": true,
750
+ "rstrip": false,
751
+ "single_word": false
752
+ }
753
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,959 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<pad>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<s>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "4": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": true,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "32000": {
45
+ "content": "<extra_id_99>",
46
+ "lstrip": true,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "32001": {
53
+ "content": "<extra_id_98>",
54
+ "lstrip": true,
55
+ "normalized": true,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "32002": {
61
+ "content": "<extra_id_97>",
62
+ "lstrip": true,
63
+ "normalized": true,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "32003": {
69
+ "content": "<extra_id_96>",
70
+ "lstrip": true,
71
+ "normalized": true,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "32004": {
77
+ "content": "<extra_id_95>",
78
+ "lstrip": true,
79
+ "normalized": true,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "32005": {
85
+ "content": "<extra_id_94>",
86
+ "lstrip": true,
87
+ "normalized": true,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "32006": {
93
+ "content": "<extra_id_93>",
94
+ "lstrip": true,
95
+ "normalized": true,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "32007": {
101
+ "content": "<extra_id_92>",
102
+ "lstrip": true,
103
+ "normalized": true,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "32008": {
109
+ "content": "<extra_id_91>",
110
+ "lstrip": true,
111
+ "normalized": true,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "32009": {
117
+ "content": "<extra_id_90>",
118
+ "lstrip": true,
119
+ "normalized": true,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "32010": {
125
+ "content": "<extra_id_89>",
126
+ "lstrip": true,
127
+ "normalized": true,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "32011": {
133
+ "content": "<extra_id_88>",
134
+ "lstrip": true,
135
+ "normalized": true,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ },
140
+ "32012": {
141
+ "content": "<extra_id_87>",
142
+ "lstrip": true,
143
+ "normalized": true,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": true
147
+ },
148
+ "32013": {
149
+ "content": "<extra_id_86>",
150
+ "lstrip": true,
151
+ "normalized": true,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": true
155
+ },
156
+ "32014": {
157
+ "content": "<extra_id_85>",
158
+ "lstrip": true,
159
+ "normalized": true,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": true
163
+ },
164
+ "32015": {
165
+ "content": "<extra_id_84>",
166
+ "lstrip": true,
167
+ "normalized": true,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": true
171
+ },
172
+ "32016": {
173
+ "content": "<extra_id_83>",
174
+ "lstrip": true,
175
+ "normalized": true,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": true
179
+ },
180
+ "32017": {
181
+ "content": "<extra_id_82>",
182
+ "lstrip": true,
183
+ "normalized": true,
184
+ "rstrip": false,
185
+ "single_word": false,
186
+ "special": true
187
+ },
188
+ "32018": {
189
+ "content": "<extra_id_81>",
190
+ "lstrip": true,
191
+ "normalized": true,
192
+ "rstrip": false,
193
+ "single_word": false,
194
+ "special": true
195
+ },
196
+ "32019": {
197
+ "content": "<extra_id_80>",
198
+ "lstrip": true,
199
+ "normalized": true,
200
+ "rstrip": false,
201
+ "single_word": false,
202
+ "special": true
203
+ },
204
+ "32020": {
205
+ "content": "<extra_id_79>",
206
+ "lstrip": true,
207
+ "normalized": true,
208
+ "rstrip": false,
209
+ "single_word": false,
210
+ "special": true
211
+ },
212
+ "32021": {
213
+ "content": "<extra_id_78>",
214
+ "lstrip": true,
215
+ "normalized": true,
216
+ "rstrip": false,
217
+ "single_word": false,
218
+ "special": true
219
+ },
220
+ "32022": {
221
+ "content": "<extra_id_77>",
222
+ "lstrip": true,
223
+ "normalized": true,
224
+ "rstrip": false,
225
+ "single_word": false,
226
+ "special": true
227
+ },
228
+ "32023": {
229
+ "content": "<extra_id_76>",
230
+ "lstrip": true,
231
+ "normalized": true,
232
+ "rstrip": false,
233
+ "single_word": false,
234
+ "special": true
235
+ },
236
+ "32024": {
237
+ "content": "<extra_id_75>",
238
+ "lstrip": true,
239
+ "normalized": true,
240
+ "rstrip": false,
241
+ "single_word": false,
242
+ "special": true
243
+ },
244
+ "32025": {
245
+ "content": "<extra_id_74>",
246
+ "lstrip": true,
247
+ "normalized": true,
248
+ "rstrip": false,
249
+ "single_word": false,
250
+ "special": true
251
+ },
252
+ "32026": {
253
+ "content": "<extra_id_73>",
254
+ "lstrip": true,
255
+ "normalized": true,
256
+ "rstrip": false,
257
+ "single_word": false,
258
+ "special": true
259
+ },
260
+ "32027": {
261
+ "content": "<extra_id_72>",
262
+ "lstrip": true,
263
+ "normalized": true,
264
+ "rstrip": false,
265
+ "single_word": false,
266
+ "special": true
267
+ },
268
+ "32028": {
269
+ "content": "<extra_id_71>",
270
+ "lstrip": true,
271
+ "normalized": true,
272
+ "rstrip": false,
273
+ "single_word": false,
274
+ "special": true
275
+ },
276
+ "32029": {
277
+ "content": "<extra_id_70>",
278
+ "lstrip": true,
279
+ "normalized": true,
280
+ "rstrip": false,
281
+ "single_word": false,
282
+ "special": true
283
+ },
284
+ "32030": {
285
+ "content": "<extra_id_69>",
286
+ "lstrip": true,
287
+ "normalized": true,
288
+ "rstrip": false,
289
+ "single_word": false,
290
+ "special": true
291
+ },
292
+ "32031": {
293
+ "content": "<extra_id_68>",
294
+ "lstrip": true,
295
+ "normalized": true,
296
+ "rstrip": false,
297
+ "single_word": false,
298
+ "special": true
299
+ },
300
+ "32032": {
301
+ "content": "<extra_id_67>",
302
+ "lstrip": true,
303
+ "normalized": true,
304
+ "rstrip": false,
305
+ "single_word": false,
306
+ "special": true
307
+ },
308
+ "32033": {
309
+ "content": "<extra_id_66>",
310
+ "lstrip": true,
311
+ "normalized": true,
312
+ "rstrip": false,
313
+ "single_word": false,
314
+ "special": true
315
+ },
316
+ "32034": {
317
+ "content": "<extra_id_65>",
318
+ "lstrip": true,
319
+ "normalized": true,
320
+ "rstrip": false,
321
+ "single_word": false,
322
+ "special": true
323
+ },
324
+ "32035": {
325
+ "content": "<extra_id_64>",
326
+ "lstrip": true,
327
+ "normalized": true,
328
+ "rstrip": false,
329
+ "single_word": false,
330
+ "special": true
331
+ },
332
+ "32036": {
333
+ "content": "<extra_id_63>",
334
+ "lstrip": true,
335
+ "normalized": true,
336
+ "rstrip": false,
337
+ "single_word": false,
338
+ "special": true
339
+ },
340
+ "32037": {
341
+ "content": "<extra_id_62>",
342
+ "lstrip": true,
343
+ "normalized": true,
344
+ "rstrip": false,
345
+ "single_word": false,
346
+ "special": true
347
+ },
348
+ "32038": {
349
+ "content": "<extra_id_61>",
350
+ "lstrip": true,
351
+ "normalized": true,
352
+ "rstrip": false,
353
+ "single_word": false,
354
+ "special": true
355
+ },
356
+ "32039": {
357
+ "content": "<extra_id_60>",
358
+ "lstrip": true,
359
+ "normalized": true,
360
+ "rstrip": false,
361
+ "single_word": false,
362
+ "special": true
363
+ },
364
+ "32040": {
365
+ "content": "<extra_id_59>",
366
+ "lstrip": true,
367
+ "normalized": true,
368
+ "rstrip": false,
369
+ "single_word": false,
370
+ "special": true
371
+ },
372
+ "32041": {
373
+ "content": "<extra_id_58>",
374
+ "lstrip": true,
375
+ "normalized": true,
376
+ "rstrip": false,
377
+ "single_word": false,
378
+ "special": true
379
+ },
380
+ "32042": {
381
+ "content": "<extra_id_57>",
382
+ "lstrip": true,
383
+ "normalized": true,
384
+ "rstrip": false,
385
+ "single_word": false,
386
+ "special": true
387
+ },
388
+ "32043": {
389
+ "content": "<extra_id_56>",
390
+ "lstrip": true,
391
+ "normalized": true,
392
+ "rstrip": false,
393
+ "single_word": false,
394
+ "special": true
395
+ },
396
+ "32044": {
397
+ "content": "<extra_id_55>",
398
+ "lstrip": true,
399
+ "normalized": true,
400
+ "rstrip": false,
401
+ "single_word": false,
402
+ "special": true
403
+ },
404
+ "32045": {
405
+ "content": "<extra_id_54>",
406
+ "lstrip": true,
407
+ "normalized": true,
408
+ "rstrip": false,
409
+ "single_word": false,
410
+ "special": true
411
+ },
412
+ "32046": {
413
+ "content": "<extra_id_53>",
414
+ "lstrip": true,
415
+ "normalized": true,
416
+ "rstrip": false,
417
+ "single_word": false,
418
+ "special": true
419
+ },
420
+ "32047": {
421
+ "content": "<extra_id_52>",
422
+ "lstrip": true,
423
+ "normalized": true,
424
+ "rstrip": false,
425
+ "single_word": false,
426
+ "special": true
427
+ },
428
+ "32048": {
429
+ "content": "<extra_id_51>",
430
+ "lstrip": true,
431
+ "normalized": true,
432
+ "rstrip": false,
433
+ "single_word": false,
434
+ "special": true
435
+ },
436
+ "32049": {
437
+ "content": "<extra_id_50>",
438
+ "lstrip": true,
439
+ "normalized": true,
440
+ "rstrip": false,
441
+ "single_word": false,
442
+ "special": true
443
+ },
444
+ "32050": {
445
+ "content": "<extra_id_49>",
446
+ "lstrip": true,
447
+ "normalized": true,
448
+ "rstrip": false,
449
+ "single_word": false,
450
+ "special": true
451
+ },
452
+ "32051": {
453
+ "content": "<extra_id_48>",
454
+ "lstrip": true,
455
+ "normalized": true,
456
+ "rstrip": false,
457
+ "single_word": false,
458
+ "special": true
459
+ },
460
+ "32052": {
461
+ "content": "<extra_id_47>",
462
+ "lstrip": true,
463
+ "normalized": true,
464
+ "rstrip": false,
465
+ "single_word": false,
466
+ "special": true
467
+ },
468
+ "32053": {
469
+ "content": "<extra_id_46>",
470
+ "lstrip": true,
471
+ "normalized": true,
472
+ "rstrip": false,
473
+ "single_word": false,
474
+ "special": true
475
+ },
476
+ "32054": {
477
+ "content": "<extra_id_45>",
478
+ "lstrip": true,
479
+ "normalized": true,
480
+ "rstrip": false,
481
+ "single_word": false,
482
+ "special": true
483
+ },
484
+ "32055": {
485
+ "content": "<extra_id_44>",
486
+ "lstrip": true,
487
+ "normalized": true,
488
+ "rstrip": false,
489
+ "single_word": false,
490
+ "special": true
491
+ },
492
+ "32056": {
493
+ "content": "<extra_id_43>",
494
+ "lstrip": true,
495
+ "normalized": true,
496
+ "rstrip": false,
497
+ "single_word": false,
498
+ "special": true
499
+ },
500
+ "32057": {
501
+ "content": "<extra_id_42>",
502
+ "lstrip": true,
503
+ "normalized": true,
504
+ "rstrip": false,
505
+ "single_word": false,
506
+ "special": true
507
+ },
508
+ "32058": {
509
+ "content": "<extra_id_41>",
510
+ "lstrip": true,
511
+ "normalized": true,
512
+ "rstrip": false,
513
+ "single_word": false,
514
+ "special": true
515
+ },
516
+ "32059": {
517
+ "content": "<extra_id_40>",
518
+ "lstrip": true,
519
+ "normalized": true,
520
+ "rstrip": false,
521
+ "single_word": false,
522
+ "special": true
523
+ },
524
+ "32060": {
525
+ "content": "<extra_id_39>",
526
+ "lstrip": true,
527
+ "normalized": true,
528
+ "rstrip": false,
529
+ "single_word": false,
530
+ "special": true
531
+ },
532
+ "32061": {
533
+ "content": "<extra_id_38>",
534
+ "lstrip": true,
535
+ "normalized": true,
536
+ "rstrip": false,
537
+ "single_word": false,
538
+ "special": true
539
+ },
540
+ "32062": {
541
+ "content": "<extra_id_37>",
542
+ "lstrip": true,
543
+ "normalized": true,
544
+ "rstrip": false,
545
+ "single_word": false,
546
+ "special": true
547
+ },
548
+ "32063": {
549
+ "content": "<extra_id_36>",
550
+ "lstrip": true,
551
+ "normalized": true,
552
+ "rstrip": false,
553
+ "single_word": false,
554
+ "special": true
555
+ },
556
+ "32064": {
557
+ "content": "<extra_id_35>",
558
+ "lstrip": true,
559
+ "normalized": true,
560
+ "rstrip": false,
561
+ "single_word": false,
562
+ "special": true
563
+ },
564
+ "32065": {
565
+ "content": "<extra_id_34>",
566
+ "lstrip": true,
567
+ "normalized": true,
568
+ "rstrip": false,
569
+ "single_word": false,
570
+ "special": true
571
+ },
572
+ "32066": {
573
+ "content": "<extra_id_33>",
574
+ "lstrip": true,
575
+ "normalized": true,
576
+ "rstrip": false,
577
+ "single_word": false,
578
+ "special": true
579
+ },
580
+ "32067": {
581
+ "content": "<extra_id_32>",
582
+ "lstrip": true,
583
+ "normalized": true,
584
+ "rstrip": false,
585
+ "single_word": false,
586
+ "special": true
587
+ },
588
+ "32068": {
589
+ "content": "<extra_id_31>",
590
+ "lstrip": true,
591
+ "normalized": true,
592
+ "rstrip": false,
593
+ "single_word": false,
594
+ "special": true
595
+ },
596
+ "32069": {
597
+ "content": "<extra_id_30>",
598
+ "lstrip": true,
599
+ "normalized": true,
600
+ "rstrip": false,
601
+ "single_word": false,
602
+ "special": true
603
+ },
604
+ "32070": {
605
+ "content": "<extra_id_29>",
606
+ "lstrip": true,
607
+ "normalized": true,
608
+ "rstrip": false,
609
+ "single_word": false,
610
+ "special": true
611
+ },
612
+ "32071": {
613
+ "content": "<extra_id_28>",
614
+ "lstrip": true,
615
+ "normalized": true,
616
+ "rstrip": false,
617
+ "single_word": false,
618
+ "special": true
619
+ },
620
+ "32072": {
621
+ "content": "<extra_id_27>",
622
+ "lstrip": true,
623
+ "normalized": true,
624
+ "rstrip": false,
625
+ "single_word": false,
626
+ "special": true
627
+ },
628
+ "32073": {
629
+ "content": "<extra_id_26>",
630
+ "lstrip": true,
631
+ "normalized": true,
632
+ "rstrip": false,
633
+ "single_word": false,
634
+ "special": true
635
+ },
636
+ "32074": {
637
+ "content": "<extra_id_25>",
638
+ "lstrip": true,
639
+ "normalized": true,
640
+ "rstrip": false,
641
+ "single_word": false,
642
+ "special": true
643
+ },
644
+ "32075": {
645
+ "content": "<extra_id_24>",
646
+ "lstrip": true,
647
+ "normalized": true,
648
+ "rstrip": false,
649
+ "single_word": false,
650
+ "special": true
651
+ },
652
+ "32076": {
653
+ "content": "<extra_id_23>",
654
+ "lstrip": true,
655
+ "normalized": true,
656
+ "rstrip": false,
657
+ "single_word": false,
658
+ "special": true
659
+ },
660
+ "32077": {
661
+ "content": "<extra_id_22>",
662
+ "lstrip": true,
663
+ "normalized": true,
664
+ "rstrip": false,
665
+ "single_word": false,
666
+ "special": true
667
+ },
668
+ "32078": {
669
+ "content": "<extra_id_21>",
670
+ "lstrip": true,
671
+ "normalized": true,
672
+ "rstrip": false,
673
+ "single_word": false,
674
+ "special": true
675
+ },
676
+ "32079": {
677
+ "content": "<extra_id_20>",
678
+ "lstrip": true,
679
+ "normalized": true,
680
+ "rstrip": false,
681
+ "single_word": false,
682
+ "special": true
683
+ },
684
+ "32080": {
685
+ "content": "<extra_id_19>",
686
+ "lstrip": true,
687
+ "normalized": true,
688
+ "rstrip": false,
689
+ "single_word": false,
690
+ "special": true
691
+ },
692
+ "32081": {
693
+ "content": "<extra_id_18>",
694
+ "lstrip": true,
695
+ "normalized": true,
696
+ "rstrip": false,
697
+ "single_word": false,
698
+ "special": true
699
+ },
700
+ "32082": {
701
+ "content": "<extra_id_17>",
702
+ "lstrip": true,
703
+ "normalized": true,
704
+ "rstrip": false,
705
+ "single_word": false,
706
+ "special": true
707
+ },
708
+ "32083": {
709
+ "content": "<extra_id_16>",
710
+ "lstrip": true,
711
+ "normalized": true,
712
+ "rstrip": false,
713
+ "single_word": false,
714
+ "special": true
715
+ },
716
+ "32084": {
717
+ "content": "<extra_id_15>",
718
+ "lstrip": true,
719
+ "normalized": true,
720
+ "rstrip": false,
721
+ "single_word": false,
722
+ "special": true
723
+ },
724
+ "32085": {
725
+ "content": "<extra_id_14>",
726
+ "lstrip": true,
727
+ "normalized": true,
728
+ "rstrip": false,
729
+ "single_word": false,
730
+ "special": true
731
+ },
732
+ "32086": {
733
+ "content": "<extra_id_13>",
734
+ "lstrip": true,
735
+ "normalized": true,
736
+ "rstrip": false,
737
+ "single_word": false,
738
+ "special": true
739
+ },
740
+ "32087": {
741
+ "content": "<extra_id_12>",
742
+ "lstrip": true,
743
+ "normalized": true,
744
+ "rstrip": false,
745
+ "single_word": false,
746
+ "special": true
747
+ },
748
+ "32088": {
749
+ "content": "<extra_id_11>",
750
+ "lstrip": true,
751
+ "normalized": true,
752
+ "rstrip": false,
753
+ "single_word": false,
754
+ "special": true
755
+ },
756
+ "32089": {
757
+ "content": "<extra_id_10>",
758
+ "lstrip": true,
759
+ "normalized": true,
760
+ "rstrip": false,
761
+ "single_word": false,
762
+ "special": true
763
+ },
764
+ "32090": {
765
+ "content": "<extra_id_9>",
766
+ "lstrip": true,
767
+ "normalized": true,
768
+ "rstrip": false,
769
+ "single_word": false,
770
+ "special": true
771
+ },
772
+ "32091": {
773
+ "content": "<extra_id_8>",
774
+ "lstrip": true,
775
+ "normalized": true,
776
+ "rstrip": false,
777
+ "single_word": false,
778
+ "special": true
779
+ },
780
+ "32092": {
781
+ "content": "<extra_id_7>",
782
+ "lstrip": true,
783
+ "normalized": true,
784
+ "rstrip": false,
785
+ "single_word": false,
786
+ "special": true
787
+ },
788
+ "32093": {
789
+ "content": "<extra_id_6>",
790
+ "lstrip": true,
791
+ "normalized": true,
792
+ "rstrip": false,
793
+ "single_word": false,
794
+ "special": true
795
+ },
796
+ "32094": {
797
+ "content": "<extra_id_5>",
798
+ "lstrip": true,
799
+ "normalized": true,
800
+ "rstrip": false,
801
+ "single_word": false,
802
+ "special": true
803
+ },
804
+ "32095": {
805
+ "content": "<extra_id_4>",
806
+ "lstrip": true,
807
+ "normalized": true,
808
+ "rstrip": false,
809
+ "single_word": false,
810
+ "special": true
811
+ },
812
+ "32096": {
813
+ "content": "<extra_id_3>",
814
+ "lstrip": true,
815
+ "normalized": true,
816
+ "rstrip": false,
817
+ "single_word": false,
818
+ "special": true
819
+ },
820
+ "32097": {
821
+ "content": "<extra_id_2>",
822
+ "lstrip": true,
823
+ "normalized": true,
824
+ "rstrip": false,
825
+ "single_word": false,
826
+ "special": true
827
+ },
828
+ "32098": {
829
+ "content": "<extra_id_1>",
830
+ "lstrip": true,
831
+ "normalized": true,
832
+ "rstrip": false,
833
+ "single_word": false,
834
+ "special": true
835
+ },
836
+ "32099": {
837
+ "content": "<extra_id_0>",
838
+ "lstrip": true,
839
+ "normalized": true,
840
+ "rstrip": false,
841
+ "single_word": false,
842
+ "special": true
843
+ }
844
+ },
845
+ "additional_special_tokens": [
846
+ "<extra_id_99>",
847
+ "<extra_id_98>",
848
+ "<extra_id_97>",
849
+ "<extra_id_96>",
850
+ "<extra_id_95>",
851
+ "<extra_id_94>",
852
+ "<extra_id_93>",
853
+ "<extra_id_92>",
854
+ "<extra_id_91>",
855
+ "<extra_id_90>",
856
+ "<extra_id_89>",
857
+ "<extra_id_88>",
858
+ "<extra_id_87>",
859
+ "<extra_id_86>",
860
+ "<extra_id_85>",
861
+ "<extra_id_84>",
862
+ "<extra_id_83>",
863
+ "<extra_id_82>",
864
+ "<extra_id_81>",
865
+ "<extra_id_80>",
866
+ "<extra_id_79>",
867
+ "<extra_id_78>",
868
+ "<extra_id_77>",
869
+ "<extra_id_76>",
870
+ "<extra_id_75>",
871
+ "<extra_id_74>",
872
+ "<extra_id_73>",
873
+ "<extra_id_72>",
874
+ "<extra_id_71>",
875
+ "<extra_id_70>",
876
+ "<extra_id_69>",
877
+ "<extra_id_68>",
878
+ "<extra_id_67>",
879
+ "<extra_id_66>",
880
+ "<extra_id_65>",
881
+ "<extra_id_64>",
882
+ "<extra_id_63>",
883
+ "<extra_id_62>",
884
+ "<extra_id_61>",
885
+ "<extra_id_60>",
886
+ "<extra_id_59>",
887
+ "<extra_id_58>",
888
+ "<extra_id_57>",
889
+ "<extra_id_56>",
890
+ "<extra_id_55>",
891
+ "<extra_id_54>",
892
+ "<extra_id_53>",
893
+ "<extra_id_52>",
894
+ "<extra_id_51>",
895
+ "<extra_id_50>",
896
+ "<extra_id_49>",
897
+ "<extra_id_48>",
898
+ "<extra_id_47>",
899
+ "<extra_id_46>",
900
+ "<extra_id_45>",
901
+ "<extra_id_44>",
902
+ "<extra_id_43>",
903
+ "<extra_id_42>",
904
+ "<extra_id_41>",
905
+ "<extra_id_40>",
906
+ "<extra_id_39>",
907
+ "<extra_id_38>",
908
+ "<extra_id_37>",
909
+ "<extra_id_36>",
910
+ "<extra_id_35>",
911
+ "<extra_id_34>",
912
+ "<extra_id_33>",
913
+ "<extra_id_32>",
914
+ "<extra_id_31>",
915
+ "<extra_id_30>",
916
+ "<extra_id_29>",
917
+ "<extra_id_28>",
918
+ "<extra_id_27>",
919
+ "<extra_id_26>",
920
+ "<extra_id_25>",
921
+ "<extra_id_24>",
922
+ "<extra_id_23>",
923
+ "<extra_id_22>",
924
+ "<extra_id_21>",
925
+ "<extra_id_20>",
926
+ "<extra_id_19>",
927
+ "<extra_id_18>",
928
+ "<extra_id_17>",
929
+ "<extra_id_16>",
930
+ "<extra_id_15>",
931
+ "<extra_id_14>",
932
+ "<extra_id_13>",
933
+ "<extra_id_12>",
934
+ "<extra_id_11>",
935
+ "<extra_id_10>",
936
+ "<extra_id_9>",
937
+ "<extra_id_8>",
938
+ "<extra_id_7>",
939
+ "<extra_id_6>",
940
+ "<extra_id_5>",
941
+ "<extra_id_4>",
942
+ "<extra_id_3>",
943
+ "<extra_id_2>",
944
+ "<extra_id_1>",
945
+ "<extra_id_0>"
946
+ ],
947
+ "bos_token": "<s>",
948
+ "clean_up_tokenization_spaces": true,
949
+ "cls_token": "<s>",
950
+ "eos_token": "</s>",
951
+ "errors": "replace",
952
+ "mask_token": "<mask>",
953
+ "model_max_length": 512,
954
+ "pad_token": "<pad>",
955
+ "sep_token": "</s>",
956
+ "tokenizer_class": "RobertaTokenizer",
957
+ "trim_offsets": true,
958
+ "unk_token": "<unk>"
959
+ }
trainer_state.json ADDED
@@ -0,0 +1,2780 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.1402364451691804,
5
+ "eval_steps": 500,
6
+ "global_step": 31500,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.01,
13
+ "grad_norm": 0.8877573609352112,
14
+ "learning_rate": 9.98641119717353e-06,
15
+ "loss": 2.7747,
16
+ "step": 100
17
+ },
18
+ {
19
+ "epoch": 0.01,
20
+ "grad_norm": 0.891560435295105,
21
+ "learning_rate": 9.97282239434706e-06,
22
+ "loss": 2.3805,
23
+ "step": 200
24
+ },
25
+ {
26
+ "epoch": 0.02,
27
+ "grad_norm": 1.140250563621521,
28
+ "learning_rate": 9.959233591520588e-06,
29
+ "loss": 2.2018,
30
+ "step": 300
31
+ },
32
+ {
33
+ "epoch": 0.03,
34
+ "grad_norm": 1.5957276821136475,
35
+ "learning_rate": 9.945644788694116e-06,
36
+ "loss": 2.1426,
37
+ "step": 400
38
+ },
39
+ {
40
+ "epoch": 0.03,
41
+ "grad_norm": 1.0516589879989624,
42
+ "learning_rate": 9.932055985867646e-06,
43
+ "loss": 2.0379,
44
+ "step": 500
45
+ },
46
+ {
47
+ "epoch": 0.03,
48
+ "eval_codebleu": 0.02236215080832533,
49
+ "eval_dataflow_match_score": 0.020328127538115238,
50
+ "eval_loss": 2.005807876586914,
51
+ "eval_ngram_match_score": 7.828601095920155e-05,
52
+ "eval_runtime": 211.5866,
53
+ "eval_samples_per_second": 12.019,
54
+ "eval_steps_per_second": 1.503,
55
+ "eval_syntax_match_score": 0.06800422172156935,
56
+ "eval_weighted_ngram_match_score": 0.0010379679626575443,
57
+ "step": 500
58
+ },
59
+ {
60
+ "epoch": 0.04,
61
+ "grad_norm": 1.4408249855041504,
62
+ "learning_rate": 9.918467183041175e-06,
63
+ "loss": 2.1005,
64
+ "step": 600
65
+ },
66
+ {
67
+ "epoch": 0.05,
68
+ "grad_norm": 1.042541742324829,
69
+ "learning_rate": 9.904878380214705e-06,
70
+ "loss": 2.0475,
71
+ "step": 700
72
+ },
73
+ {
74
+ "epoch": 0.05,
75
+ "grad_norm": 0.9206609129905701,
76
+ "learning_rate": 9.891289577388233e-06,
77
+ "loss": 2.0145,
78
+ "step": 800
79
+ },
80
+ {
81
+ "epoch": 0.06,
82
+ "grad_norm": 1.2876330614089966,
83
+ "learning_rate": 9.87770077456176e-06,
84
+ "loss": 1.9729,
85
+ "step": 900
86
+ },
87
+ {
88
+ "epoch": 0.07,
89
+ "grad_norm": 1.5496817827224731,
90
+ "learning_rate": 9.86411197173529e-06,
91
+ "loss": 1.9755,
92
+ "step": 1000
93
+ },
94
+ {
95
+ "epoch": 0.07,
96
+ "eval_codebleu": 0.026625421991617458,
97
+ "eval_dataflow_match_score": 0.017334601907502377,
98
+ "eval_loss": 1.8824141025543213,
99
+ "eval_ngram_match_score": 0.0001360979918249884,
100
+ "eval_runtime": 210.1093,
101
+ "eval_samples_per_second": 12.103,
102
+ "eval_steps_per_second": 1.513,
103
+ "eval_syntax_match_score": 0.085743571008139,
104
+ "eval_weighted_ngram_match_score": 0.0032874170590034683,
105
+ "step": 1000
106
+ },
107
+ {
108
+ "epoch": 0.07,
109
+ "grad_norm": 1.479344129562378,
110
+ "learning_rate": 9.85052316890882e-06,
111
+ "loss": 1.9526,
112
+ "step": 1100
113
+ },
114
+ {
115
+ "epoch": 0.08,
116
+ "grad_norm": 1.1165575981140137,
117
+ "learning_rate": 9.83693436608235e-06,
118
+ "loss": 1.9212,
119
+ "step": 1200
120
+ },
121
+ {
122
+ "epoch": 0.09,
123
+ "grad_norm": 1.871772289276123,
124
+ "learning_rate": 9.823345563255877e-06,
125
+ "loss": 1.8674,
126
+ "step": 1300
127
+ },
128
+ {
129
+ "epoch": 0.1,
130
+ "grad_norm": 1.1310107707977295,
131
+ "learning_rate": 9.809756760429407e-06,
132
+ "loss": 1.9083,
133
+ "step": 1400
134
+ },
135
+ {
136
+ "epoch": 0.1,
137
+ "grad_norm": 1.2779932022094727,
138
+ "learning_rate": 9.796167957602935e-06,
139
+ "loss": 1.8672,
140
+ "step": 1500
141
+ },
142
+ {
143
+ "epoch": 0.1,
144
+ "eval_codebleu": 0.03411263800674984,
145
+ "eval_dataflow_match_score": 0.035922307567354324,
146
+ "eval_loss": 1.8209562301635742,
147
+ "eval_ngram_match_score": 0.00019790870128407514,
148
+ "eval_runtime": 209.7736,
149
+ "eval_samples_per_second": 12.123,
150
+ "eval_steps_per_second": 1.516,
151
+ "eval_syntax_match_score": 0.09630802330065559,
152
+ "eval_weighted_ngram_match_score": 0.004022312457705368,
153
+ "step": 1500
154
+ },
155
+ {
156
+ "epoch": 0.11,
157
+ "grad_norm": 1.2445776462554932,
158
+ "learning_rate": 9.782579154776465e-06,
159
+ "loss": 1.8163,
160
+ "step": 1600
161
+ },
162
+ {
163
+ "epoch": 0.12,
164
+ "grad_norm": 1.673412561416626,
165
+ "learning_rate": 9.768990351949994e-06,
166
+ "loss": 1.8664,
167
+ "step": 1700
168
+ },
169
+ {
170
+ "epoch": 0.12,
171
+ "grad_norm": 1.3831863403320312,
172
+ "learning_rate": 9.755401549123524e-06,
173
+ "loss": 1.8336,
174
+ "step": 1800
175
+ },
176
+ {
177
+ "epoch": 0.13,
178
+ "grad_norm": 1.5055395364761353,
179
+ "learning_rate": 9.741812746297052e-06,
180
+ "loss": 1.879,
181
+ "step": 1900
182
+ },
183
+ {
184
+ "epoch": 0.14,
185
+ "grad_norm": 1.3736721277236938,
186
+ "learning_rate": 9.72822394347058e-06,
187
+ "loss": 1.8522,
188
+ "step": 2000
189
+ },
190
+ {
191
+ "epoch": 0.14,
192
+ "eval_codebleu": 0.04137599574546961,
193
+ "eval_dataflow_match_score": 0.06186619636599912,
194
+ "eval_loss": 1.7707544565200806,
195
+ "eval_ngram_match_score": 0.00026554637185762897,
196
+ "eval_runtime": 210.486,
197
+ "eval_samples_per_second": 12.082,
198
+ "eval_steps_per_second": 1.511,
199
+ "eval_syntax_match_score": 0.09913941829547991,
200
+ "eval_weighted_ngram_match_score": 0.00423282194854177,
201
+ "step": 2000
202
+ },
203
+ {
204
+ "epoch": 0.14,
205
+ "grad_norm": 1.5695489645004272,
206
+ "learning_rate": 9.71463514064411e-06,
207
+ "loss": 1.7906,
208
+ "step": 2100
209
+ },
210
+ {
211
+ "epoch": 0.15,
212
+ "grad_norm": 1.4151173830032349,
213
+ "learning_rate": 9.701046337817639e-06,
214
+ "loss": 1.8182,
215
+ "step": 2200
216
+ },
217
+ {
218
+ "epoch": 0.16,
219
+ "grad_norm": 1.7115099430084229,
220
+ "learning_rate": 9.687457534991169e-06,
221
+ "loss": 1.8232,
222
+ "step": 2300
223
+ },
224
+ {
225
+ "epoch": 0.16,
226
+ "grad_norm": 1.1658021211624146,
227
+ "learning_rate": 9.673868732164697e-06,
228
+ "loss": 1.7381,
229
+ "step": 2400
230
+ },
231
+ {
232
+ "epoch": 0.17,
233
+ "grad_norm": 1.5483379364013672,
234
+ "learning_rate": 9.660279929338226e-06,
235
+ "loss": 1.7707,
236
+ "step": 2500
237
+ },
238
+ {
239
+ "epoch": 0.17,
240
+ "eval_codebleu": 0.041373696094825684,
241
+ "eval_dataflow_match_score": 0.06151811199034646,
242
+ "eval_loss": 1.7324126958847046,
243
+ "eval_ngram_match_score": 0.00031767245966084565,
244
+ "eval_runtime": 210.1473,
245
+ "eval_samples_per_second": 12.101,
246
+ "eval_steps_per_second": 1.513,
247
+ "eval_syntax_match_score": 0.09862185146846902,
248
+ "eval_weighted_ngram_match_score": 0.0050371484608264,
249
+ "step": 2500
250
+ },
251
+ {
252
+ "epoch": 0.18,
253
+ "grad_norm": 1.138310194015503,
254
+ "learning_rate": 9.646691126511754e-06,
255
+ "loss": 1.702,
256
+ "step": 2600
257
+ },
258
+ {
259
+ "epoch": 0.18,
260
+ "grad_norm": 1.5409811735153198,
261
+ "learning_rate": 9.633102323685284e-06,
262
+ "loss": 1.7403,
263
+ "step": 2700
264
+ },
265
+ {
266
+ "epoch": 0.19,
267
+ "grad_norm": 1.2863227128982544,
268
+ "learning_rate": 9.619513520858813e-06,
269
+ "loss": 1.7514,
270
+ "step": 2800
271
+ },
272
+ {
273
+ "epoch": 0.2,
274
+ "grad_norm": 1.523219347000122,
275
+ "learning_rate": 9.605924718032343e-06,
276
+ "loss": 1.7735,
277
+ "step": 2900
278
+ },
279
+ {
280
+ "epoch": 0.2,
281
+ "grad_norm": 1.6315879821777344,
282
+ "learning_rate": 9.592335915205871e-06,
283
+ "loss": 1.7289,
284
+ "step": 3000
285
+ },
286
+ {
287
+ "epoch": 0.2,
288
+ "eval_codebleu": 0.03736357788677419,
289
+ "eval_dataflow_match_score": 0.04515814633467152,
290
+ "eval_loss": 1.6959866285324097,
291
+ "eval_ngram_match_score": 0.00026766267346779954,
292
+ "eval_runtime": 209.9032,
293
+ "eval_samples_per_second": 12.115,
294
+ "eval_steps_per_second": 1.515,
295
+ "eval_syntax_match_score": 0.09879437374413931,
296
+ "eval_weighted_ngram_match_score": 0.0052341287948181195,
297
+ "step": 3000
298
+ },
299
+ {
300
+ "epoch": 0.21,
301
+ "grad_norm": 1.5990880727767944,
302
+ "learning_rate": 9.5787471123794e-06,
303
+ "loss": 1.7275,
304
+ "step": 3100
305
+ },
306
+ {
307
+ "epoch": 0.22,
308
+ "grad_norm": 1.787031650543213,
309
+ "learning_rate": 9.565158309552928e-06,
310
+ "loss": 1.7112,
311
+ "step": 3200
312
+ },
313
+ {
314
+ "epoch": 0.22,
315
+ "grad_norm": 1.6794238090515137,
316
+ "learning_rate": 9.551569506726458e-06,
317
+ "loss": 1.683,
318
+ "step": 3300
319
+ },
320
+ {
321
+ "epoch": 0.23,
322
+ "grad_norm": 2.1808717250823975,
323
+ "learning_rate": 9.537980703899988e-06,
324
+ "loss": 1.8087,
325
+ "step": 3400
326
+ },
327
+ {
328
+ "epoch": 0.24,
329
+ "grad_norm": 1.737121343612671,
330
+ "learning_rate": 9.524391901073516e-06,
331
+ "loss": 1.7739,
332
+ "step": 3500
333
+ },
334
+ {
335
+ "epoch": 0.24,
336
+ "eval_codebleu": 0.03697189900391692,
337
+ "eval_dataflow_match_score": 0.03780196319587868,
338
+ "eval_loss": 1.668063998222351,
339
+ "eval_ngram_match_score": 0.0003274646611745997,
340
+ "eval_runtime": 208.872,
341
+ "eval_samples_per_second": 12.175,
342
+ "eval_steps_per_second": 1.522,
343
+ "eval_syntax_match_score": 0.1035438105096511,
344
+ "eval_weighted_ngram_match_score": 0.006214357648963285,
345
+ "step": 3500
346
+ },
347
+ {
348
+ "epoch": 0.24,
349
+ "grad_norm": 1.6818939447402954,
350
+ "learning_rate": 9.510803098247045e-06,
351
+ "loss": 1.7366,
352
+ "step": 3600
353
+ },
354
+ {
355
+ "epoch": 0.25,
356
+ "grad_norm": 1.4390071630477905,
357
+ "learning_rate": 9.497214295420573e-06,
358
+ "loss": 1.6693,
359
+ "step": 3700
360
+ },
361
+ {
362
+ "epoch": 0.26,
363
+ "grad_norm": 1.443108081817627,
364
+ "learning_rate": 9.483625492594103e-06,
365
+ "loss": 1.6856,
366
+ "step": 3800
367
+ },
368
+ {
369
+ "epoch": 0.26,
370
+ "grad_norm": 1.6540530920028687,
371
+ "learning_rate": 9.470036689767632e-06,
372
+ "loss": 1.6965,
373
+ "step": 3900
374
+ },
375
+ {
376
+ "epoch": 0.27,
377
+ "grad_norm": 1.178989291191101,
378
+ "learning_rate": 9.456447886941162e-06,
379
+ "loss": 1.6797,
380
+ "step": 4000
381
+ },
382
+ {
383
+ "epoch": 0.27,
384
+ "eval_codebleu": 0.033575238882649495,
385
+ "eval_dataflow_match_score": 0.025967094423688303,
386
+ "eval_loss": 1.6396487951278687,
387
+ "eval_ngram_match_score": 0.00023558729997467283,
388
+ "eval_runtime": 209.0765,
389
+ "eval_samples_per_second": 12.163,
390
+ "eval_steps_per_second": 1.521,
391
+ "eval_syntax_match_score": 0.1018591812295764,
392
+ "eval_weighted_ngram_match_score": 0.006239092577358598,
393
+ "step": 4000
394
+ },
395
+ {
396
+ "epoch": 0.28,
397
+ "grad_norm": 1.888967752456665,
398
+ "learning_rate": 9.44285908411469e-06,
399
+ "loss": 1.6815,
400
+ "step": 4100
401
+ },
402
+ {
403
+ "epoch": 0.29,
404
+ "grad_norm": 1.4612641334533691,
405
+ "learning_rate": 9.42927028128822e-06,
406
+ "loss": 1.6089,
407
+ "step": 4200
408
+ },
409
+ {
410
+ "epoch": 0.29,
411
+ "grad_norm": 1.5609626770019531,
412
+ "learning_rate": 9.415681478461748e-06,
413
+ "loss": 1.6824,
414
+ "step": 4300
415
+ },
416
+ {
417
+ "epoch": 0.3,
418
+ "grad_norm": 1.38056218624115,
419
+ "learning_rate": 9.402092675635277e-06,
420
+ "loss": 1.679,
421
+ "step": 4400
422
+ },
423
+ {
424
+ "epoch": 0.31,
425
+ "grad_norm": 1.928477168083191,
426
+ "learning_rate": 9.388503872808807e-06,
427
+ "loss": 1.6737,
428
+ "step": 4500
429
+ },
430
+ {
431
+ "epoch": 0.31,
432
+ "eval_codebleu": 0.03436485059379589,
433
+ "eval_dataflow_match_score": 0.025433365047687558,
434
+ "eval_loss": 1.619603157043457,
435
+ "eval_ngram_match_score": 0.00025042578946432695,
436
+ "eval_runtime": 210.4293,
437
+ "eval_samples_per_second": 12.085,
438
+ "eval_steps_per_second": 1.511,
439
+ "eval_syntax_match_score": 0.10487324686922811,
440
+ "eval_weighted_ngram_match_score": 0.006902364668803559,
441
+ "step": 4500
442
+ },
443
+ {
444
+ "epoch": 0.31,
445
+ "grad_norm": 1.467225432395935,
446
+ "learning_rate": 9.374915069982335e-06,
447
+ "loss": 1.642,
448
+ "step": 4600
449
+ },
450
+ {
451
+ "epoch": 0.32,
452
+ "grad_norm": 1.4231284856796265,
453
+ "learning_rate": 9.361326267155864e-06,
454
+ "loss": 1.6238,
455
+ "step": 4700
456
+ },
457
+ {
458
+ "epoch": 0.33,
459
+ "grad_norm": 1.61458158493042,
460
+ "learning_rate": 9.347737464329394e-06,
461
+ "loss": 1.6226,
462
+ "step": 4800
463
+ },
464
+ {
465
+ "epoch": 0.33,
466
+ "grad_norm": 1.8465476036071777,
467
+ "learning_rate": 9.334148661502922e-06,
468
+ "loss": 1.6427,
469
+ "step": 4900
470
+ },
471
+ {
472
+ "epoch": 0.34,
473
+ "grad_norm": 2.1492788791656494,
474
+ "learning_rate": 9.320559858676452e-06,
475
+ "loss": 1.6783,
476
+ "step": 5000
477
+ },
478
+ {
479
+ "epoch": 0.34,
480
+ "eval_codebleu": 0.03197730480981573,
481
+ "eval_dataflow_match_score": 0.018842967535330563,
482
+ "eval_loss": 1.596398115158081,
483
+ "eval_ngram_match_score": 0.00021050334214284166,
484
+ "eval_runtime": 209.9696,
485
+ "eval_samples_per_second": 12.111,
486
+ "eval_steps_per_second": 1.515,
487
+ "eval_syntax_match_score": 0.10182873612210518,
488
+ "eval_weighted_ngram_match_score": 0.0070270122396843505,
489
+ "step": 5000
490
+ },
491
+ {
492
+ "epoch": 0.35,
493
+ "grad_norm": 1.5661336183547974,
494
+ "learning_rate": 9.306971055849981e-06,
495
+ "loss": 1.6227,
496
+ "step": 5100
497
+ },
498
+ {
499
+ "epoch": 0.35,
500
+ "grad_norm": 1.5441151857376099,
501
+ "learning_rate": 9.293382253023509e-06,
502
+ "loss": 1.6525,
503
+ "step": 5200
504
+ },
505
+ {
506
+ "epoch": 0.36,
507
+ "grad_norm": 1.507628083229065,
508
+ "learning_rate": 9.279793450197039e-06,
509
+ "loss": 1.649,
510
+ "step": 5300
511
+ },
512
+ {
513
+ "epoch": 0.37,
514
+ "grad_norm": 1.5360280275344849,
515
+ "learning_rate": 9.266204647370567e-06,
516
+ "loss": 1.6738,
517
+ "step": 5400
518
+ },
519
+ {
520
+ "epoch": 0.37,
521
+ "grad_norm": 1.6375775337219238,
522
+ "learning_rate": 9.252615844544096e-06,
523
+ "loss": 1.5854,
524
+ "step": 5500
525
+ },
526
+ {
527
+ "epoch": 0.37,
528
+ "eval_loss": 1.5711854696273804,
529
+ "eval_runtime": 70.8487,
530
+ "eval_samples_per_second": 35.893,
531
+ "eval_steps_per_second": 4.488,
532
+ "step": 5500
533
+ },
534
+ {
535
+ "epoch": 0.38,
536
+ "grad_norm": 1.5210243463516235,
537
+ "learning_rate": 9.239027041717626e-06,
538
+ "loss": 1.621,
539
+ "step": 5600
540
+ },
541
+ {
542
+ "epoch": 0.39,
543
+ "grad_norm": 1.2972959280014038,
544
+ "learning_rate": 9.225438238891154e-06,
545
+ "loss": 1.6124,
546
+ "step": 5700
547
+ },
548
+ {
549
+ "epoch": 0.39,
550
+ "grad_norm": 1.902424693107605,
551
+ "learning_rate": 9.211849436064683e-06,
552
+ "loss": 1.6012,
553
+ "step": 5800
554
+ },
555
+ {
556
+ "epoch": 0.4,
557
+ "grad_norm": 1.865466594696045,
558
+ "learning_rate": 9.198260633238213e-06,
559
+ "loss": 1.5745,
560
+ "step": 5900
561
+ },
562
+ {
563
+ "epoch": 0.41,
564
+ "grad_norm": 1.6546335220336914,
565
+ "learning_rate": 9.184671830411741e-06,
566
+ "loss": 1.5874,
567
+ "step": 6000
568
+ },
569
+ {
570
+ "epoch": 0.41,
571
+ "eval_loss": 1.553696870803833,
572
+ "eval_runtime": 70.799,
573
+ "eval_samples_per_second": 35.919,
574
+ "eval_steps_per_second": 4.492,
575
+ "step": 6000
576
+ },
577
+ {
578
+ "epoch": 0.41,
579
+ "grad_norm": 1.7478700876235962,
580
+ "learning_rate": 9.17108302758527e-06,
581
+ "loss": 1.5666,
582
+ "step": 6100
583
+ },
584
+ {
585
+ "epoch": 0.42,
586
+ "grad_norm": 1.5560758113861084,
587
+ "learning_rate": 9.1574942247588e-06,
588
+ "loss": 1.5382,
589
+ "step": 6200
590
+ },
591
+ {
592
+ "epoch": 0.43,
593
+ "grad_norm": 1.4764758348464966,
594
+ "learning_rate": 9.143905421932328e-06,
595
+ "loss": 1.5408,
596
+ "step": 6300
597
+ },
598
+ {
599
+ "epoch": 0.43,
600
+ "grad_norm": 1.6634376049041748,
601
+ "learning_rate": 9.130316619105858e-06,
602
+ "loss": 1.5688,
603
+ "step": 6400
604
+ },
605
+ {
606
+ "epoch": 0.44,
607
+ "grad_norm": 1.673550009727478,
608
+ "learning_rate": 9.116727816279387e-06,
609
+ "loss": 1.5445,
610
+ "step": 6500
611
+ },
612
+ {
613
+ "epoch": 0.44,
614
+ "eval_loss": 1.5321872234344482,
615
+ "eval_runtime": 70.8079,
616
+ "eval_samples_per_second": 35.914,
617
+ "eval_steps_per_second": 4.491,
618
+ "step": 6500
619
+ },
620
+ {
621
+ "epoch": 0.45,
622
+ "grad_norm": 1.572766900062561,
623
+ "learning_rate": 9.103139013452915e-06,
624
+ "loss": 1.5596,
625
+ "step": 6600
626
+ },
627
+ {
628
+ "epoch": 0.46,
629
+ "grad_norm": 1.4650744199752808,
630
+ "learning_rate": 9.089550210626445e-06,
631
+ "loss": 1.5327,
632
+ "step": 6700
633
+ },
634
+ {
635
+ "epoch": 0.46,
636
+ "grad_norm": 1.8133198022842407,
637
+ "learning_rate": 9.075961407799973e-06,
638
+ "loss": 1.5789,
639
+ "step": 6800
640
+ },
641
+ {
642
+ "epoch": 0.47,
643
+ "grad_norm": 1.6164945363998413,
644
+ "learning_rate": 9.062372604973503e-06,
645
+ "loss": 1.4918,
646
+ "step": 6900
647
+ },
648
+ {
649
+ "epoch": 0.48,
650
+ "grad_norm": 1.7517341375350952,
651
+ "learning_rate": 9.048783802147032e-06,
652
+ "loss": 1.4947,
653
+ "step": 7000
654
+ },
655
+ {
656
+ "epoch": 0.48,
657
+ "eval_loss": 1.5142408609390259,
658
+ "eval_runtime": 70.7829,
659
+ "eval_samples_per_second": 35.927,
660
+ "eval_steps_per_second": 4.493,
661
+ "step": 7000
662
+ },
663
+ {
664
+ "epoch": 0.48,
665
+ "grad_norm": 2.3255422115325928,
666
+ "learning_rate": 9.03519499932056e-06,
667
+ "loss": 1.5434,
668
+ "step": 7100
669
+ },
670
+ {
671
+ "epoch": 0.49,
672
+ "grad_norm": 2.118253469467163,
673
+ "learning_rate": 9.02160619649409e-06,
674
+ "loss": 1.5191,
675
+ "step": 7200
676
+ },
677
+ {
678
+ "epoch": 0.5,
679
+ "grad_norm": 1.5948559045791626,
680
+ "learning_rate": 9.008017393667618e-06,
681
+ "loss": 1.5437,
682
+ "step": 7300
683
+ },
684
+ {
685
+ "epoch": 0.5,
686
+ "grad_norm": 1.4735013246536255,
687
+ "learning_rate": 8.994428590841147e-06,
688
+ "loss": 1.5217,
689
+ "step": 7400
690
+ },
691
+ {
692
+ "epoch": 0.51,
693
+ "grad_norm": 1.5940884351730347,
694
+ "learning_rate": 8.980839788014677e-06,
695
+ "loss": 1.5113,
696
+ "step": 7500
697
+ },
698
+ {
699
+ "epoch": 0.51,
700
+ "eval_loss": 1.4991660118103027,
701
+ "eval_runtime": 70.7857,
702
+ "eval_samples_per_second": 35.925,
703
+ "eval_steps_per_second": 4.492,
704
+ "step": 7500
705
+ },
706
+ {
707
+ "epoch": 0.52,
708
+ "grad_norm": 1.7738714218139648,
709
+ "learning_rate": 8.967250985188207e-06,
710
+ "loss": 1.5253,
711
+ "step": 7600
712
+ },
713
+ {
714
+ "epoch": 0.52,
715
+ "grad_norm": 1.8445935249328613,
716
+ "learning_rate": 8.953662182361734e-06,
717
+ "loss": 1.5359,
718
+ "step": 7700
719
+ },
720
+ {
721
+ "epoch": 0.53,
722
+ "grad_norm": 1.5108485221862793,
723
+ "learning_rate": 8.940073379535264e-06,
724
+ "loss": 1.5228,
725
+ "step": 7800
726
+ },
727
+ {
728
+ "epoch": 0.54,
729
+ "grad_norm": 1.7147557735443115,
730
+ "learning_rate": 8.926484576708792e-06,
731
+ "loss": 1.5228,
732
+ "step": 7900
733
+ },
734
+ {
735
+ "epoch": 0.54,
736
+ "grad_norm": 1.5899161100387573,
737
+ "learning_rate": 8.912895773882322e-06,
738
+ "loss": 1.4889,
739
+ "step": 8000
740
+ },
741
+ {
742
+ "epoch": 0.54,
743
+ "eval_loss": 1.4857795238494873,
744
+ "eval_runtime": 70.8945,
745
+ "eval_samples_per_second": 35.87,
746
+ "eval_steps_per_second": 4.486,
747
+ "step": 8000
748
+ },
749
+ {
750
+ "epoch": 0.55,
751
+ "grad_norm": 1.6886754035949707,
752
+ "learning_rate": 8.899306971055851e-06,
753
+ "loss": 1.4898,
754
+ "step": 8100
755
+ },
756
+ {
757
+ "epoch": 0.56,
758
+ "grad_norm": 1.5048600435256958,
759
+ "learning_rate": 8.885718168229381e-06,
760
+ "loss": 1.5239,
761
+ "step": 8200
762
+ },
763
+ {
764
+ "epoch": 0.56,
765
+ "grad_norm": 1.8773216009140015,
766
+ "learning_rate": 8.872129365402909e-06,
767
+ "loss": 1.4889,
768
+ "step": 8300
769
+ },
770
+ {
771
+ "epoch": 0.57,
772
+ "grad_norm": 1.7063783407211304,
773
+ "learning_rate": 8.858540562576437e-06,
774
+ "loss": 1.4472,
775
+ "step": 8400
776
+ },
777
+ {
778
+ "epoch": 0.58,
779
+ "grad_norm": 1.5942984819412231,
780
+ "learning_rate": 8.844951759749966e-06,
781
+ "loss": 1.4998,
782
+ "step": 8500
783
+ },
784
+ {
785
+ "epoch": 0.58,
786
+ "eval_loss": 1.4711333513259888,
787
+ "eval_runtime": 70.9176,
788
+ "eval_samples_per_second": 35.858,
789
+ "eval_steps_per_second": 4.484,
790
+ "step": 8500
791
+ },
792
+ {
793
+ "epoch": 0.58,
794
+ "grad_norm": 1.818272352218628,
795
+ "learning_rate": 8.831362956923496e-06,
796
+ "loss": 1.5504,
797
+ "step": 8600
798
+ },
799
+ {
800
+ "epoch": 0.59,
801
+ "grad_norm": 1.8230923414230347,
802
+ "learning_rate": 8.817774154097026e-06,
803
+ "loss": 1.5182,
804
+ "step": 8700
805
+ },
806
+ {
807
+ "epoch": 0.6,
808
+ "grad_norm": 1.8054814338684082,
809
+ "learning_rate": 8.804185351270554e-06,
810
+ "loss": 1.5136,
811
+ "step": 8800
812
+ },
813
+ {
814
+ "epoch": 0.6,
815
+ "grad_norm": 1.7768468856811523,
816
+ "learning_rate": 8.790596548444083e-06,
817
+ "loss": 1.4643,
818
+ "step": 8900
819
+ },
820
+ {
821
+ "epoch": 0.61,
822
+ "grad_norm": 1.9298087358474731,
823
+ "learning_rate": 8.777007745617611e-06,
824
+ "loss": 1.4449,
825
+ "step": 9000
826
+ },
827
+ {
828
+ "epoch": 0.61,
829
+ "eval_loss": 1.4553042650222778,
830
+ "eval_runtime": 71.0741,
831
+ "eval_samples_per_second": 35.78,
832
+ "eval_steps_per_second": 4.474,
833
+ "step": 9000
834
+ },
835
+ {
836
+ "epoch": 0.62,
837
+ "grad_norm": 1.979022741317749,
838
+ "learning_rate": 8.76341894279114e-06,
839
+ "loss": 1.5057,
840
+ "step": 9100
841
+ },
842
+ {
843
+ "epoch": 0.63,
844
+ "grad_norm": 1.6144100427627563,
845
+ "learning_rate": 8.74983013996467e-06,
846
+ "loss": 1.4893,
847
+ "step": 9200
848
+ },
849
+ {
850
+ "epoch": 0.63,
851
+ "grad_norm": 1.7385257482528687,
852
+ "learning_rate": 8.7362413371382e-06,
853
+ "loss": 1.4427,
854
+ "step": 9300
855
+ },
856
+ {
857
+ "epoch": 0.64,
858
+ "grad_norm": 2.218280792236328,
859
+ "learning_rate": 8.722652534311728e-06,
860
+ "loss": 1.445,
861
+ "step": 9400
862
+ },
863
+ {
864
+ "epoch": 0.65,
865
+ "grad_norm": 1.760903000831604,
866
+ "learning_rate": 8.709063731485256e-06,
867
+ "loss": 1.4364,
868
+ "step": 9500
869
+ },
870
+ {
871
+ "epoch": 0.65,
872
+ "eval_loss": 1.4402815103530884,
873
+ "eval_runtime": 70.7715,
874
+ "eval_samples_per_second": 35.933,
875
+ "eval_steps_per_second": 4.493,
876
+ "step": 9500
877
+ },
878
+ {
879
+ "epoch": 0.65,
880
+ "grad_norm": 1.4513877630233765,
881
+ "learning_rate": 8.695474928658785e-06,
882
+ "loss": 1.437,
883
+ "step": 9600
884
+ },
885
+ {
886
+ "epoch": 0.66,
887
+ "grad_norm": 1.703837513923645,
888
+ "learning_rate": 8.681886125832315e-06,
889
+ "loss": 1.4565,
890
+ "step": 9700
891
+ },
892
+ {
893
+ "epoch": 0.67,
894
+ "grad_norm": 2.192049980163574,
895
+ "learning_rate": 8.668297323005845e-06,
896
+ "loss": 1.4654,
897
+ "step": 9800
898
+ },
899
+ {
900
+ "epoch": 0.67,
901
+ "grad_norm": 2.012014150619507,
902
+ "learning_rate": 8.654708520179373e-06,
903
+ "loss": 1.4975,
904
+ "step": 9900
905
+ },
906
+ {
907
+ "epoch": 0.68,
908
+ "grad_norm": 2.117527484893799,
909
+ "learning_rate": 8.641119717352902e-06,
910
+ "loss": 1.4446,
911
+ "step": 10000
912
+ },
913
+ {
914
+ "epoch": 0.68,
915
+ "eval_loss": 1.4303960800170898,
916
+ "eval_runtime": 70.9752,
917
+ "eval_samples_per_second": 35.829,
918
+ "eval_steps_per_second": 4.48,
919
+ "step": 10000
920
+ },
921
+ {
922
+ "epoch": 0.69,
923
+ "grad_norm": 2.3108479976654053,
924
+ "learning_rate": 8.62753091452643e-06,
925
+ "loss": 1.4801,
926
+ "step": 10100
927
+ },
928
+ {
929
+ "epoch": 0.69,
930
+ "grad_norm": 1.4589275121688843,
931
+ "learning_rate": 8.61394211169996e-06,
932
+ "loss": 1.4579,
933
+ "step": 10200
934
+ },
935
+ {
936
+ "epoch": 0.7,
937
+ "grad_norm": 1.7688006162643433,
938
+ "learning_rate": 8.60035330887349e-06,
939
+ "loss": 1.471,
940
+ "step": 10300
941
+ },
942
+ {
943
+ "epoch": 0.71,
944
+ "grad_norm": 1.6766855716705322,
945
+ "learning_rate": 8.586764506047019e-06,
946
+ "loss": 1.4532,
947
+ "step": 10400
948
+ },
949
+ {
950
+ "epoch": 0.71,
951
+ "grad_norm": 2.0386102199554443,
952
+ "learning_rate": 8.573175703220547e-06,
953
+ "loss": 1.3998,
954
+ "step": 10500
955
+ },
956
+ {
957
+ "epoch": 0.71,
958
+ "eval_loss": 1.4204617738723755,
959
+ "eval_runtime": 70.7476,
960
+ "eval_samples_per_second": 35.945,
961
+ "eval_steps_per_second": 4.495,
962
+ "step": 10500
963
+ },
964
+ {
965
+ "epoch": 0.72,
966
+ "grad_norm": 1.9797570705413818,
967
+ "learning_rate": 8.559586900394075e-06,
968
+ "loss": 1.3922,
969
+ "step": 10600
970
+ },
971
+ {
972
+ "epoch": 0.73,
973
+ "grad_norm": 1.7562373876571655,
974
+ "learning_rate": 8.545998097567605e-06,
975
+ "loss": 1.4378,
976
+ "step": 10700
977
+ },
978
+ {
979
+ "epoch": 0.73,
980
+ "grad_norm": 1.6127831935882568,
981
+ "learning_rate": 8.532409294741134e-06,
982
+ "loss": 1.4483,
983
+ "step": 10800
984
+ },
985
+ {
986
+ "epoch": 0.74,
987
+ "grad_norm": 1.6120541095733643,
988
+ "learning_rate": 8.518820491914664e-06,
989
+ "loss": 1.3961,
990
+ "step": 10900
991
+ },
992
+ {
993
+ "epoch": 0.75,
994
+ "grad_norm": 1.5521306991577148,
995
+ "learning_rate": 8.505231689088192e-06,
996
+ "loss": 1.4101,
997
+ "step": 11000
998
+ },
999
+ {
1000
+ "epoch": 0.75,
1001
+ "eval_loss": 1.4052175283432007,
1002
+ "eval_runtime": 70.9316,
1003
+ "eval_samples_per_second": 35.851,
1004
+ "eval_steps_per_second": 4.483,
1005
+ "step": 11000
1006
+ },
1007
+ {
1008
+ "epoch": 0.75,
1009
+ "grad_norm": 2.4100379943847656,
1010
+ "learning_rate": 8.491642886261721e-06,
1011
+ "loss": 1.4224,
1012
+ "step": 11100
1013
+ },
1014
+ {
1015
+ "epoch": 0.76,
1016
+ "grad_norm": 1.7542225122451782,
1017
+ "learning_rate": 8.47805408343525e-06,
1018
+ "loss": 1.4238,
1019
+ "step": 11200
1020
+ },
1021
+ {
1022
+ "epoch": 0.77,
1023
+ "grad_norm": 2.3809213638305664,
1024
+ "learning_rate": 8.464465280608779e-06,
1025
+ "loss": 1.3968,
1026
+ "step": 11300
1027
+ },
1028
+ {
1029
+ "epoch": 0.77,
1030
+ "grad_norm": 1.490343451499939,
1031
+ "learning_rate": 8.450876477782309e-06,
1032
+ "loss": 1.4512,
1033
+ "step": 11400
1034
+ },
1035
+ {
1036
+ "epoch": 0.78,
1037
+ "grad_norm": 1.663609504699707,
1038
+ "learning_rate": 8.437287674955838e-06,
1039
+ "loss": 1.4772,
1040
+ "step": 11500
1041
+ },
1042
+ {
1043
+ "epoch": 0.78,
1044
+ "eval_loss": 1.3936774730682373,
1045
+ "eval_runtime": 70.914,
1046
+ "eval_samples_per_second": 35.86,
1047
+ "eval_steps_per_second": 4.484,
1048
+ "step": 11500
1049
+ },
1050
+ {
1051
+ "epoch": 0.79,
1052
+ "grad_norm": 1.5684208869934082,
1053
+ "learning_rate": 8.423698872129366e-06,
1054
+ "loss": 1.4276,
1055
+ "step": 11600
1056
+ },
1057
+ {
1058
+ "epoch": 0.79,
1059
+ "grad_norm": 1.6131608486175537,
1060
+ "learning_rate": 8.410110069302894e-06,
1061
+ "loss": 1.4067,
1062
+ "step": 11700
1063
+ },
1064
+ {
1065
+ "epoch": 0.8,
1066
+ "grad_norm": 2.017564058303833,
1067
+ "learning_rate": 8.396521266476424e-06,
1068
+ "loss": 1.4028,
1069
+ "step": 11800
1070
+ },
1071
+ {
1072
+ "epoch": 0.81,
1073
+ "grad_norm": 2.383514165878296,
1074
+ "learning_rate": 8.382932463649953e-06,
1075
+ "loss": 1.4028,
1076
+ "step": 11900
1077
+ },
1078
+ {
1079
+ "epoch": 0.82,
1080
+ "grad_norm": 2.202026605606079,
1081
+ "learning_rate": 8.369343660823483e-06,
1082
+ "loss": 1.3671,
1083
+ "step": 12000
1084
+ },
1085
+ {
1086
+ "epoch": 0.82,
1087
+ "eval_loss": 1.3839792013168335,
1088
+ "eval_runtime": 70.7236,
1089
+ "eval_samples_per_second": 35.957,
1090
+ "eval_steps_per_second": 4.496,
1091
+ "step": 12000
1092
+ },
1093
+ {
1094
+ "epoch": 0.82,
1095
+ "grad_norm": 1.499125361442566,
1096
+ "learning_rate": 8.355754857997011e-06,
1097
+ "loss": 1.4193,
1098
+ "step": 12100
1099
+ },
1100
+ {
1101
+ "epoch": 0.83,
1102
+ "grad_norm": 1.3109521865844727,
1103
+ "learning_rate": 8.34216605517054e-06,
1104
+ "loss": 1.3969,
1105
+ "step": 12200
1106
+ },
1107
+ {
1108
+ "epoch": 0.84,
1109
+ "grad_norm": 2.689412832260132,
1110
+ "learning_rate": 8.328577252344068e-06,
1111
+ "loss": 1.4141,
1112
+ "step": 12300
1113
+ },
1114
+ {
1115
+ "epoch": 0.84,
1116
+ "grad_norm": 1.6615593433380127,
1117
+ "learning_rate": 8.314988449517598e-06,
1118
+ "loss": 1.3512,
1119
+ "step": 12400
1120
+ },
1121
+ {
1122
+ "epoch": 0.85,
1123
+ "grad_norm": 1.994040846824646,
1124
+ "learning_rate": 8.301399646691128e-06,
1125
+ "loss": 1.4268,
1126
+ "step": 12500
1127
+ },
1128
+ {
1129
+ "epoch": 0.85,
1130
+ "eval_loss": 1.3757482767105103,
1131
+ "eval_runtime": 70.9944,
1132
+ "eval_samples_per_second": 35.82,
1133
+ "eval_steps_per_second": 4.479,
1134
+ "step": 12500
1135
+ },
1136
+ {
1137
+ "epoch": 0.86,
1138
+ "grad_norm": 2.2422096729278564,
1139
+ "learning_rate": 8.287810843864657e-06,
1140
+ "loss": 1.3549,
1141
+ "step": 12600
1142
+ },
1143
+ {
1144
+ "epoch": 0.86,
1145
+ "grad_norm": 1.4407843351364136,
1146
+ "learning_rate": 8.274222041038185e-06,
1147
+ "loss": 1.3934,
1148
+ "step": 12700
1149
+ },
1150
+ {
1151
+ "epoch": 0.87,
1152
+ "grad_norm": 3.7289652824401855,
1153
+ "learning_rate": 8.260633238211713e-06,
1154
+ "loss": 1.3916,
1155
+ "step": 12800
1156
+ },
1157
+ {
1158
+ "epoch": 0.88,
1159
+ "grad_norm": 1.819023847579956,
1160
+ "learning_rate": 8.247044435385243e-06,
1161
+ "loss": 1.3878,
1162
+ "step": 12900
1163
+ },
1164
+ {
1165
+ "epoch": 0.88,
1166
+ "grad_norm": 1.6075499057769775,
1167
+ "learning_rate": 8.233455632558772e-06,
1168
+ "loss": 1.3469,
1169
+ "step": 13000
1170
+ },
1171
+ {
1172
+ "epoch": 0.88,
1173
+ "eval_loss": 1.365922212600708,
1174
+ "eval_runtime": 71.0057,
1175
+ "eval_samples_per_second": 35.814,
1176
+ "eval_steps_per_second": 4.479,
1177
+ "step": 13000
1178
+ },
1179
+ {
1180
+ "epoch": 0.89,
1181
+ "grad_norm": 1.9204126596450806,
1182
+ "learning_rate": 8.219866829732302e-06,
1183
+ "loss": 1.3456,
1184
+ "step": 13100
1185
+ },
1186
+ {
1187
+ "epoch": 0.9,
1188
+ "grad_norm": 2.0110292434692383,
1189
+ "learning_rate": 8.20627802690583e-06,
1190
+ "loss": 1.3921,
1191
+ "step": 13200
1192
+ },
1193
+ {
1194
+ "epoch": 0.9,
1195
+ "grad_norm": 1.4502629041671753,
1196
+ "learning_rate": 8.192689224079358e-06,
1197
+ "loss": 1.383,
1198
+ "step": 13300
1199
+ },
1200
+ {
1201
+ "epoch": 0.91,
1202
+ "grad_norm": 2.5011653900146484,
1203
+ "learning_rate": 8.179100421252888e-06,
1204
+ "loss": 1.3413,
1205
+ "step": 13400
1206
+ },
1207
+ {
1208
+ "epoch": 0.92,
1209
+ "grad_norm": 1.4338220357894897,
1210
+ "learning_rate": 8.165511618426417e-06,
1211
+ "loss": 1.3531,
1212
+ "step": 13500
1213
+ },
1214
+ {
1215
+ "epoch": 0.92,
1216
+ "eval_loss": 1.3567384481430054,
1217
+ "eval_runtime": 70.8084,
1218
+ "eval_samples_per_second": 35.914,
1219
+ "eval_steps_per_second": 4.491,
1220
+ "step": 13500
1221
+ },
1222
+ {
1223
+ "epoch": 0.92,
1224
+ "grad_norm": 1.8867733478546143,
1225
+ "learning_rate": 8.151922815599947e-06,
1226
+ "loss": 1.4115,
1227
+ "step": 13600
1228
+ },
1229
+ {
1230
+ "epoch": 0.93,
1231
+ "grad_norm": 1.897558331489563,
1232
+ "learning_rate": 8.138334012773476e-06,
1233
+ "loss": 1.3473,
1234
+ "step": 13700
1235
+ },
1236
+ {
1237
+ "epoch": 0.94,
1238
+ "grad_norm": 2.6677191257476807,
1239
+ "learning_rate": 8.124745209947004e-06,
1240
+ "loss": 1.3982,
1241
+ "step": 13800
1242
+ },
1243
+ {
1244
+ "epoch": 0.94,
1245
+ "grad_norm": 1.6690632104873657,
1246
+ "learning_rate": 8.111156407120532e-06,
1247
+ "loss": 1.3371,
1248
+ "step": 13900
1249
+ },
1250
+ {
1251
+ "epoch": 0.95,
1252
+ "grad_norm": 1.668286919593811,
1253
+ "learning_rate": 8.097567604294062e-06,
1254
+ "loss": 1.3463,
1255
+ "step": 14000
1256
+ },
1257
+ {
1258
+ "epoch": 0.95,
1259
+ "eval_loss": 1.3470206260681152,
1260
+ "eval_runtime": 70.8694,
1261
+ "eval_samples_per_second": 35.883,
1262
+ "eval_steps_per_second": 4.487,
1263
+ "step": 14000
1264
+ },
1265
+ {
1266
+ "epoch": 0.96,
1267
+ "grad_norm": 1.3303313255310059,
1268
+ "learning_rate": 8.083978801467592e-06,
1269
+ "loss": 1.3283,
1270
+ "step": 14100
1271
+ },
1272
+ {
1273
+ "epoch": 0.96,
1274
+ "grad_norm": 1.8314011096954346,
1275
+ "learning_rate": 8.070389998641121e-06,
1276
+ "loss": 1.3382,
1277
+ "step": 14200
1278
+ },
1279
+ {
1280
+ "epoch": 0.97,
1281
+ "grad_norm": 1.6911287307739258,
1282
+ "learning_rate": 8.056801195814649e-06,
1283
+ "loss": 1.3249,
1284
+ "step": 14300
1285
+ },
1286
+ {
1287
+ "epoch": 0.98,
1288
+ "grad_norm": 2.0255990028381348,
1289
+ "learning_rate": 8.043212392988179e-06,
1290
+ "loss": 1.4345,
1291
+ "step": 14400
1292
+ },
1293
+ {
1294
+ "epoch": 0.99,
1295
+ "grad_norm": 1.6872771978378296,
1296
+ "learning_rate": 8.029623590161707e-06,
1297
+ "loss": 1.3662,
1298
+ "step": 14500
1299
+ },
1300
+ {
1301
+ "epoch": 0.99,
1302
+ "eval_loss": 1.3394687175750732,
1303
+ "eval_runtime": 70.8165,
1304
+ "eval_samples_per_second": 35.91,
1305
+ "eval_steps_per_second": 4.49,
1306
+ "step": 14500
1307
+ },
1308
+ {
1309
+ "epoch": 0.99,
1310
+ "grad_norm": 1.2456538677215576,
1311
+ "learning_rate": 8.016034787335236e-06,
1312
+ "loss": 1.3152,
1313
+ "step": 14600
1314
+ },
1315
+ {
1316
+ "epoch": 1.0,
1317
+ "grad_norm": 1.9343585968017578,
1318
+ "learning_rate": 8.002445984508766e-06,
1319
+ "loss": 1.3179,
1320
+ "step": 14700
1321
+ },
1322
+ {
1323
+ "epoch": 1.01,
1324
+ "grad_norm": 1.6026442050933838,
1325
+ "learning_rate": 7.988857181682294e-06,
1326
+ "loss": 1.3445,
1327
+ "step": 14800
1328
+ },
1329
+ {
1330
+ "epoch": 1.01,
1331
+ "grad_norm": 1.8159044981002808,
1332
+ "learning_rate": 7.975268378855823e-06,
1333
+ "loss": 1.3259,
1334
+ "step": 14900
1335
+ },
1336
+ {
1337
+ "epoch": 1.02,
1338
+ "grad_norm": 1.6430504322052002,
1339
+ "learning_rate": 7.961679576029351e-06,
1340
+ "loss": 1.337,
1341
+ "step": 15000
1342
+ },
1343
+ {
1344
+ "epoch": 1.02,
1345
+ "eval_loss": 1.3323568105697632,
1346
+ "eval_runtime": 70.9018,
1347
+ "eval_samples_per_second": 35.867,
1348
+ "eval_steps_per_second": 4.485,
1349
+ "step": 15000
1350
+ },
1351
+ {
1352
+ "epoch": 1.03,
1353
+ "grad_norm": 2.036970853805542,
1354
+ "learning_rate": 7.948090773202881e-06,
1355
+ "loss": 1.3217,
1356
+ "step": 15100
1357
+ },
1358
+ {
1359
+ "epoch": 1.03,
1360
+ "grad_norm": 1.6756584644317627,
1361
+ "learning_rate": 7.93450197037641e-06,
1362
+ "loss": 1.3008,
1363
+ "step": 15200
1364
+ },
1365
+ {
1366
+ "epoch": 1.04,
1367
+ "grad_norm": 1.5923326015472412,
1368
+ "learning_rate": 7.92091316754994e-06,
1369
+ "loss": 1.3347,
1370
+ "step": 15300
1371
+ },
1372
+ {
1373
+ "epoch": 1.05,
1374
+ "grad_norm": 1.809383749961853,
1375
+ "learning_rate": 7.907460252751734e-06,
1376
+ "loss": 1.3192,
1377
+ "step": 15400
1378
+ },
1379
+ {
1380
+ "epoch": 1.05,
1381
+ "grad_norm": 2.035680055618286,
1382
+ "learning_rate": 7.893871449925262e-06,
1383
+ "loss": 1.3627,
1384
+ "step": 15500
1385
+ },
1386
+ {
1387
+ "epoch": 1.05,
1388
+ "eval_loss": 1.3219527006149292,
1389
+ "eval_runtime": 70.9632,
1390
+ "eval_samples_per_second": 35.835,
1391
+ "eval_steps_per_second": 4.481,
1392
+ "step": 15500
1393
+ },
1394
+ {
1395
+ "epoch": 1.06,
1396
+ "grad_norm": 1.7485100030899048,
1397
+ "learning_rate": 7.880282647098791e-06,
1398
+ "loss": 1.3015,
1399
+ "step": 15600
1400
+ },
1401
+ {
1402
+ "epoch": 1.07,
1403
+ "grad_norm": 2.0771241188049316,
1404
+ "learning_rate": 7.86669384427232e-06,
1405
+ "loss": 1.3261,
1406
+ "step": 15700
1407
+ },
1408
+ {
1409
+ "epoch": 1.07,
1410
+ "grad_norm": 1.8625783920288086,
1411
+ "learning_rate": 7.853105041445849e-06,
1412
+ "loss": 1.3308,
1413
+ "step": 15800
1414
+ },
1415
+ {
1416
+ "epoch": 1.08,
1417
+ "grad_norm": 1.8347725868225098,
1418
+ "learning_rate": 7.839516238619378e-06,
1419
+ "loss": 1.3637,
1420
+ "step": 15900
1421
+ },
1422
+ {
1423
+ "epoch": 1.09,
1424
+ "grad_norm": 1.9449338912963867,
1425
+ "learning_rate": 7.825927435792908e-06,
1426
+ "loss": 1.2906,
1427
+ "step": 16000
1428
+ },
1429
+ {
1430
+ "epoch": 1.09,
1431
+ "eval_loss": 1.3169829845428467,
1432
+ "eval_runtime": 70.9704,
1433
+ "eval_samples_per_second": 35.832,
1434
+ "eval_steps_per_second": 4.481,
1435
+ "step": 16000
1436
+ },
1437
+ {
1438
+ "epoch": 1.09,
1439
+ "grad_norm": 1.6746830940246582,
1440
+ "learning_rate": 7.812338632966436e-06,
1441
+ "loss": 1.3326,
1442
+ "step": 16100
1443
+ },
1444
+ {
1445
+ "epoch": 1.1,
1446
+ "grad_norm": 1.5581905841827393,
1447
+ "learning_rate": 7.798749830139966e-06,
1448
+ "loss": 1.2964,
1449
+ "step": 16200
1450
+ },
1451
+ {
1452
+ "epoch": 1.11,
1453
+ "grad_norm": 2.1636734008789062,
1454
+ "learning_rate": 7.785161027313495e-06,
1455
+ "loss": 1.2867,
1456
+ "step": 16300
1457
+ },
1458
+ {
1459
+ "epoch": 1.11,
1460
+ "grad_norm": 1.760335922241211,
1461
+ "learning_rate": 7.771572224487023e-06,
1462
+ "loss": 1.3017,
1463
+ "step": 16400
1464
+ },
1465
+ {
1466
+ "epoch": 1.12,
1467
+ "grad_norm": 1.9209500551223755,
1468
+ "learning_rate": 7.757983421660553e-06,
1469
+ "loss": 1.331,
1470
+ "step": 16500
1471
+ },
1472
+ {
1473
+ "epoch": 1.12,
1474
+ "eval_loss": 1.3085339069366455,
1475
+ "eval_runtime": 70.9762,
1476
+ "eval_samples_per_second": 35.829,
1477
+ "eval_steps_per_second": 4.48,
1478
+ "step": 16500
1479
+ },
1480
+ {
1481
+ "epoch": 1.13,
1482
+ "grad_norm": 1.8936728239059448,
1483
+ "learning_rate": 7.74439461883408e-06,
1484
+ "loss": 1.3494,
1485
+ "step": 16600
1486
+ },
1487
+ {
1488
+ "epoch": 1.13,
1489
+ "grad_norm": 1.6603670120239258,
1490
+ "learning_rate": 7.73080581600761e-06,
1491
+ "loss": 1.3443,
1492
+ "step": 16700
1493
+ },
1494
+ {
1495
+ "epoch": 1.14,
1496
+ "grad_norm": 1.9962695837020874,
1497
+ "learning_rate": 7.71721701318114e-06,
1498
+ "loss": 1.3016,
1499
+ "step": 16800
1500
+ },
1501
+ {
1502
+ "epoch": 1.15,
1503
+ "grad_norm": 1.7902451753616333,
1504
+ "learning_rate": 7.70362821035467e-06,
1505
+ "loss": 1.3107,
1506
+ "step": 16900
1507
+ },
1508
+ {
1509
+ "epoch": 1.16,
1510
+ "grad_norm": 1.7962889671325684,
1511
+ "learning_rate": 7.690039407528197e-06,
1512
+ "loss": 1.3082,
1513
+ "step": 17000
1514
+ },
1515
+ {
1516
+ "epoch": 1.16,
1517
+ "eval_loss": 1.3007583618164062,
1518
+ "eval_runtime": 70.8534,
1519
+ "eval_samples_per_second": 35.891,
1520
+ "eval_steps_per_second": 4.488,
1521
+ "step": 17000
1522
+ },
1523
+ {
1524
+ "epoch": 1.16,
1525
+ "grad_norm": 1.8448220491409302,
1526
+ "learning_rate": 7.676450604701725e-06,
1527
+ "loss": 1.3148,
1528
+ "step": 17100
1529
+ },
1530
+ {
1531
+ "epoch": 1.17,
1532
+ "grad_norm": 2.124708652496338,
1533
+ "learning_rate": 7.662861801875255e-06,
1534
+ "loss": 1.3348,
1535
+ "step": 17200
1536
+ },
1537
+ {
1538
+ "epoch": 1.18,
1539
+ "grad_norm": 1.5953021049499512,
1540
+ "learning_rate": 7.649272999048785e-06,
1541
+ "loss": 1.2575,
1542
+ "step": 17300
1543
+ },
1544
+ {
1545
+ "epoch": 1.18,
1546
+ "grad_norm": 1.7431753873825073,
1547
+ "learning_rate": 7.635684196222314e-06,
1548
+ "loss": 1.3239,
1549
+ "step": 17400
1550
+ },
1551
+ {
1552
+ "epoch": 1.19,
1553
+ "grad_norm": 2.0628209114074707,
1554
+ "learning_rate": 7.622095393395842e-06,
1555
+ "loss": 1.2904,
1556
+ "step": 17500
1557
+ },
1558
+ {
1559
+ "epoch": 1.19,
1560
+ "eval_loss": 1.2971383333206177,
1561
+ "eval_runtime": 70.8465,
1562
+ "eval_samples_per_second": 35.894,
1563
+ "eval_steps_per_second": 4.489,
1564
+ "step": 17500
1565
+ },
1566
+ {
1567
+ "epoch": 1.2,
1568
+ "grad_norm": 2.154141902923584,
1569
+ "learning_rate": 7.608506590569371e-06,
1570
+ "loss": 1.2824,
1571
+ "step": 17600
1572
+ },
1573
+ {
1574
+ "epoch": 1.2,
1575
+ "grad_norm": 1.7325314283370972,
1576
+ "learning_rate": 7.595053675771166e-06,
1577
+ "loss": 1.2587,
1578
+ "step": 17700
1579
+ },
1580
+ {
1581
+ "epoch": 1.21,
1582
+ "grad_norm": 1.7533044815063477,
1583
+ "learning_rate": 7.581464872944694e-06,
1584
+ "loss": 1.2697,
1585
+ "step": 17800
1586
+ },
1587
+ {
1588
+ "epoch": 1.22,
1589
+ "grad_norm": 1.642408847808838,
1590
+ "learning_rate": 7.567876070118223e-06,
1591
+ "loss": 1.2587,
1592
+ "step": 17900
1593
+ },
1594
+ {
1595
+ "epoch": 1.22,
1596
+ "grad_norm": 2.0030946731567383,
1597
+ "learning_rate": 7.554287267291752e-06,
1598
+ "loss": 1.2825,
1599
+ "step": 18000
1600
+ },
1601
+ {
1602
+ "epoch": 1.22,
1603
+ "eval_loss": 1.2882283926010132,
1604
+ "eval_runtime": 70.9711,
1605
+ "eval_samples_per_second": 35.831,
1606
+ "eval_steps_per_second": 4.481,
1607
+ "step": 18000
1608
+ },
1609
+ {
1610
+ "epoch": 1.23,
1611
+ "grad_norm": 1.828447699546814,
1612
+ "learning_rate": 7.540698464465281e-06,
1613
+ "loss": 1.2162,
1614
+ "step": 18100
1615
+ },
1616
+ {
1617
+ "epoch": 1.24,
1618
+ "grad_norm": 1.9078677892684937,
1619
+ "learning_rate": 7.527109661638811e-06,
1620
+ "loss": 1.2618,
1621
+ "step": 18200
1622
+ },
1623
+ {
1624
+ "epoch": 1.24,
1625
+ "grad_norm": 1.7438205480575562,
1626
+ "learning_rate": 7.513520858812339e-06,
1627
+ "loss": 1.2972,
1628
+ "step": 18300
1629
+ },
1630
+ {
1631
+ "epoch": 1.25,
1632
+ "grad_norm": 1.5308886766433716,
1633
+ "learning_rate": 7.499932055985868e-06,
1634
+ "loss": 1.2635,
1635
+ "step": 18400
1636
+ },
1637
+ {
1638
+ "epoch": 1.26,
1639
+ "grad_norm": 1.7570804357528687,
1640
+ "learning_rate": 7.486343253159397e-06,
1641
+ "loss": 1.3104,
1642
+ "step": 18500
1643
+ },
1644
+ {
1645
+ "epoch": 1.26,
1646
+ "eval_loss": 1.2821784019470215,
1647
+ "eval_runtime": 70.9616,
1648
+ "eval_samples_per_second": 35.836,
1649
+ "eval_steps_per_second": 4.481,
1650
+ "step": 18500
1651
+ },
1652
+ {
1653
+ "epoch": 1.26,
1654
+ "grad_norm": 1.820691466331482,
1655
+ "learning_rate": 7.472754450332927e-06,
1656
+ "loss": 1.2978,
1657
+ "step": 18600
1658
+ },
1659
+ {
1660
+ "epoch": 1.27,
1661
+ "grad_norm": 2.0996739864349365,
1662
+ "learning_rate": 7.4591656475064555e-06,
1663
+ "loss": 1.2925,
1664
+ "step": 18700
1665
+ },
1666
+ {
1667
+ "epoch": 1.28,
1668
+ "grad_norm": 1.6602342128753662,
1669
+ "learning_rate": 7.445576844679985e-06,
1670
+ "loss": 1.243,
1671
+ "step": 18800
1672
+ },
1673
+ {
1674
+ "epoch": 1.28,
1675
+ "grad_norm": 2.034649133682251,
1676
+ "learning_rate": 7.431988041853513e-06,
1677
+ "loss": 1.2775,
1678
+ "step": 18900
1679
+ },
1680
+ {
1681
+ "epoch": 1.29,
1682
+ "grad_norm": 1.898582100868225,
1683
+ "learning_rate": 7.418399239027042e-06,
1684
+ "loss": 1.2786,
1685
+ "step": 19000
1686
+ },
1687
+ {
1688
+ "epoch": 1.29,
1689
+ "eval_loss": 1.2745426893234253,
1690
+ "eval_runtime": 70.9759,
1691
+ "eval_samples_per_second": 35.829,
1692
+ "eval_steps_per_second": 4.48,
1693
+ "step": 19000
1694
+ },
1695
+ {
1696
+ "epoch": 1.3,
1697
+ "grad_norm": 1.4128785133361816,
1698
+ "learning_rate": 7.4048104362005715e-06,
1699
+ "loss": 1.2656,
1700
+ "step": 19100
1701
+ },
1702
+ {
1703
+ "epoch": 1.3,
1704
+ "grad_norm": 2.2971534729003906,
1705
+ "learning_rate": 7.3912216333741e-06,
1706
+ "loss": 1.2426,
1707
+ "step": 19200
1708
+ },
1709
+ {
1710
+ "epoch": 1.31,
1711
+ "grad_norm": 1.783996820449829,
1712
+ "learning_rate": 7.37763283054763e-06,
1713
+ "loss": 1.272,
1714
+ "step": 19300
1715
+ },
1716
+ {
1717
+ "epoch": 1.32,
1718
+ "grad_norm": 1.8958848714828491,
1719
+ "learning_rate": 7.364044027721159e-06,
1720
+ "loss": 1.2168,
1721
+ "step": 19400
1722
+ },
1723
+ {
1724
+ "epoch": 1.32,
1725
+ "grad_norm": 2.1363043785095215,
1726
+ "learning_rate": 7.350455224894687e-06,
1727
+ "loss": 1.2734,
1728
+ "step": 19500
1729
+ },
1730
+ {
1731
+ "epoch": 1.32,
1732
+ "eval_loss": 1.2699941396713257,
1733
+ "eval_runtime": 71.0334,
1734
+ "eval_samples_per_second": 35.8,
1735
+ "eval_steps_per_second": 4.477,
1736
+ "step": 19500
1737
+ },
1738
+ {
1739
+ "epoch": 1.33,
1740
+ "grad_norm": 2.642695903778076,
1741
+ "learning_rate": 7.336866422068216e-06,
1742
+ "loss": 1.3135,
1743
+ "step": 19600
1744
+ },
1745
+ {
1746
+ "epoch": 1.34,
1747
+ "grad_norm": 1.8240422010421753,
1748
+ "learning_rate": 7.323277619241746e-06,
1749
+ "loss": 1.2924,
1750
+ "step": 19700
1751
+ },
1752
+ {
1753
+ "epoch": 1.35,
1754
+ "grad_norm": 1.8623692989349365,
1755
+ "learning_rate": 7.309688816415275e-06,
1756
+ "loss": 1.2907,
1757
+ "step": 19800
1758
+ },
1759
+ {
1760
+ "epoch": 1.35,
1761
+ "grad_norm": 2.2778708934783936,
1762
+ "learning_rate": 7.296100013588804e-06,
1763
+ "loss": 1.2727,
1764
+ "step": 19900
1765
+ },
1766
+ {
1767
+ "epoch": 1.36,
1768
+ "grad_norm": 1.8957061767578125,
1769
+ "learning_rate": 7.282511210762332e-06,
1770
+ "loss": 1.2656,
1771
+ "step": 20000
1772
+ },
1773
+ {
1774
+ "epoch": 1.36,
1775
+ "eval_loss": 1.2644336223602295,
1776
+ "eval_runtime": 70.9373,
1777
+ "eval_samples_per_second": 35.849,
1778
+ "eval_steps_per_second": 4.483,
1779
+ "step": 20000
1780
+ },
1781
+ {
1782
+ "epoch": 1.37,
1783
+ "grad_norm": 1.7855497598648071,
1784
+ "learning_rate": 7.268922407935861e-06,
1785
+ "loss": 1.2158,
1786
+ "step": 20100
1787
+ },
1788
+ {
1789
+ "epoch": 1.37,
1790
+ "grad_norm": 1.8924943208694458,
1791
+ "learning_rate": 7.2553336051093905e-06,
1792
+ "loss": 1.2753,
1793
+ "step": 20200
1794
+ },
1795
+ {
1796
+ "epoch": 1.38,
1797
+ "grad_norm": 2.0177762508392334,
1798
+ "learning_rate": 7.241880690311184e-06,
1799
+ "loss": 1.2406,
1800
+ "step": 20300
1801
+ },
1802
+ {
1803
+ "epoch": 1.39,
1804
+ "grad_norm": 2.4161458015441895,
1805
+ "learning_rate": 7.228291887484713e-06,
1806
+ "loss": 1.2453,
1807
+ "step": 20400
1808
+ },
1809
+ {
1810
+ "epoch": 1.39,
1811
+ "grad_norm": 1.7210235595703125,
1812
+ "learning_rate": 7.214703084658242e-06,
1813
+ "loss": 1.2107,
1814
+ "step": 20500
1815
+ },
1816
+ {
1817
+ "epoch": 1.39,
1818
+ "eval_loss": 1.258453607559204,
1819
+ "eval_runtime": 71.0477,
1820
+ "eval_samples_per_second": 35.793,
1821
+ "eval_steps_per_second": 4.476,
1822
+ "step": 20500
1823
+ },
1824
+ {
1825
+ "epoch": 1.4,
1826
+ "grad_norm": 1.5457173585891724,
1827
+ "learning_rate": 7.201114281831771e-06,
1828
+ "loss": 1.2377,
1829
+ "step": 20600
1830
+ },
1831
+ {
1832
+ "epoch": 1.41,
1833
+ "grad_norm": 2.403831720352173,
1834
+ "learning_rate": 7.187525479005301e-06,
1835
+ "loss": 1.238,
1836
+ "step": 20700
1837
+ },
1838
+ {
1839
+ "epoch": 1.41,
1840
+ "grad_norm": 2.01042103767395,
1841
+ "learning_rate": 7.173936676178829e-06,
1842
+ "loss": 1.2528,
1843
+ "step": 20800
1844
+ },
1845
+ {
1846
+ "epoch": 1.42,
1847
+ "grad_norm": 2.197006940841675,
1848
+ "learning_rate": 7.160347873352358e-06,
1849
+ "loss": 1.2395,
1850
+ "step": 20900
1851
+ },
1852
+ {
1853
+ "epoch": 1.43,
1854
+ "grad_norm": 2.503634214401245,
1855
+ "learning_rate": 7.146759070525887e-06,
1856
+ "loss": 1.2822,
1857
+ "step": 21000
1858
+ },
1859
+ {
1860
+ "epoch": 1.43,
1861
+ "eval_loss": 1.2508896589279175,
1862
+ "eval_runtime": 71.0512,
1863
+ "eval_samples_per_second": 35.791,
1864
+ "eval_steps_per_second": 4.476,
1865
+ "step": 21000
1866
+ },
1867
+ {
1868
+ "epoch": 1.43,
1869
+ "grad_norm": 1.4275486469268799,
1870
+ "learning_rate": 7.133170267699417e-06,
1871
+ "loss": 1.2337,
1872
+ "step": 21100
1873
+ },
1874
+ {
1875
+ "epoch": 1.44,
1876
+ "grad_norm": 2.1461949348449707,
1877
+ "learning_rate": 7.11971735290121e-06,
1878
+ "loss": 1.2576,
1879
+ "step": 21200
1880
+ },
1881
+ {
1882
+ "epoch": 1.45,
1883
+ "grad_norm": 1.705665111541748,
1884
+ "learning_rate": 7.106128550074739e-06,
1885
+ "loss": 1.2311,
1886
+ "step": 21300
1887
+ },
1888
+ {
1889
+ "epoch": 1.45,
1890
+ "grad_norm": 2.058223247528076,
1891
+ "learning_rate": 7.0925397472482685e-06,
1892
+ "loss": 1.2619,
1893
+ "step": 21400
1894
+ },
1895
+ {
1896
+ "epoch": 1.46,
1897
+ "grad_norm": 1.4659618139266968,
1898
+ "learning_rate": 7.078950944421797e-06,
1899
+ "loss": 1.2188,
1900
+ "step": 21500
1901
+ },
1902
+ {
1903
+ "epoch": 1.46,
1904
+ "eval_loss": 1.2478315830230713,
1905
+ "eval_runtime": 70.8088,
1906
+ "eval_samples_per_second": 35.914,
1907
+ "eval_steps_per_second": 4.491,
1908
+ "step": 21500
1909
+ },
1910
+ {
1911
+ "epoch": 1.47,
1912
+ "grad_norm": 1.7102398872375488,
1913
+ "learning_rate": 7.065362141595325e-06,
1914
+ "loss": 1.2553,
1915
+ "step": 21600
1916
+ },
1917
+ {
1918
+ "epoch": 1.47,
1919
+ "grad_norm": 2.445326089859009,
1920
+ "learning_rate": 7.051773338768855e-06,
1921
+ "loss": 1.2687,
1922
+ "step": 21700
1923
+ },
1924
+ {
1925
+ "epoch": 1.48,
1926
+ "grad_norm": 2.055088758468628,
1927
+ "learning_rate": 7.038184535942384e-06,
1928
+ "loss": 1.254,
1929
+ "step": 21800
1930
+ },
1931
+ {
1932
+ "epoch": 1.49,
1933
+ "grad_norm": 2.5781538486480713,
1934
+ "learning_rate": 7.024595733115913e-06,
1935
+ "loss": 1.2319,
1936
+ "step": 21900
1937
+ },
1938
+ {
1939
+ "epoch": 1.49,
1940
+ "grad_norm": 1.9507685899734497,
1941
+ "learning_rate": 7.011006930289442e-06,
1942
+ "loss": 1.2185,
1943
+ "step": 22000
1944
+ },
1945
+ {
1946
+ "epoch": 1.49,
1947
+ "eval_loss": 1.245086908340454,
1948
+ "eval_runtime": 71.042,
1949
+ "eval_samples_per_second": 35.796,
1950
+ "eval_steps_per_second": 4.476,
1951
+ "step": 22000
1952
+ },
1953
+ {
1954
+ "epoch": 1.5,
1955
+ "grad_norm": 2.347245216369629,
1956
+ "learning_rate": 6.997418127462972e-06,
1957
+ "loss": 1.2024,
1958
+ "step": 22100
1959
+ },
1960
+ {
1961
+ "epoch": 1.51,
1962
+ "grad_norm": 1.6178568601608276,
1963
+ "learning_rate": 6.9838293246364995e-06,
1964
+ "loss": 1.2379,
1965
+ "step": 22200
1966
+ },
1967
+ {
1968
+ "epoch": 1.52,
1969
+ "grad_norm": 1.7551498413085938,
1970
+ "learning_rate": 6.970240521810029e-06,
1971
+ "loss": 1.2344,
1972
+ "step": 22300
1973
+ },
1974
+ {
1975
+ "epoch": 1.52,
1976
+ "grad_norm": 1.9250737428665161,
1977
+ "learning_rate": 6.956651718983558e-06,
1978
+ "loss": 1.2942,
1979
+ "step": 22400
1980
+ },
1981
+ {
1982
+ "epoch": 1.53,
1983
+ "grad_norm": 1.640271782875061,
1984
+ "learning_rate": 6.9430629161570876e-06,
1985
+ "loss": 1.2441,
1986
+ "step": 22500
1987
+ },
1988
+ {
1989
+ "epoch": 1.53,
1990
+ "eval_loss": 1.2352341413497925,
1991
+ "eval_runtime": 70.9437,
1992
+ "eval_samples_per_second": 35.845,
1993
+ "eval_steps_per_second": 4.482,
1994
+ "step": 22500
1995
+ },
1996
+ {
1997
+ "epoch": 1.54,
1998
+ "grad_norm": 2.083061456680298,
1999
+ "learning_rate": 6.929474113330616e-06,
2000
+ "loss": 1.1819,
2001
+ "step": 22600
2002
+ },
2003
+ {
2004
+ "epoch": 1.54,
2005
+ "grad_norm": 1.8274168968200684,
2006
+ "learning_rate": 6.915885310504146e-06,
2007
+ "loss": 1.2243,
2008
+ "step": 22700
2009
+ },
2010
+ {
2011
+ "epoch": 1.55,
2012
+ "grad_norm": 1.7711529731750488,
2013
+ "learning_rate": 6.902296507677674e-06,
2014
+ "loss": 1.2318,
2015
+ "step": 22800
2016
+ },
2017
+ {
2018
+ "epoch": 1.56,
2019
+ "grad_norm": 2.0537028312683105,
2020
+ "learning_rate": 6.888707704851203e-06,
2021
+ "loss": 1.1958,
2022
+ "step": 22900
2023
+ },
2024
+ {
2025
+ "epoch": 1.56,
2026
+ "grad_norm": 1.8466728925704956,
2027
+ "learning_rate": 6.875118902024732e-06,
2028
+ "loss": 1.2564,
2029
+ "step": 23000
2030
+ },
2031
+ {
2032
+ "epoch": 1.56,
2033
+ "eval_loss": 1.2326687574386597,
2034
+ "eval_runtime": 70.9545,
2035
+ "eval_samples_per_second": 35.84,
2036
+ "eval_steps_per_second": 4.482,
2037
+ "step": 23000
2038
+ },
2039
+ {
2040
+ "epoch": 1.57,
2041
+ "grad_norm": 1.6301406621932983,
2042
+ "learning_rate": 6.861530099198261e-06,
2043
+ "loss": 1.1904,
2044
+ "step": 23100
2045
+ },
2046
+ {
2047
+ "epoch": 1.58,
2048
+ "grad_norm": 2.1303887367248535,
2049
+ "learning_rate": 6.847941296371791e-06,
2050
+ "loss": 1.2,
2051
+ "step": 23200
2052
+ },
2053
+ {
2054
+ "epoch": 1.58,
2055
+ "grad_norm": 2.042210340499878,
2056
+ "learning_rate": 6.834352493545319e-06,
2057
+ "loss": 1.2432,
2058
+ "step": 23300
2059
+ },
2060
+ {
2061
+ "epoch": 1.59,
2062
+ "grad_norm": 1.8403574228286743,
2063
+ "learning_rate": 6.820763690718848e-06,
2064
+ "loss": 1.2195,
2065
+ "step": 23400
2066
+ },
2067
+ {
2068
+ "epoch": 1.6,
2069
+ "grad_norm": 1.8628817796707153,
2070
+ "learning_rate": 6.807174887892377e-06,
2071
+ "loss": 1.2032,
2072
+ "step": 23500
2073
+ },
2074
+ {
2075
+ "epoch": 1.6,
2076
+ "eval_loss": 1.2271267175674438,
2077
+ "eval_runtime": 70.9472,
2078
+ "eval_samples_per_second": 35.844,
2079
+ "eval_steps_per_second": 4.482,
2080
+ "step": 23500
2081
+ },
2082
+ {
2083
+ "epoch": 1.6,
2084
+ "grad_norm": 2.2309892177581787,
2085
+ "learning_rate": 6.793586085065907e-06,
2086
+ "loss": 1.1931,
2087
+ "step": 23600
2088
+ },
2089
+ {
2090
+ "epoch": 1.61,
2091
+ "grad_norm": 1.4337612390518188,
2092
+ "learning_rate": 6.7799972822394354e-06,
2093
+ "loss": 1.2401,
2094
+ "step": 23700
2095
+ },
2096
+ {
2097
+ "epoch": 1.62,
2098
+ "grad_norm": 1.7968145608901978,
2099
+ "learning_rate": 6.766408479412965e-06,
2100
+ "loss": 1.233,
2101
+ "step": 23800
2102
+ },
2103
+ {
2104
+ "epoch": 1.62,
2105
+ "grad_norm": 1.7918980121612549,
2106
+ "learning_rate": 6.7529555646147584e-06,
2107
+ "loss": 1.1874,
2108
+ "step": 23900
2109
+ },
2110
+ {
2111
+ "epoch": 1.63,
2112
+ "grad_norm": 1.9370090961456299,
2113
+ "learning_rate": 6.739366761788287e-06,
2114
+ "loss": 1.2031,
2115
+ "step": 24000
2116
+ },
2117
+ {
2118
+ "epoch": 1.63,
2119
+ "eval_loss": 1.2228479385375977,
2120
+ "eval_runtime": 70.9391,
2121
+ "eval_samples_per_second": 35.848,
2122
+ "eval_steps_per_second": 4.483,
2123
+ "step": 24000
2124
+ },
2125
+ {
2126
+ "epoch": 1.64,
2127
+ "grad_norm": 2.238128900527954,
2128
+ "learning_rate": 6.725777958961815e-06,
2129
+ "loss": 1.2207,
2130
+ "step": 24100
2131
+ },
2132
+ {
2133
+ "epoch": 1.64,
2134
+ "grad_norm": 1.9183790683746338,
2135
+ "learning_rate": 6.712189156135345e-06,
2136
+ "loss": 1.2263,
2137
+ "step": 24200
2138
+ },
2139
+ {
2140
+ "epoch": 1.65,
2141
+ "grad_norm": 2.407428026199341,
2142
+ "learning_rate": 6.6986003533088736e-06,
2143
+ "loss": 1.2424,
2144
+ "step": 24300
2145
+ },
2146
+ {
2147
+ "epoch": 1.66,
2148
+ "grad_norm": 1.837365746498108,
2149
+ "learning_rate": 6.685011550482403e-06,
2150
+ "loss": 1.1832,
2151
+ "step": 24400
2152
+ },
2153
+ {
2154
+ "epoch": 1.66,
2155
+ "grad_norm": 1.9926724433898926,
2156
+ "learning_rate": 6.671422747655932e-06,
2157
+ "loss": 1.2088,
2158
+ "step": 24500
2159
+ },
2160
+ {
2161
+ "epoch": 1.66,
2162
+ "eval_loss": 1.2178888320922852,
2163
+ "eval_runtime": 71.0012,
2164
+ "eval_samples_per_second": 35.816,
2165
+ "eval_steps_per_second": 4.479,
2166
+ "step": 24500
2167
+ },
2168
+ {
2169
+ "epoch": 1.67,
2170
+ "grad_norm": 1.6491261720657349,
2171
+ "learning_rate": 6.657833944829462e-06,
2172
+ "loss": 1.2103,
2173
+ "step": 24600
2174
+ },
2175
+ {
2176
+ "epoch": 1.68,
2177
+ "grad_norm": 1.825826644897461,
2178
+ "learning_rate": 6.6442451420029895e-06,
2179
+ "loss": 1.1911,
2180
+ "step": 24700
2181
+ },
2182
+ {
2183
+ "epoch": 1.69,
2184
+ "grad_norm": 1.788091778755188,
2185
+ "learning_rate": 6.630656339176519e-06,
2186
+ "loss": 1.2246,
2187
+ "step": 24800
2188
+ },
2189
+ {
2190
+ "epoch": 1.69,
2191
+ "grad_norm": 1.8941233158111572,
2192
+ "learning_rate": 6.617067536350048e-06,
2193
+ "loss": 1.197,
2194
+ "step": 24900
2195
+ },
2196
+ {
2197
+ "epoch": 1.7,
2198
+ "grad_norm": 2.360272169113159,
2199
+ "learning_rate": 6.6034787335235775e-06,
2200
+ "loss": 1.1925,
2201
+ "step": 25000
2202
+ },
2203
+ {
2204
+ "epoch": 1.7,
2205
+ "eval_loss": 1.2120610475540161,
2206
+ "eval_runtime": 70.9625,
2207
+ "eval_samples_per_second": 35.836,
2208
+ "eval_steps_per_second": 4.481,
2209
+ "step": 25000
2210
+ },
2211
+ {
2212
+ "epoch": 1.71,
2213
+ "grad_norm": 2.0026679039001465,
2214
+ "learning_rate": 6.589889930697106e-06,
2215
+ "loss": 1.2063,
2216
+ "step": 25100
2217
+ },
2218
+ {
2219
+ "epoch": 1.71,
2220
+ "grad_norm": 1.9979290962219238,
2221
+ "learning_rate": 6.576301127870636e-06,
2222
+ "loss": 1.1784,
2223
+ "step": 25200
2224
+ },
2225
+ {
2226
+ "epoch": 1.72,
2227
+ "grad_norm": 1.682900071144104,
2228
+ "learning_rate": 6.562712325044164e-06,
2229
+ "loss": 1.1587,
2230
+ "step": 25300
2231
+ },
2232
+ {
2233
+ "epoch": 1.73,
2234
+ "grad_norm": 2.0586678981781006,
2235
+ "learning_rate": 6.549123522217693e-06,
2236
+ "loss": 1.2031,
2237
+ "step": 25400
2238
+ },
2239
+ {
2240
+ "epoch": 1.73,
2241
+ "grad_norm": 2.5424463748931885,
2242
+ "learning_rate": 6.535534719391222e-06,
2243
+ "loss": 1.2061,
2244
+ "step": 25500
2245
+ },
2246
+ {
2247
+ "epoch": 1.73,
2248
+ "eval_loss": 1.209425449371338,
2249
+ "eval_runtime": 70.8814,
2250
+ "eval_samples_per_second": 35.877,
2251
+ "eval_steps_per_second": 4.486,
2252
+ "step": 25500
2253
+ },
2254
+ {
2255
+ "epoch": 1.74,
2256
+ "grad_norm": 2.0070347785949707,
2257
+ "learning_rate": 6.521945916564751e-06,
2258
+ "loss": 1.2175,
2259
+ "step": 25600
2260
+ },
2261
+ {
2262
+ "epoch": 1.75,
2263
+ "grad_norm": 1.7913732528686523,
2264
+ "learning_rate": 6.508357113738281e-06,
2265
+ "loss": 1.2005,
2266
+ "step": 25700
2267
+ },
2268
+ {
2269
+ "epoch": 1.75,
2270
+ "grad_norm": 1.552306890487671,
2271
+ "learning_rate": 6.494768310911809e-06,
2272
+ "loss": 1.2011,
2273
+ "step": 25800
2274
+ },
2275
+ {
2276
+ "epoch": 1.76,
2277
+ "grad_norm": 3.1545894145965576,
2278
+ "learning_rate": 6.481179508085338e-06,
2279
+ "loss": 1.17,
2280
+ "step": 25900
2281
+ },
2282
+ {
2283
+ "epoch": 1.77,
2284
+ "grad_norm": 1.7653687000274658,
2285
+ "learning_rate": 6.467590705258867e-06,
2286
+ "loss": 1.1984,
2287
+ "step": 26000
2288
+ },
2289
+ {
2290
+ "epoch": 1.77,
2291
+ "eval_loss": 1.2038514614105225,
2292
+ "eval_runtime": 70.8478,
2293
+ "eval_samples_per_second": 35.894,
2294
+ "eval_steps_per_second": 4.488,
2295
+ "step": 26000
2296
+ },
2297
+ {
2298
+ "epoch": 1.77,
2299
+ "grad_norm": 1.9187010526657104,
2300
+ "learning_rate": 6.454001902432397e-06,
2301
+ "loss": 1.2051,
2302
+ "step": 26100
2303
+ },
2304
+ {
2305
+ "epoch": 1.78,
2306
+ "grad_norm": 1.6188615560531616,
2307
+ "learning_rate": 6.440413099605925e-06,
2308
+ "loss": 1.1712,
2309
+ "step": 26200
2310
+ },
2311
+ {
2312
+ "epoch": 1.79,
2313
+ "grad_norm": 1.9360331296920776,
2314
+ "learning_rate": 6.426824296779455e-06,
2315
+ "loss": 1.1531,
2316
+ "step": 26300
2317
+ },
2318
+ {
2319
+ "epoch": 1.79,
2320
+ "grad_norm": 2.710357189178467,
2321
+ "learning_rate": 6.413235493952983e-06,
2322
+ "loss": 1.1707,
2323
+ "step": 26400
2324
+ },
2325
+ {
2326
+ "epoch": 1.8,
2327
+ "grad_norm": 2.3331565856933594,
2328
+ "learning_rate": 6.399646691126512e-06,
2329
+ "loss": 1.1929,
2330
+ "step": 26500
2331
+ },
2332
+ {
2333
+ "epoch": 1.8,
2334
+ "eval_loss": 1.2011253833770752,
2335
+ "eval_runtime": 70.8706,
2336
+ "eval_samples_per_second": 35.882,
2337
+ "eval_steps_per_second": 4.487,
2338
+ "step": 26500
2339
+ },
2340
+ {
2341
+ "epoch": 1.81,
2342
+ "grad_norm": 2.030912399291992,
2343
+ "learning_rate": 6.386057888300041e-06,
2344
+ "loss": 1.1986,
2345
+ "step": 26600
2346
+ },
2347
+ {
2348
+ "epoch": 1.81,
2349
+ "grad_norm": 2.1584174633026123,
2350
+ "learning_rate": 6.37246908547357e-06,
2351
+ "loss": 1.2564,
2352
+ "step": 26700
2353
+ },
2354
+ {
2355
+ "epoch": 1.82,
2356
+ "grad_norm": 2.3068361282348633,
2357
+ "learning_rate": 6.3588802826471e-06,
2358
+ "loss": 1.1933,
2359
+ "step": 26800
2360
+ },
2361
+ {
2362
+ "epoch": 1.83,
2363
+ "grad_norm": 1.5643947124481201,
2364
+ "learning_rate": 6.3452914798206285e-06,
2365
+ "loss": 1.1691,
2366
+ "step": 26900
2367
+ },
2368
+ {
2369
+ "epoch": 1.83,
2370
+ "grad_norm": 2.531083822250366,
2371
+ "learning_rate": 6.331702676994157e-06,
2372
+ "loss": 1.1387,
2373
+ "step": 27000
2374
+ },
2375
+ {
2376
+ "epoch": 1.83,
2377
+ "eval_loss": 1.1969281435012817,
2378
+ "eval_runtime": 70.8053,
2379
+ "eval_samples_per_second": 35.915,
2380
+ "eval_steps_per_second": 4.491,
2381
+ "step": 27000
2382
+ },
2383
+ {
2384
+ "epoch": 1.84,
2385
+ "grad_norm": 1.800430417060852,
2386
+ "learning_rate": 6.318113874167686e-06,
2387
+ "loss": 1.1646,
2388
+ "step": 27100
2389
+ },
2390
+ {
2391
+ "epoch": 1.85,
2392
+ "grad_norm": 1.8300161361694336,
2393
+ "learning_rate": 6.304525071341216e-06,
2394
+ "loss": 1.2104,
2395
+ "step": 27200
2396
+ },
2397
+ {
2398
+ "epoch": 1.85,
2399
+ "grad_norm": 1.7128742933273315,
2400
+ "learning_rate": 6.2909362685147445e-06,
2401
+ "loss": 1.1932,
2402
+ "step": 27300
2403
+ },
2404
+ {
2405
+ "epoch": 1.86,
2406
+ "grad_norm": 1.9414857625961304,
2407
+ "learning_rate": 6.277347465688274e-06,
2408
+ "loss": 1.1818,
2409
+ "step": 27400
2410
+ },
2411
+ {
2412
+ "epoch": 1.87,
2413
+ "grad_norm": 1.9951707124710083,
2414
+ "learning_rate": 6.263758662861802e-06,
2415
+ "loss": 1.2024,
2416
+ "step": 27500
2417
+ },
2418
+ {
2419
+ "epoch": 1.87,
2420
+ "eval_loss": 1.1932079792022705,
2421
+ "eval_runtime": 70.8009,
2422
+ "eval_samples_per_second": 35.918,
2423
+ "eval_steps_per_second": 4.491,
2424
+ "step": 27500
2425
+ },
2426
+ {
2427
+ "epoch": 1.88,
2428
+ "grad_norm": 1.9903321266174316,
2429
+ "learning_rate": 6.250169860035331e-06,
2430
+ "loss": 1.1824,
2431
+ "step": 27600
2432
+ },
2433
+ {
2434
+ "epoch": 1.88,
2435
+ "grad_norm": 2.8851983547210693,
2436
+ "learning_rate": 6.2365810572088605e-06,
2437
+ "loss": 1.1851,
2438
+ "step": 27700
2439
+ },
2440
+ {
2441
+ "epoch": 1.89,
2442
+ "grad_norm": 2.0669078826904297,
2443
+ "learning_rate": 6.222992254382389e-06,
2444
+ "loss": 1.1646,
2445
+ "step": 27800
2446
+ },
2447
+ {
2448
+ "epoch": 1.9,
2449
+ "grad_norm": 1.9089607000350952,
2450
+ "learning_rate": 6.209403451555919e-06,
2451
+ "loss": 1.1776,
2452
+ "step": 27900
2453
+ },
2454
+ {
2455
+ "epoch": 1.9,
2456
+ "grad_norm": 2.2551538944244385,
2457
+ "learning_rate": 6.195814648729448e-06,
2458
+ "loss": 1.1909,
2459
+ "step": 28000
2460
+ },
2461
+ {
2462
+ "epoch": 1.9,
2463
+ "eval_loss": 1.1877614259719849,
2464
+ "eval_runtime": 70.8691,
2465
+ "eval_samples_per_second": 35.883,
2466
+ "eval_steps_per_second": 4.487,
2467
+ "step": 28000
2468
+ },
2469
+ {
2470
+ "epoch": 1.91,
2471
+ "grad_norm": 1.5612294673919678,
2472
+ "learning_rate": 6.182225845902976e-06,
2473
+ "loss": 1.1559,
2474
+ "step": 28100
2475
+ },
2476
+ {
2477
+ "epoch": 1.92,
2478
+ "grad_norm": 2.8218579292297363,
2479
+ "learning_rate": 6.168637043076505e-06,
2480
+ "loss": 1.1652,
2481
+ "step": 28200
2482
+ },
2483
+ {
2484
+ "epoch": 1.92,
2485
+ "grad_norm": 1.9702138900756836,
2486
+ "learning_rate": 6.155048240250035e-06,
2487
+ "loss": 1.1917,
2488
+ "step": 28300
2489
+ },
2490
+ {
2491
+ "epoch": 1.93,
2492
+ "grad_norm": 2.3673105239868164,
2493
+ "learning_rate": 6.141459437423564e-06,
2494
+ "loss": 1.1336,
2495
+ "step": 28400
2496
+ },
2497
+ {
2498
+ "epoch": 1.94,
2499
+ "grad_norm": 1.8467798233032227,
2500
+ "learning_rate": 6.127870634597093e-06,
2501
+ "loss": 1.1786,
2502
+ "step": 28500
2503
+ },
2504
+ {
2505
+ "epoch": 1.94,
2506
+ "eval_loss": 1.1837141513824463,
2507
+ "eval_runtime": 70.7666,
2508
+ "eval_samples_per_second": 35.935,
2509
+ "eval_steps_per_second": 4.494,
2510
+ "step": 28500
2511
+ },
2512
+ {
2513
+ "epoch": 1.94,
2514
+ "grad_norm": 1.830837368965149,
2515
+ "learning_rate": 6.114281831770621e-06,
2516
+ "loss": 1.1908,
2517
+ "step": 28600
2518
+ },
2519
+ {
2520
+ "epoch": 1.95,
2521
+ "grad_norm": 1.7194899320602417,
2522
+ "learning_rate": 6.10069302894415e-06,
2523
+ "loss": 1.1541,
2524
+ "step": 28700
2525
+ },
2526
+ {
2527
+ "epoch": 1.96,
2528
+ "grad_norm": 1.798368215560913,
2529
+ "learning_rate": 6.0871042261176796e-06,
2530
+ "loss": 1.1487,
2531
+ "step": 28800
2532
+ },
2533
+ {
2534
+ "epoch": 1.96,
2535
+ "grad_norm": 1.9699339866638184,
2536
+ "learning_rate": 6.073515423291208e-06,
2537
+ "loss": 1.166,
2538
+ "step": 28900
2539
+ },
2540
+ {
2541
+ "epoch": 1.97,
2542
+ "grad_norm": 2.1379947662353516,
2543
+ "learning_rate": 6.059926620464738e-06,
2544
+ "loss": 1.1724,
2545
+ "step": 29000
2546
+ },
2547
+ {
2548
+ "epoch": 1.97,
2549
+ "eval_loss": 1.181123971939087,
2550
+ "eval_runtime": 92.5562,
2551
+ "eval_samples_per_second": 27.475,
2552
+ "eval_steps_per_second": 3.436,
2553
+ "step": 29000
2554
+ },
2555
+ {
2556
+ "epoch": 1.98,
2557
+ "grad_norm": 2.405134439468384,
2558
+ "learning_rate": 6.046337817638267e-06,
2559
+ "loss": 1.1651,
2560
+ "step": 29100
2561
+ },
2562
+ {
2563
+ "epoch": 1.98,
2564
+ "grad_norm": 1.8744902610778809,
2565
+ "learning_rate": 6.032749014811795e-06,
2566
+ "loss": 1.201,
2567
+ "step": 29200
2568
+ },
2569
+ {
2570
+ "epoch": 1.99,
2571
+ "grad_norm": 3.014401435852051,
2572
+ "learning_rate": 6.019160211985324e-06,
2573
+ "loss": 1.1783,
2574
+ "step": 29300
2575
+ },
2576
+ {
2577
+ "epoch": 2.0,
2578
+ "grad_norm": 2.104191780090332,
2579
+ "learning_rate": 6.005571409158854e-06,
2580
+ "loss": 1.1479,
2581
+ "step": 29400
2582
+ },
2583
+ {
2584
+ "epoch": 2.0,
2585
+ "grad_norm": 1.9670746326446533,
2586
+ "learning_rate": 5.991982606332383e-06,
2587
+ "loss": 1.1372,
2588
+ "step": 29500
2589
+ },
2590
+ {
2591
+ "epoch": 2.0,
2592
+ "eval_loss": 1.176620602607727,
2593
+ "eval_runtime": 92.464,
2594
+ "eval_samples_per_second": 27.503,
2595
+ "eval_steps_per_second": 3.439,
2596
+ "step": 29500
2597
+ },
2598
+ {
2599
+ "epoch": 2.01,
2600
+ "grad_norm": 2.3720388412475586,
2601
+ "learning_rate": 5.978393803505912e-06,
2602
+ "loss": 1.1476,
2603
+ "step": 29600
2604
+ },
2605
+ {
2606
+ "epoch": 2.02,
2607
+ "grad_norm": 2.047060251235962,
2608
+ "learning_rate": 5.964805000679441e-06,
2609
+ "loss": 1.1303,
2610
+ "step": 29700
2611
+ },
2612
+ {
2613
+ "epoch": 2.02,
2614
+ "grad_norm": 1.9792667627334595,
2615
+ "learning_rate": 5.951216197852969e-06,
2616
+ "loss": 1.1462,
2617
+ "step": 29800
2618
+ },
2619
+ {
2620
+ "epoch": 2.03,
2621
+ "grad_norm": 2.241187572479248,
2622
+ "learning_rate": 5.937627395026499e-06,
2623
+ "loss": 1.1031,
2624
+ "step": 29900
2625
+ },
2626
+ {
2627
+ "epoch": 2.04,
2628
+ "grad_norm": 2.0233969688415527,
2629
+ "learning_rate": 5.9240385922000274e-06,
2630
+ "loss": 1.1396,
2631
+ "step": 30000
2632
+ },
2633
+ {
2634
+ "epoch": 2.04,
2635
+ "eval_loss": 1.1728562116622925,
2636
+ "eval_runtime": 92.5707,
2637
+ "eval_samples_per_second": 27.471,
2638
+ "eval_steps_per_second": 3.435,
2639
+ "step": 30000
2640
+ },
2641
+ {
2642
+ "epoch": 2.05,
2643
+ "grad_norm": 2.0683882236480713,
2644
+ "learning_rate": 5.910449789373557e-06,
2645
+ "loss": 1.1431,
2646
+ "step": 30100
2647
+ },
2648
+ {
2649
+ "epoch": 2.05,
2650
+ "grad_norm": 1.9208968877792358,
2651
+ "learning_rate": 5.896860986547086e-06,
2652
+ "loss": 1.1391,
2653
+ "step": 30200
2654
+ },
2655
+ {
2656
+ "epoch": 2.06,
2657
+ "grad_norm": 1.6621592044830322,
2658
+ "learning_rate": 5.883272183720614e-06,
2659
+ "loss": 1.1361,
2660
+ "step": 30300
2661
+ },
2662
+ {
2663
+ "epoch": 2.07,
2664
+ "grad_norm": 1.9728045463562012,
2665
+ "learning_rate": 5.869683380894143e-06,
2666
+ "loss": 1.1606,
2667
+ "step": 30400
2668
+ },
2669
+ {
2670
+ "epoch": 2.07,
2671
+ "grad_norm": 2.3189892768859863,
2672
+ "learning_rate": 5.856094578067672e-06,
2673
+ "loss": 1.1565,
2674
+ "step": 30500
2675
+ },
2676
+ {
2677
+ "epoch": 2.07,
2678
+ "eval_loss": 1.1692627668380737,
2679
+ "eval_runtime": 92.5367,
2680
+ "eval_samples_per_second": 27.481,
2681
+ "eval_steps_per_second": 3.436,
2682
+ "step": 30500
2683
+ },
2684
+ {
2685
+ "epoch": 2.08,
2686
+ "grad_norm": 1.936689853668213,
2687
+ "learning_rate": 5.842505775241202e-06,
2688
+ "loss": 1.1294,
2689
+ "step": 30600
2690
+ },
2691
+ {
2692
+ "epoch": 2.09,
2693
+ "grad_norm": 1.4617129564285278,
2694
+ "learning_rate": 5.8289169724147306e-06,
2695
+ "loss": 1.1591,
2696
+ "step": 30700
2697
+ },
2698
+ {
2699
+ "epoch": 2.09,
2700
+ "grad_norm": 1.5474071502685547,
2701
+ "learning_rate": 5.81532816958826e-06,
2702
+ "loss": 1.1377,
2703
+ "step": 30800
2704
+ },
2705
+ {
2706
+ "epoch": 2.1,
2707
+ "grad_norm": 1.7175779342651367,
2708
+ "learning_rate": 5.801739366761788e-06,
2709
+ "loss": 1.1532,
2710
+ "step": 30900
2711
+ },
2712
+ {
2713
+ "epoch": 2.11,
2714
+ "grad_norm": 1.8924795389175415,
2715
+ "learning_rate": 5.788150563935318e-06,
2716
+ "loss": 1.1002,
2717
+ "step": 31000
2718
+ },
2719
+ {
2720
+ "epoch": 2.11,
2721
+ "eval_loss": 1.1667861938476562,
2722
+ "eval_runtime": 92.475,
2723
+ "eval_samples_per_second": 27.499,
2724
+ "eval_steps_per_second": 3.439,
2725
+ "step": 31000
2726
+ },
2727
+ {
2728
+ "epoch": 2.11,
2729
+ "grad_norm": 2.3616528511047363,
2730
+ "learning_rate": 5.7745617611088465e-06,
2731
+ "loss": 1.1616,
2732
+ "step": 31100
2733
+ },
2734
+ {
2735
+ "epoch": 2.12,
2736
+ "grad_norm": 1.7967276573181152,
2737
+ "learning_rate": 5.760972958282376e-06,
2738
+ "loss": 1.1817,
2739
+ "step": 31200
2740
+ },
2741
+ {
2742
+ "epoch": 2.13,
2743
+ "grad_norm": 2.9053776264190674,
2744
+ "learning_rate": 5.747384155455905e-06,
2745
+ "loss": 1.1611,
2746
+ "step": 31300
2747
+ },
2748
+ {
2749
+ "epoch": 2.13,
2750
+ "grad_norm": 2.2042810916900635,
2751
+ "learning_rate": 5.7337953526294346e-06,
2752
+ "loss": 1.1394,
2753
+ "step": 31400
2754
+ },
2755
+ {
2756
+ "epoch": 2.14,
2757
+ "grad_norm": 1.8876034021377563,
2758
+ "learning_rate": 5.7202065498029625e-06,
2759
+ "loss": 1.1171,
2760
+ "step": 31500
2761
+ },
2762
+ {
2763
+ "epoch": 2.14,
2764
+ "eval_loss": 1.1626156568527222,
2765
+ "eval_runtime": 92.4663,
2766
+ "eval_samples_per_second": 27.502,
2767
+ "eval_steps_per_second": 3.439,
2768
+ "step": 31500
2769
+ }
2770
+ ],
2771
+ "logging_steps": 100,
2772
+ "max_steps": 73590,
2773
+ "num_input_tokens_seen": 0,
2774
+ "num_train_epochs": 5,
2775
+ "save_steps": 500,
2776
+ "total_flos": 1.55848975386624e+17,
2777
+ "train_batch_size": 8,
2778
+ "trial_name": null,
2779
+ "trial_params": null
2780
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98498687363a9e67d3d945942685e0ff1f43befa20a4ec5f62ad97f31e3e5d02
3
+ size 5048
vocab.json ADDED
The diff for this file is too large to render. See raw diff