pszemraj committed on
Commit
2fdd5b7
1 Parent(s): bd10ec8

update ckpt with 4+ epochs of training at 1e-3

Browse files
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "pszemraj/long-t5-tglobal-base-16384-booksum-V7.9",
3
  "architectures": [
4
  "LongT5ForConditionalGeneration"
5
  ],
1
  {
2
+ "_name_or_path": "pszemraj/long-t5-tglobal-base-16384-booksum-V9",
3
  "architectures": [
4
  "LongT5ForConditionalGeneration"
5
  ],
long-t5-tglobal-base-16384-booksum-V9-ft1-booksum_training_metadata.json ADDED
@@ -0,0 +1 @@
 
1
+ {"output_dir": "/content/drive/MyDrive/Programming/hf-trainer/long-t5-tglobal-base-16384-booksum-V9-ft1-booksum", "overwrite_output_dir": true, "do_train": false, "do_eval": false, "do_predict": false, "evaluation_strategy": "no", "prediction_loss_only": false, "per_device_train_batch_size": 1, "per_device_eval_batch_size": 1, "per_gpu_train_batch_size": "None", "per_gpu_eval_batch_size": "None", "gradient_accumulation_steps": 64, "eval_accumulation_steps": "None", "eval_delay": 0, "learning_rate": 0.0006, "weight_decay": 0.01, "adam_beta1": 0.9, "adam_beta2": 0.999, "adam_epsilon": 1e-08, "max_grad_norm": 0.3, "num_train_epochs": 2, "max_steps": -1, "lr_scheduler_type": "cosine", "warmup_ratio": 0.01, "warmup_steps": 0, "log_level": -1, "log_level_replica": -1, "log_on_each_node": true, "logging_dir": "/content/drive/MyDrive/Programming/hf-trainer/long-t5-tglobal-base-16384-booksum-V9-ft1-booksum/logs", "logging_strategy": "steps", "logging_first_step": false, "logging_steps": 2, "logging_nan_inf_filter": true, "save_strategy": "steps", "save_steps": 25, "save_total_limit": 1, "save_on_each_node": false, "no_cuda": false, "seed": 42, "data_seed": "None", "jit_mode_eval": false, "use_ipex": false, "bf16": false, "fp16": true, "fp16_opt_level": "O1", "half_precision_backend": "cuda_amp", "bf16_full_eval": false, "fp16_full_eval": false, "tf32": "None", "local_rank": 0, "xpu_backend": "None", "tpu_num_cores": "None", "tpu_metrics_debug": false, "debug": "[]", "dataloader_drop_last": false, "eval_steps": "None", "dataloader_num_workers": 0, "past_index": -1, "run_name": "/content/drive/MyDrive/Programming/hf-trainer/long-t5-tglobal-base-16384-booksum-V9-ft1-booksum", "disable_tqdm": false, "remove_unused_columns": true, "label_names": "None", "load_best_model_at_end": false, "metric_for_best_model": "None", "greater_is_better": "None", "ignore_data_skip": false, "sharded_ddp": "[]", "fsdp": "[]", "fsdp_min_num_params": 0, "deepspeed": 
"/content/ds_config_zero2.json", "label_smoothing_factor": 0.0, "optim": "adamw_hf", "adafactor": false, "group_by_length": false, "length_column_name": "length", "report_to": "['tensorboard']", "ddp_find_unused_parameters": "None", "ddp_bucket_cap_mb": "None", "dataloader_pin_memory": true, "skip_memory_metrics": true, "use_legacy_prediction_loop": false, "push_to_hub": true, "resume_from_checkpoint": "None", "hub_model_id": "long-t5-tglobal-base-16384-booksum-V9-ft1-booksum", "hub_strategy": "end", "hub_token": "<HUB_TOKEN>", "hub_private_repo": true, "gradient_checkpointing": true, "include_inputs_for_metrics": false, "fp16_backend": "auto", "push_to_hub_model_id": "None", "push_to_hub_organization": "None", "push_to_hub_token": "<PUSH_TO_HUB_TOKEN>", "_n_gpu": 1, "mp_parameters": "", "auto_find_batch_size": false, "full_determinism": false, "torchdynamo": "None", "ray_scope": "last", "sortish_sampler": false, "predict_with_generate": false, "generation_max_length": "None", "generation_num_beams": "None", "train_batch_size": 1, "eval_batch_size": 1, "configs_src": "long-t5-tglobal-base-16384-booksum-V9-ft1-booksum"}
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b05f4b28e354b9cc1c758956764bfd54d590226a1dfbe604856ded1dbafd148e
3
  size 990388907
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a5e0e5a85417bcf902a0e8aedc108e1cbb5c20e265454aa69ca70ab8408c309
3
  size 990388907
tokenizer_config.json CHANGED
@@ -103,7 +103,7 @@
103
  ],
104
  "eos_token": "</s>",
105
  "extra_ids": 100,
106
- "name_or_path": "pszemraj/long-t5-tglobal-base-16384-booksum-V7.9",
107
  "pad_token": "<pad>",
108
  "special_tokens_map_file": null,
109
  "tokenizer_class": "T5Tokenizer",
103
  ],
104
  "eos_token": "</s>",
105
  "extra_ids": 100,
106
+ "name_or_path": "pszemraj/long-t5-tglobal-base-16384-booksum-V9",
107
  "pad_token": "<pad>",
108
  "special_tokens_map_file": null,
109
  "tokenizer_class": "T5Tokenizer",
trainer_state.json CHANGED
@@ -9,1007 +9,1007 @@
9
  "log_history": [
10
  {
11
  "epoch": 0.01,
12
- "learning_rate": 0.0005,
13
- "loss": 2.3487,
14
  "step": 2
15
  },
16
  {
17
  "epoch": 0.02,
18
- "learning_rate": 0.001,
19
- "loss": 2.3946,
20
  "step": 4
21
  },
22
  {
23
  "epoch": 0.04,
24
- "learning_rate": 0.0009999071352056674,
25
- "loss": 2.4059,
26
  "step": 6
27
  },
28
  {
29
  "epoch": 0.05,
30
- "learning_rate": 0.00099962857531815,
31
- "loss": 2.4061,
32
  "step": 8
33
  },
34
  {
35
  "epoch": 0.06,
36
- "learning_rate": 0.000999164423811074,
37
- "loss": 2.3801,
38
  "step": 10
39
  },
40
  {
41
  "epoch": 0.07,
42
- "learning_rate": 0.0009985148530977765,
43
- "loss": 2.4389,
44
  "step": 12
45
  },
46
  {
47
  "epoch": 0.08,
48
- "learning_rate": 0.0009976801044672607,
49
- "loss": 2.4007,
50
  "step": 14
51
  },
52
  {
53
  "epoch": 0.1,
54
- "learning_rate": 0.0009966604879945657,
55
- "loss": 2.4691,
56
  "step": 16
57
  },
58
  {
59
  "epoch": 0.11,
60
- "learning_rate": 0.0009954563824255878,
61
- "loss": 2.4015,
62
  "step": 18
63
  },
64
  {
65
  "epoch": 0.12,
66
- "learning_rate": 0.0009940682350363913,
67
- "loss": 2.4415,
68
  "step": 20
69
  },
70
  {
71
  "epoch": 0.13,
72
- "learning_rate": 0.000992496561467063,
73
- "loss": 2.477,
74
  "step": 22
75
  },
76
  {
77
  "epoch": 0.15,
78
- "learning_rate": 0.000990741945530174,
79
- "loss": 2.4429,
80
  "step": 24
81
  },
82
  {
83
  "epoch": 0.16,
84
- "learning_rate": 0.0009888050389939172,
85
- "loss": 2.4429,
86
  "step": 26
87
  },
88
  {
89
  "epoch": 0.17,
90
- "learning_rate": 0.0009866865613400006,
91
- "loss": 2.4597,
92
  "step": 28
93
  },
94
  {
95
  "epoch": 0.18,
96
- "learning_rate": 0.0009843872994963912,
97
- "loss": 2.4501,
98
  "step": 30
99
  },
100
  {
101
  "epoch": 0.19,
102
- "learning_rate": 0.0009819081075450014,
103
- "loss": 2.4307,
104
  "step": 32
105
  },
106
  {
107
  "epoch": 0.21,
108
- "learning_rate": 0.0009792499064044343,
109
- "loss": 2.4182,
110
  "step": 34
111
  },
112
  {
113
  "epoch": 0.22,
114
- "learning_rate": 0.0009764136834878986,
115
- "loss": 2.4354,
116
  "step": 36
117
  },
118
  {
119
  "epoch": 0.23,
120
- "learning_rate": 0.0009734004923364257,
121
- "loss": 2.4323,
122
  "step": 38
123
  },
124
  {
125
  "epoch": 0.24,
126
- "learning_rate": 0.0009702114522275216,
127
- "loss": 2.4592,
128
  "step": 40
129
  },
130
  {
131
  "epoch": 0.25,
132
- "learning_rate": 0.000966847747759402,
133
- "loss": 2.4242,
134
  "step": 42
135
  },
136
  {
137
  "epoch": 0.27,
138
- "learning_rate": 0.0009633106284109611,
139
- "loss": 2.4355,
140
  "step": 44
141
  },
142
  {
143
  "epoch": 0.28,
144
- "learning_rate": 0.0009596014080776422,
145
- "loss": 2.4379,
146
  "step": 46
147
  },
148
  {
149
  "epoch": 0.29,
150
- "learning_rate": 0.0009557214645833791,
151
- "loss": 2.3786,
152
  "step": 48
153
  },
154
  {
155
  "epoch": 0.3,
156
- "learning_rate": 0.0009516722391687902,
157
- "loss": 2.4303,
158
  "step": 50
159
  },
160
  {
161
  "epoch": 0.31,
162
- "learning_rate": 0.0009474552359558167,
163
- "loss": 2.3946,
164
  "step": 52
165
  },
166
  {
167
  "epoch": 0.33,
168
- "learning_rate": 0.000943072021389003,
169
- "loss": 2.4104,
170
  "step": 54
171
  },
172
  {
173
  "epoch": 0.34,
174
- "learning_rate": 0.0009385242236536259,
175
- "loss": 2.4266,
176
  "step": 56
177
  },
178
  {
179
  "epoch": 0.35,
180
- "learning_rate": 0.0009338135320708912,
181
- "loss": 2.5106,
182
  "step": 58
183
  },
184
  {
185
  "epoch": 0.36,
186
- "learning_rate": 0.0009289416964704185,
187
- "loss": 2.4225,
188
  "step": 60
189
  },
190
  {
191
  "epoch": 0.37,
192
- "learning_rate": 0.0009239105265402525,
193
- "loss": 2.4745,
194
  "step": 62
195
  },
196
  {
197
  "epoch": 0.39,
198
- "learning_rate": 0.0009187218911546363,
199
- "loss": 2.4572,
200
  "step": 64
201
  },
202
  {
203
  "epoch": 0.4,
204
- "learning_rate": 0.0009133777176798013,
205
- "loss": 2.4366,
206
  "step": 66
207
  },
208
  {
209
  "epoch": 0.41,
210
- "learning_rate": 0.0009078799912580304,
211
- "loss": 2.4021,
212
  "step": 68
213
  },
214
  {
215
  "epoch": 0.42,
216
- "learning_rate": 0.0009022307540702576,
217
- "loss": 2.4054,
218
  "step": 70
219
  },
220
  {
221
  "epoch": 0.44,
222
- "learning_rate": 0.0008964321045774807,
223
- "loss": 2.4628,
224
  "step": 72
225
  },
226
  {
227
  "epoch": 0.45,
228
- "learning_rate": 0.0008904861967412702,
229
- "loss": 2.5038,
230
  "step": 74
231
  },
232
  {
233
  "epoch": 0.46,
234
- "learning_rate": 0.0008843952392236594,
235
- "loss": 2.3801,
236
  "step": 76
237
  },
238
  {
239
  "epoch": 0.47,
240
- "learning_rate": 0.0008781614945667169,
241
- "loss": 2.4056,
242
  "step": 78
243
  },
244
  {
245
  "epoch": 0.48,
246
- "learning_rate": 0.0008717872783521047,
247
- "loss": 2.3334,
248
  "step": 80
249
  },
250
  {
251
  "epoch": 0.5,
252
- "learning_rate": 0.0008652749583409339,
253
- "loss": 2.3913,
254
  "step": 82
255
  },
256
  {
257
  "epoch": 0.51,
258
- "learning_rate": 0.0008586269535942384,
259
- "loss": 2.3784,
260
  "step": 84
261
  },
262
  {
263
  "epoch": 0.52,
264
- "learning_rate": 0.0008518457335743926,
265
- "loss": 2.4436,
266
  "step": 86
267
  },
268
  {
269
  "epoch": 0.53,
270
- "learning_rate": 0.0008449338172278058,
271
- "loss": 2.3735,
272
  "step": 88
273
  },
274
  {
275
  "epoch": 0.54,
276
- "learning_rate": 0.0008378937720492384,
277
- "loss": 2.374,
278
  "step": 90
279
  },
280
  {
281
  "epoch": 0.56,
282
- "learning_rate": 0.0008307282131280805,
283
- "loss": 2.4064,
284
  "step": 92
285
  },
286
  {
287
  "epoch": 0.57,
288
- "learning_rate": 0.000823439802176954,
289
- "loss": 2.4124,
290
  "step": 94
291
  },
292
  {
293
  "epoch": 0.58,
294
- "learning_rate": 0.0008160312465429952,
295
- "loss": 2.4181,
296
  "step": 96
297
  },
298
  {
299
  "epoch": 0.59,
300
- "learning_rate": 0.0008085052982021848,
301
- "loss": 2.4253,
302
  "step": 98
303
  },
304
  {
305
  "epoch": 0.6,
306
- "learning_rate": 0.0008008647527371022,
307
- "loss": 2.4678,
308
  "step": 100
309
  },
310
  {
311
  "epoch": 0.62,
312
- "learning_rate": 0.0007931124482984802,
313
- "loss": 2.4738,
314
  "step": 102
315
  },
316
  {
317
  "epoch": 0.63,
318
- "learning_rate": 0.0007852512645509479,
319
- "loss": 2.3738,
320
  "step": 104
321
  },
322
  {
323
  "epoch": 0.64,
324
- "learning_rate": 0.0007772841216033533,
325
- "loss": 2.4081,
326
  "step": 106
327
  },
328
  {
329
  "epoch": 0.65,
330
- "learning_rate": 0.0007692139789240611,
331
- "loss": 2.3738,
332
  "step": 108
333
  },
334
  {
335
  "epoch": 0.66,
336
- "learning_rate": 0.0007610438342416319,
337
- "loss": 2.3701,
338
  "step": 110
339
  },
340
  {
341
  "epoch": 0.68,
342
- "learning_rate": 0.0007527767224312882,
343
- "loss": 2.4355,
344
  "step": 112
345
  },
346
  {
347
  "epoch": 0.69,
348
- "learning_rate": 0.000744415714387582,
349
- "loss": 2.4036,
350
  "step": 114
351
  },
352
  {
353
  "epoch": 0.7,
354
- "learning_rate": 0.0007359639158836828,
355
- "loss": 2.3746,
356
  "step": 116
357
  },
358
  {
359
  "epoch": 0.71,
360
- "learning_rate": 0.0007274244664177097,
361
- "loss": 2.4855,
362
  "step": 118
363
  },
364
  {
365
  "epoch": 0.73,
366
- "learning_rate": 0.0007188005380465365,
367
- "loss": 2.379,
368
  "step": 120
369
  },
370
  {
371
  "epoch": 0.74,
372
- "learning_rate": 0.000710095334207501,
373
- "loss": 2.4178,
374
  "step": 122
375
  },
376
  {
377
  "epoch": 0.75,
378
- "learning_rate": 0.0007013120885284599,
379
- "loss": 2.4561,
380
  "step": 124
381
  },
382
  {
383
  "epoch": 0.76,
384
- "learning_rate": 0.0006924540636266272,
385
- "loss": 2.4024,
386
  "step": 126
387
  },
388
  {
389
  "epoch": 0.77,
390
- "learning_rate": 0.000683524549896646,
391
- "loss": 2.4172,
392
  "step": 128
393
  },
394
  {
395
  "epoch": 0.79,
396
- "learning_rate": 0.0006745268642883404,
397
- "loss": 2.3858,
398
  "step": 130
399
  },
400
  {
401
  "epoch": 0.8,
402
- "learning_rate": 0.0006654643490746042,
403
- "loss": 2.3547,
404
  "step": 132
405
  },
406
  {
407
  "epoch": 0.81,
408
- "learning_rate": 0.0006563403706098833,
409
- "loss": 2.4372,
410
  "step": 134
411
  },
412
  {
413
  "epoch": 0.82,
414
- "learning_rate": 0.0006471583180797121,
415
- "loss": 2.3785,
416
  "step": 136
417
  },
418
  {
419
  "epoch": 0.83,
420
- "learning_rate": 0.0006379216022417695,
421
- "loss": 2.3815,
422
  "step": 138
423
  },
424
  {
425
  "epoch": 0.85,
426
- "learning_rate": 0.0006286336541589224,
427
- "loss": 2.4209,
428
  "step": 140
429
  },
430
  {
431
  "epoch": 0.86,
432
- "learning_rate": 0.0006192979239247243,
433
- "loss": 2.3962,
434
  "step": 142
435
  },
436
  {
437
  "epoch": 0.87,
438
- "learning_rate": 0.0006099178793818478,
439
- "loss": 2.3626,
440
  "step": 144
441
  },
442
  {
443
  "epoch": 0.88,
444
- "learning_rate": 0.0006004970048339225,
445
- "loss": 2.3991,
446
  "step": 146
447
  },
448
  {
449
  "epoch": 0.89,
450
- "learning_rate": 0.0005910387997512573,
451
- "loss": 2.4396,
452
  "step": 148
453
  },
454
  {
455
  "epoch": 0.91,
456
- "learning_rate": 0.0005815467774709313,
457
- "loss": 2.3816,
458
  "step": 150
459
  },
460
  {
461
  "epoch": 0.92,
462
- "learning_rate": 0.0005720244638917323,
463
- "loss": 2.3866,
464
  "step": 152
465
  },
466
  {
467
  "epoch": 0.93,
468
- "learning_rate": 0.0005624753961644281,
469
- "loss": 2.4035,
470
  "step": 154
471
  },
472
  {
473
  "epoch": 0.94,
474
- "learning_rate": 0.0005529031213778615,
475
- "loss": 2.4063,
476
  "step": 156
477
  },
478
  {
479
  "epoch": 0.95,
480
- "learning_rate": 0.0005433111952413496,
481
- "loss": 2.3944,
482
  "step": 158
483
  },
484
  {
485
  "epoch": 0.97,
486
- "learning_rate": 0.0005337031807638841,
487
- "loss": 2.4192,
488
  "step": 160
489
  },
490
  {
491
  "epoch": 0.98,
492
- "learning_rate": 0.0005240826469306187,
493
- "loss": 2.3603,
494
  "step": 162
495
  },
496
  {
497
  "epoch": 0.99,
498
- "learning_rate": 0.0005144531673771364,
499
- "loss": 2.4041,
500
  "step": 164
501
  },
502
  {
503
  "epoch": 1.01,
504
- "learning_rate": 0.0005048183190619903,
505
- "loss": 2.8813,
506
  "step": 166
507
  },
508
  {
509
  "epoch": 1.02,
510
- "learning_rate": 0.0004951816809380097,
511
- "loss": 2.2786,
512
  "step": 168
513
  },
514
  {
515
  "epoch": 1.03,
516
- "learning_rate": 0.0004855468326228638,
517
- "loss": 2.2886,
518
  "step": 170
519
  },
520
  {
521
  "epoch": 1.04,
522
- "learning_rate": 0.00047591735306938137,
523
- "loss": 2.1822,
524
  "step": 172
525
  },
526
  {
527
  "epoch": 1.05,
528
- "learning_rate": 0.00046629681923611606,
529
- "loss": 2.2589,
530
  "step": 174
531
  },
532
  {
533
  "epoch": 1.07,
534
- "learning_rate": 0.0004566888047586507,
535
- "loss": 2.2625,
536
  "step": 176
537
  },
538
  {
539
  "epoch": 1.08,
540
- "learning_rate": 0.00044709687862213866,
541
- "loss": 2.2715,
542
  "step": 178
543
  },
544
  {
545
  "epoch": 1.09,
546
- "learning_rate": 0.000437524603835572,
547
- "loss": 2.1988,
548
  "step": 180
549
  },
550
  {
551
  "epoch": 1.1,
552
- "learning_rate": 0.000427975536108268,
553
- "loss": 2.3257,
554
  "step": 182
555
  },
556
  {
557
  "epoch": 1.11,
558
- "learning_rate": 0.00041845322252906863,
559
- "loss": 2.3026,
560
  "step": 184
561
  },
562
  {
563
  "epoch": 1.13,
564
- "learning_rate": 0.00040896120024874283,
565
- "loss": 2.2306,
566
  "step": 186
567
  },
568
  {
569
  "epoch": 1.14,
570
- "learning_rate": 0.0003995029951660776,
571
- "loss": 2.2269,
572
  "step": 188
573
  },
574
  {
575
  "epoch": 1.15,
576
- "learning_rate": 0.00039008212061815206,
577
- "loss": 2.3079,
578
  "step": 190
579
  },
580
  {
581
  "epoch": 1.16,
582
- "learning_rate": 0.00038070207607527587,
583
- "loss": 2.218,
584
  "step": 192
585
  },
586
  {
587
  "epoch": 1.18,
588
- "learning_rate": 0.00037136634584107787,
589
- "loss": 2.2667,
590
  "step": 194
591
  },
592
  {
593
  "epoch": 1.19,
594
- "learning_rate": 0.0003620783977582305,
595
- "loss": 2.2754,
596
  "step": 196
597
  },
598
  {
599
  "epoch": 1.2,
600
- "learning_rate": 0.0003528416819202881,
601
- "loss": 2.2835,
602
  "step": 198
603
  },
604
  {
605
  "epoch": 1.21,
606
- "learning_rate": 0.00034365962939011697,
607
- "loss": 2.2843,
608
  "step": 200
609
  },
610
  {
611
  "epoch": 1.22,
612
- "learning_rate": 0.00033453565092539584,
613
- "loss": 2.2387,
614
  "step": 202
615
  },
616
  {
617
  "epoch": 1.24,
618
- "learning_rate": 0.0003254731357116597,
619
- "loss": 2.254,
620
  "step": 204
621
  },
622
  {
623
  "epoch": 1.25,
624
- "learning_rate": 0.000316475450103354,
625
- "loss": 2.2686,
626
  "step": 206
627
  },
628
  {
629
  "epoch": 1.26,
630
- "learning_rate": 0.00030754593637337277,
631
- "loss": 2.2422,
632
  "step": 208
633
  },
634
  {
635
  "epoch": 1.27,
636
- "learning_rate": 0.0002986879114715403,
637
- "loss": 2.3003,
638
  "step": 210
639
  },
640
  {
641
  "epoch": 1.28,
642
- "learning_rate": 0.0002899046657924992,
643
- "loss": 2.2619,
644
  "step": 212
645
  },
646
  {
647
  "epoch": 1.3,
648
- "learning_rate": 0.00028119946195346375,
649
- "loss": 2.3022,
650
  "step": 214
651
  },
652
  {
653
  "epoch": 1.31,
654
- "learning_rate": 0.00027257553358229033,
655
- "loss": 2.2523,
656
  "step": 216
657
  },
658
  {
659
  "epoch": 1.32,
660
- "learning_rate": 0.0002640360841163174,
661
- "loss": 2.3098,
662
  "step": 218
663
  },
664
  {
665
  "epoch": 1.33,
666
- "learning_rate": 0.0002555842856124182,
667
- "loss": 2.235,
668
  "step": 220
669
  },
670
  {
671
  "epoch": 1.34,
672
- "learning_rate": 0.00024722327756871186,
673
- "loss": 2.2448,
674
  "step": 222
675
  },
676
  {
677
  "epoch": 1.36,
678
- "learning_rate": 0.0002389561657583681,
679
- "loss": 2.2411,
680
  "step": 224
681
  },
682
  {
683
  "epoch": 1.37,
684
- "learning_rate": 0.00023078602107593898,
685
- "loss": 2.2485,
686
  "step": 226
687
  },
688
  {
689
  "epoch": 1.38,
690
- "learning_rate": 0.0002227158783966467,
691
- "loss": 2.2261,
692
  "step": 228
693
  },
694
  {
695
  "epoch": 1.39,
696
- "learning_rate": 0.00021474873544905204,
697
- "loss": 2.2427,
698
  "step": 230
699
  },
700
  {
701
  "epoch": 1.4,
702
- "learning_rate": 0.00020688755170151997,
703
- "loss": 2.2961,
704
  "step": 232
705
  },
706
  {
707
  "epoch": 1.42,
708
- "learning_rate": 0.00019913524726289784,
709
- "loss": 2.2272,
710
  "step": 234
711
  },
712
  {
713
  "epoch": 1.43,
714
- "learning_rate": 0.00019149470179781532,
715
- "loss": 2.2368,
716
  "step": 236
717
  },
718
  {
719
  "epoch": 1.44,
720
- "learning_rate": 0.00018396875345700497,
721
- "loss": 2.2846,
722
  "step": 238
723
  },
724
  {
725
  "epoch": 1.45,
726
- "learning_rate": 0.000176560197823046,
727
- "loss": 2.1709,
728
  "step": 240
729
  },
730
  {
731
  "epoch": 1.47,
732
- "learning_rate": 0.0001692717868719195,
733
- "loss": 2.2659,
734
  "step": 242
735
  },
736
  {
737
  "epoch": 1.48,
738
- "learning_rate": 0.0001621062279507617,
739
- "loss": 2.2655,
740
  "step": 244
741
  },
742
  {
743
  "epoch": 1.49,
744
- "learning_rate": 0.0001550661827721941,
745
- "loss": 2.2284,
746
  "step": 246
747
  },
748
  {
749
  "epoch": 1.5,
750
- "learning_rate": 0.00014815426642560752,
751
- "loss": 2.2444,
752
  "step": 248
753
  },
754
  {
755
  "epoch": 1.51,
756
- "learning_rate": 0.0001413730464057616,
757
- "loss": 2.3102,
758
  "step": 250
759
  },
760
  {
761
  "epoch": 1.53,
762
- "learning_rate": 0.00013472504165906613,
763
- "loss": 2.2287,
764
  "step": 252
765
  },
766
  {
767
  "epoch": 1.54,
768
- "learning_rate": 0.00012821272164789544,
769
- "loss": 2.2713,
770
  "step": 254
771
  },
772
  {
773
  "epoch": 1.55,
774
- "learning_rate": 0.00012183850543328313,
775
- "loss": 2.2127,
776
  "step": 256
777
  },
778
  {
779
  "epoch": 1.56,
780
- "learning_rate": 0.00011560476077634069,
781
- "loss": 2.1682,
782
  "step": 258
783
  },
784
  {
785
  "epoch": 1.57,
786
- "learning_rate": 0.00010951380325872979,
787
- "loss": 2.2393,
788
  "step": 260
789
  },
790
  {
791
  "epoch": 1.59,
792
- "learning_rate": 0.00010356789542251938,
793
- "loss": 2.2259,
794
  "step": 262
795
  },
796
  {
797
  "epoch": 1.6,
798
- "learning_rate": 9.776924592974257e-05,
799
- "loss": 2.2157,
800
  "step": 264
801
  },
802
  {
803
  "epoch": 1.61,
804
- "learning_rate": 9.212000874196952e-05,
805
- "loss": 2.2393,
806
  "step": 266
807
  },
808
  {
809
  "epoch": 1.62,
810
- "learning_rate": 8.662228232019875e-05,
811
- "loss": 2.2613,
812
  "step": 268
813
  },
814
  {
815
  "epoch": 1.63,
816
- "learning_rate": 8.127810884536401e-05,
817
- "loss": 2.1981,
818
  "step": 270
819
  },
820
  {
821
  "epoch": 1.65,
822
- "learning_rate": 7.60894734597476e-05,
823
- "loss": 2.2457,
824
  "step": 272
825
  },
826
  {
827
  "epoch": 1.66,
828
- "learning_rate": 7.105830352958143e-05,
829
- "loss": 2.2571,
830
  "step": 274
831
  },
832
  {
833
  "epoch": 1.67,
834
- "learning_rate": 6.618646792910893e-05,
835
- "loss": 2.1771,
836
  "step": 276
837
  },
838
  {
839
  "epoch": 1.68,
840
- "learning_rate": 6.147577634637414e-05,
841
- "loss": 2.2243,
842
  "step": 278
843
  },
844
  {
845
  "epoch": 1.69,
846
- "learning_rate": 5.692797861099719e-05,
847
- "loss": 2.2427,
848
  "step": 280
849
  },
850
  {
851
  "epoch": 1.71,
852
- "learning_rate": 5.25447640441834e-05,
853
- "loss": 2.2266,
854
  "step": 282
855
  },
856
  {
857
  "epoch": 1.72,
858
- "learning_rate": 4.832776083120982e-05,
859
- "loss": 2.3057,
860
  "step": 284
861
  },
862
  {
863
  "epoch": 1.73,
864
- "learning_rate": 4.4278535416620916e-05,
865
- "loss": 2.2225,
866
  "step": 286
867
  },
868
  {
869
  "epoch": 1.74,
870
- "learning_rate": 4.039859192235778e-05,
871
- "loss": 2.2665,
872
  "step": 288
873
  },
874
  {
875
  "epoch": 1.76,
876
- "learning_rate": 3.668937158903901e-05,
877
- "loss": 2.2807,
878
  "step": 290
879
  },
880
  {
881
  "epoch": 1.77,
882
- "learning_rate": 3.315225224059809e-05,
883
- "loss": 2.2165,
884
  "step": 292
885
  },
886
  {
887
  "epoch": 1.78,
888
- "learning_rate": 2.9788547772478415e-05,
889
- "loss": 2.2651,
890
  "step": 294
891
  },
892
  {
893
  "epoch": 1.79,
894
- "learning_rate": 2.6599507663574384e-05,
895
- "loss": 2.2437,
896
  "step": 296
897
  },
898
  {
899
  "epoch": 1.8,
900
- "learning_rate": 2.3586316512101414e-05,
901
- "loss": 2.3066,
902
  "step": 298
903
  },
904
  {
905
  "epoch": 1.82,
906
- "learning_rate": 2.0750093595565732e-05,
907
- "loss": 2.1727,
908
  "step": 300
909
  },
910
  {
911
  "epoch": 1.83,
912
- "learning_rate": 1.8091892454998595e-05,
913
- "loss": 2.2409,
914
  "step": 302
915
  },
916
  {
917
  "epoch": 1.84,
918
- "learning_rate": 1.561270050360897e-05,
919
- "loss": 2.2908,
920
  "step": 304
921
  },
922
  {
923
  "epoch": 1.85,
924
- "learning_rate": 1.33134386599994e-05,
925
- "loss": 2.2925,
926
  "step": 306
927
  },
928
  {
929
  "epoch": 1.86,
930
- "learning_rate": 1.1194961006082971e-05,
931
- "loss": 2.2449,
932
  "step": 308
933
  },
934
  {
935
  "epoch": 1.88,
936
- "learning_rate": 9.258054469825972e-06,
937
- "loss": 2.235,
938
  "step": 310
939
  },
940
  {
941
  "epoch": 1.89,
942
- "learning_rate": 7.503438532937168e-06,
943
- "loss": 2.2216,
944
  "step": 312
945
  },
946
  {
947
  "epoch": 1.9,
948
- "learning_rate": 5.931764963608866e-06,
949
- "loss": 2.2884,
950
  "step": 314
951
  },
952
  {
953
  "epoch": 1.91,
954
- "learning_rate": 4.5436175744121845e-06,
955
- "loss": 2.2124,
956
  "step": 316
957
  },
958
  {
959
  "epoch": 1.92,
960
- "learning_rate": 3.3395120054343087e-06,
961
- "loss": 2.2418,
962
  "step": 318
963
  },
964
  {
965
  "epoch": 1.94,
966
- "learning_rate": 2.319895532739369e-06,
967
- "loss": 2.2855,
968
  "step": 320
969
  },
970
  {
971
  "epoch": 1.95,
972
- "learning_rate": 1.4851469022234e-06,
973
- "loss": 2.2974,
974
  "step": 322
975
  },
976
  {
977
  "epoch": 1.96,
978
- "learning_rate": 8.35576188926046e-07,
979
- "loss": 2.2552,
980
  "step": 324
981
  },
982
  {
983
  "epoch": 1.97,
984
- "learning_rate": 3.71424681850141e-07,
985
- "loss": 2.2209,
986
  "step": 326
987
  },
988
  {
989
  "epoch": 1.99,
990
- "learning_rate": 9.286479433257e-08,
991
- "loss": 2.1935,
992
  "step": 328
993
  },
994
  {
995
  "epoch": 2.0,
996
  "learning_rate": 0.0,
997
- "loss": 2.2702,
998
  "step": 330
999
  },
1000
  {
1001
  "epoch": 2.0,
1002
  "step": 330,
1003
- "total_flos": 4.634629374287544e+17,
1004
- "train_loss": 2.336302039117524,
1005
- "train_runtime": 79791.9217,
1006
  "train_samples_per_second": 0.265,
1007
  "train_steps_per_second": 0.004
1008
  }
1009
  ],
1010
  "max_steps": 330,
1011
  "num_train_epochs": 2,
1012
- "total_flos": 4.634629374287544e+17,
1013
  "trial_name": null,
1014
  "trial_params": null
1015
  }
9
  "log_history": [
10
  {
11
  "epoch": 0.01,
12
+ "learning_rate": 0.0003,
13
+ "loss": 2.0736,
14
  "step": 2
15
  },
16
  {
17
  "epoch": 0.02,
18
+ "learning_rate": 0.0006,
19
+ "loss": 2.0741,
20
  "step": 4
21
  },
22
  {
23
  "epoch": 0.04,
24
+ "learning_rate": 0.0005999442811234004,
25
+ "loss": 2.098,
26
  "step": 6
27
  },
28
  {
29
  "epoch": 0.05,
30
+ "learning_rate": 0.0005997771451908898,
31
+ "loss": 2.0627,
32
  "step": 8
33
  },
34
  {
35
  "epoch": 0.06,
36
+ "learning_rate": 0.0005994986542866444,
37
+ "loss": 2.0418,
38
  "step": 10
39
  },
40
  {
41
  "epoch": 0.07,
42
+ "learning_rate": 0.0005991089118586658,
43
+ "loss": 2.0717,
44
  "step": 12
45
  },
46
  {
47
  "epoch": 0.08,
48
+ "learning_rate": 0.0005986080626803564,
49
+ "loss": 2.0504,
50
  "step": 14
51
  },
52
  {
53
  "epoch": 0.1,
54
+ "learning_rate": 0.0005979962927967394,
55
+ "loss": 2.0314,
56
  "step": 16
57
  },
58
  {
59
  "epoch": 0.11,
60
+ "learning_rate": 0.0005972738294553527,
61
+ "loss": 2.0568,
62
  "step": 18
63
  },
64
  {
65
  "epoch": 0.12,
66
+ "learning_rate": 0.0005964409410218346,
67
+ "loss": 2.073,
68
  "step": 20
69
  },
70
  {
71
  "epoch": 0.13,
72
+ "learning_rate": 0.0005954979368802377,
73
+ "loss": 2.0737,
74
  "step": 22
75
  },
76
  {
77
  "epoch": 0.15,
78
+ "learning_rate": 0.0005944451673181043,
79
+ "loss": 2.0936,
80
  "step": 24
81
  },
82
  {
83
  "epoch": 0.16,
84
+ "learning_rate": 0.0005932830233963502,
85
+ "loss": 2.0846,
86
  "step": 26
87
  },
88
  {
89
  "epoch": 0.17,
90
+ "learning_rate": 0.0005920119368040003,
91
+ "loss": 2.1374,
92
  "step": 28
93
  },
94
  {
95
  "epoch": 0.18,
96
+ "learning_rate": 0.0005906323796978346,
97
+ "loss": 2.0828,
98
  "step": 30
99
  },
100
  {
101
  "epoch": 0.19,
102
+ "learning_rate": 0.0005891448645270008,
103
+ "loss": 2.0802,
104
  "step": 32
105
  },
106
  {
107
  "epoch": 0.21,
108
+ "learning_rate": 0.0005875499438426604,
109
+ "loss": 2.147,
110
  "step": 34
111
  },
112
  {
113
  "epoch": 0.22,
114
+ "learning_rate": 0.0005858482100927391,
115
+ "loss": 2.0904,
116
  "step": 36
117
  },
118
  {
119
  "epoch": 0.23,
120
+ "learning_rate": 0.0005840402954018554,
121
+ "loss": 2.0823,
122
  "step": 38
123
  },
124
  {
125
  "epoch": 0.24,
126
+ "learning_rate": 0.0005821268713365129,
127
+ "loss": 2.0792,
128
  "step": 40
129
  },
130
  {
131
  "epoch": 0.25,
132
+ "learning_rate": 0.0005801086486556411,
133
+ "loss": 2.0716,
134
  "step": 42
135
  },
136
  {
137
  "epoch": 0.27,
138
+ "learning_rate": 0.0005779863770465765,
139
+ "loss": 2.2061,
140
  "step": 44
141
  },
142
  {
143
  "epoch": 0.28,
144
+ "learning_rate": 0.0005757608448465853,
145
+ "loss": 2.1735,
146
  "step": 46
147
  },
148
  {
149
  "epoch": 0.29,
150
+ "learning_rate": 0.0005734328787500274,
151
+ "loss": 2.098,
152
  "step": 48
153
  },
154
  {
155
  "epoch": 0.3,
156
+ "learning_rate": 0.000571003343501274,
157
+ "loss": 2.0977,
158
  "step": 50
159
  },
160
  {
161
  "epoch": 0.31,
162
+ "learning_rate": 0.0005684731415734899,
163
+ "loss": 2.1076,
164
  "step": 52
165
  },
166
  {
167
  "epoch": 0.33,
168
+ "learning_rate": 0.0005658432128334017,
169
+ "loss": 2.0476,
170
  "step": 54
171
  },
172
  {
173
  "epoch": 0.34,
174
+ "learning_rate": 0.0005631145341921755,
175
+ "loss": 2.0728,
176
  "step": 56
177
  },
178
  {
179
  "epoch": 0.35,
180
+ "learning_rate": 0.0005602881192425346,
181
+ "loss": 2.0477,
182
  "step": 58
183
  },
184
  {
185
  "epoch": 0.36,
186
+ "learning_rate": 0.000557365017882251,
187
+ "loss": 2.1102,
188
  "step": 60
189
  },
190
  {
191
  "epoch": 0.37,
192
+ "learning_rate": 0.0005543463159241515,
193
+ "loss": 2.0754,
194
  "step": 62
195
  },
196
  {
197
  "epoch": 0.39,
198
+ "learning_rate": 0.0005512331346927817,
199
+ "loss": 2.1029,
200
  "step": 64
201
  },
202
  {
203
  "epoch": 0.4,
204
+ "learning_rate": 0.0005480266306078807,
205
+ "loss": 1.9922,
206
  "step": 66
207
  },
208
  {
209
  "epoch": 0.41,
210
+ "learning_rate": 0.0005447279947548182,
211
+ "loss": 2.1599,
212
  "step": 68
213
  },
214
  {
215
  "epoch": 0.42,
216
+ "learning_rate": 0.0005413384524421545,
217
+ "loss": 2.1388,
218
  "step": 70
219
  },
220
  {
221
  "epoch": 0.44,
222
+ "learning_rate": 0.0005378592627464883,
223
+ "loss": 2.131,
224
  "step": 72
225
  },
226
  {
227
  "epoch": 0.45,
228
+ "learning_rate": 0.0005342917180447621,
229
+ "loss": 2.0658,
230
  "step": 74
231
  },
232
  {
233
  "epoch": 0.46,
234
+ "learning_rate": 0.0005306371435341955,
235
+ "loss": 2.1437,
236
  "step": 76
237
  },
238
  {
239
  "epoch": 0.47,
240
+ "learning_rate": 0.0005268968967400301,
241
+ "loss": 2.1145,
242
  "step": 78
243
  },
244
  {
245
  "epoch": 0.48,
246
+ "learning_rate": 0.0005230723670112627,
247
+ "loss": 2.1263,
248
  "step": 80
249
  },
250
  {
251
  "epoch": 0.5,
252
+ "learning_rate": 0.0005191649750045603,
253
+ "loss": 2.0513,
254
  "step": 82
255
  },
256
  {
257
  "epoch": 0.51,
258
+ "learning_rate": 0.000515176172156543,
259
+ "loss": 2.0723,
260
  "step": 84
261
  },
262
  {
263
  "epoch": 0.52,
264
+ "learning_rate": 0.0005111074401446355,
265
+ "loss": 2.0744,
266
  "step": 86
267
  },
268
  {
269
  "epoch": 0.53,
270
+ "learning_rate": 0.0005069602903366834,
271
+ "loss": 2.131,
272
  "step": 88
273
  },
274
  {
275
  "epoch": 0.54,
276
+ "learning_rate": 0.0005027362632295429,
277
+ "loss": 2.094,
278
  "step": 90
279
  },
280
  {
281
  "epoch": 0.56,
282
+ "learning_rate": 0.0004984369278768482,
283
+ "loss": 2.0633,
284
  "step": 92
285
  },
286
  {
287
  "epoch": 0.57,
288
+ "learning_rate": 0.0004940638813061723,
289
+ "loss": 2.1182,
290
  "step": 94
291
  },
292
  {
293
  "epoch": 0.58,
294
+ "learning_rate": 0.0004896187479257971,
295
+ "loss": 2.1664,
296
  "step": 96
297
  },
298
  {
299
  "epoch": 0.59,
300
+ "learning_rate": 0.0004851031789213108,
301
+ "loss": 2.1365,
302
  "step": 98
303
  },
304
  {
305
  "epoch": 0.6,
306
+ "learning_rate": 0.0004805188516422613,
307
+ "loss": 2.1204,
308
  "step": 100
309
  },
310
  {
311
  "epoch": 0.62,
312
+ "learning_rate": 0.00047586746897908803,
313
+ "loss": 2.0677,
314
  "step": 102
315
  },
316
  {
317
  "epoch": 0.63,
318
+ "learning_rate": 0.00047115075873056876,
319
+ "loss": 2.1588,
320
  "step": 104
321
  },
322
  {
323
  "epoch": 0.64,
324
+ "learning_rate": 0.0004663704729620119,
325
+ "loss": 2.0746,
326
  "step": 106
327
  },
328
  {
329
  "epoch": 0.65,
330
+ "learning_rate": 0.0004615283873544366,
331
+ "loss": 2.126,
332
  "step": 108
333
  },
334
  {
335
  "epoch": 0.66,
336
+ "learning_rate": 0.0004566263005449791,
337
+ "loss": 2.0786,
338
  "step": 110
339
  },
340
  {
341
  "epoch": 0.68,
342
+ "learning_rate": 0.0004516660334587729,
343
+ "loss": 2.1019,
344
  "step": 112
345
  },
346
  {
347
  "epoch": 0.69,
348
+ "learning_rate": 0.00044664942863254913,
349
+ "loss": 2.0605,
350
  "step": 114
351
  },
352
  {
353
  "epoch": 0.7,
354
+ "learning_rate": 0.0004415783495302096,
355
+ "loss": 2.0879,
356
  "step": 116
357
  },
358
  {
359
  "epoch": 0.71,
360
+ "learning_rate": 0.0004364546798506258,
361
+ "loss": 2.1516,
362
  "step": 118
363
  },
364
  {
365
  "epoch": 0.73,
366
+ "learning_rate": 0.0004312803228279218,
367
+ "loss": 2.1287,
368
  "step": 120
369
  },
370
  {
371
  "epoch": 0.74,
372
+ "learning_rate": 0.0004260572005245005,
373
+ "loss": 2.1054,
374
  "step": 122
375
  },
376
  {
377
  "epoch": 0.75,
378
+ "learning_rate": 0.00042078725311707585,
379
+ "loss": 2.0905,
380
  "step": 124
381
  },
382
  {
383
  "epoch": 0.76,
384
+ "learning_rate": 0.0004154724381759763,
385
+ "loss": 2.0875,
386
  "step": 126
387
  },
388
  {
389
  "epoch": 0.77,
390
+ "learning_rate": 0.0004101147299379876,
391
+ "loss": 2.0948,
392
  "step": 128
393
  },
394
  {
395
  "epoch": 0.79,
396
+ "learning_rate": 0.00040471611857300423,
397
+ "loss": 2.1521,
398
  "step": 130
399
  },
400
  {
401
  "epoch": 0.8,
402
+ "learning_rate": 0.0003992786094447625,
403
+ "loss": 2.0645,
404
  "step": 132
405
  },
406
  {
407
  "epoch": 0.81,
408
+ "learning_rate": 0.0003938042223659299,
409
+ "loss": 2.0953,
410
  "step": 134
411
  },
412
  {
413
  "epoch": 0.82,
414
+ "learning_rate": 0.0003882949908478272,
415
+ "loss": 2.1046,
416
  "step": 136
417
  },
418
  {
419
  "epoch": 0.83,
420
+ "learning_rate": 0.0003827529613450617,
421
+ "loss": 2.1814,
422
  "step": 138
423
  },
424
  {
425
  "epoch": 0.85,
426
+ "learning_rate": 0.0003771801924953534,
427
+ "loss": 2.1147,
428
  "step": 140
429
  },
430
  {
431
  "epoch": 0.86,
432
+ "learning_rate": 0.0003715787543548345,
433
+ "loss": 2.1226,
434
  "step": 142
435
  },
436
  {
437
  "epoch": 0.87,
438
+ "learning_rate": 0.0003659507276291087,
439
+ "loss": 2.1093,
440
  "step": 144
441
  },
442
  {
443
  "epoch": 0.88,
444
+ "learning_rate": 0.00036029820290035347,
445
+ "loss": 2.1038,
446
  "step": 146
447
  },
448
  {
449
  "epoch": 0.89,
450
+ "learning_rate": 0.0003546232798507543,
451
+ "loss": 2.0581,
452
  "step": 148
453
  },
454
  {
455
  "epoch": 0.91,
456
+ "learning_rate": 0.00034892806648255875,
457
+ "loss": 2.0076,
458
  "step": 150
459
  },
460
  {
461
  "epoch": 0.92,
462
+ "learning_rate": 0.0003432146783350393,
463
+ "loss": 2.1017,
464
  "step": 152
465
  },
466
  {
467
  "epoch": 0.93,
468
+ "learning_rate": 0.0003374852376986568,
469
+ "loss": 2.1353,
470
  "step": 154
471
  },
472
  {
473
  "epoch": 0.94,
474
+ "learning_rate": 0.00033174187282671686,
475
+ "loss": 2.0836,
476
  "step": 156
477
  },
478
  {
479
  "epoch": 0.95,
480
+ "learning_rate": 0.0003259867171448097,
481
+ "loss": 2.098,
482
  "step": 158
483
  },
484
  {
485
  "epoch": 0.97,
486
+ "learning_rate": 0.00032022190845833035,
487
+ "loss": 2.1308,
488
  "step": 160
489
  },
490
  {
491
  "epoch": 0.98,
492
+ "learning_rate": 0.0003144495881583712,
493
+ "loss": 2.1204,
494
  "step": 162
495
  },
496
  {
497
  "epoch": 0.99,
498
+ "learning_rate": 0.00030867190042628177,
499
+ "loss": 2.0564,
500
  "step": 164
501
  },
502
  {
503
  "epoch": 1.01,
504
+ "learning_rate": 0.0003028909914371942,
505
+ "loss": 2.5573,
506
  "step": 166
507
  },
508
  {
509
  "epoch": 1.02,
510
+ "learning_rate": 0.0002971090085628058,
511
+ "loss": 1.9576,
512
  "step": 168
513
  },
514
  {
515
  "epoch": 1.03,
516
+ "learning_rate": 0.00029132809957371823,
517
+ "loss": 1.9274,
518
  "step": 170
519
  },
520
  {
521
  "epoch": 1.04,
522
+ "learning_rate": 0.0002855504118416288,
523
+ "loss": 1.9637,
524
  "step": 172
525
  },
526
  {
527
  "epoch": 1.05,
528
+ "learning_rate": 0.0002797780915416696,
529
+ "loss": 1.9729,
530
  "step": 174
531
  },
532
  {
533
  "epoch": 1.07,
534
+ "learning_rate": 0.0002740132828551904,
535
+ "loss": 2.0254,
536
  "step": 176
537
  },
538
  {
539
  "epoch": 1.08,
540
+ "learning_rate": 0.00026825812717328314,
541
+ "loss": 1.9427,
542
  "step": 178
543
  },
544
  {
545
  "epoch": 1.09,
546
+ "learning_rate": 0.00026251476230134313,
547
+ "loss": 1.97,
548
  "step": 180
549
  },
550
  {
551
  "epoch": 1.1,
552
+ "learning_rate": 0.00025678532166496077,
553
+ "loss": 2.042,
554
  "step": 182
555
  },
556
  {
557
  "epoch": 1.11,
558
+ "learning_rate": 0.00025107193351744115,
559
+ "loss": 2.0033,
560
  "step": 184
561
  },
562
  {
563
  "epoch": 1.13,
564
+ "learning_rate": 0.0002453767201492457,
565
+ "loss": 1.9466,
566
  "step": 186
567
  },
568
  {
569
  "epoch": 1.14,
570
+ "learning_rate": 0.00023970179709964656,
571
+ "loss": 2.0294,
572
  "step": 188
573
  },
574
  {
575
  "epoch": 1.15,
576
+ "learning_rate": 0.0002340492723708912,
577
+ "loss": 2.0002,
578
  "step": 190
579
  },
580
  {
581
  "epoch": 1.16,
582
+ "learning_rate": 0.00022842124564516548,
583
+ "loss": 1.9686,
584
  "step": 192
585
  },
586
  {
587
  "epoch": 1.18,
588
+ "learning_rate": 0.0002228198075046467,
589
+ "loss": 2.0857,
590
  "step": 194
591
  },
592
  {
593
  "epoch": 1.19,
594
+ "learning_rate": 0.00021724703865493827,
595
+ "loss": 2.0111,
596
  "step": 196
597
  },
598
  {
599
  "epoch": 1.2,
600
+ "learning_rate": 0.00021170500915217283,
601
+ "loss": 2.0058,
602
  "step": 198
603
  },
604
  {
605
  "epoch": 1.21,
606
+ "learning_rate": 0.00020619577763407015,
607
+ "loss": 1.9725,
608
  "step": 200
609
  },
610
  {
611
  "epoch": 1.22,
612
+ "learning_rate": 0.0002007213905552375,
613
+ "loss": 2.0542,
614
  "step": 202
615
  },
616
  {
617
  "epoch": 1.24,
618
+ "learning_rate": 0.0001952838814269958,
619
+ "loss": 2.0265,
620
  "step": 204
621
  },
622
  {
623
  "epoch": 1.25,
624
+ "learning_rate": 0.00018988527006201237,
625
+ "loss": 2.1143,
626
  "step": 206
627
  },
628
  {
629
  "epoch": 1.26,
630
+ "learning_rate": 0.00018452756182402364,
631
+ "loss": 1.96,
632
  "step": 208
633
  },
634
  {
635
  "epoch": 1.27,
636
+ "learning_rate": 0.00017921274688292415,
637
+ "loss": 2.0181,
638
  "step": 210
639
  },
640
  {
641
  "epoch": 1.28,
642
+ "learning_rate": 0.00017394279947549948,
643
+ "loss": 1.9909,
644
  "step": 212
645
  },
646
  {
647
  "epoch": 1.3,
648
+ "learning_rate": 0.00016871967717207824,
649
+ "loss": 2.0021,
650
  "step": 214
651
  },
652
  {
653
  "epoch": 1.31,
654
+ "learning_rate": 0.00016354532014937418,
655
+ "loss": 1.9598,
656
  "step": 216
657
  },
658
  {
659
  "epoch": 1.32,
660
+ "learning_rate": 0.00015842165046979042,
661
+ "loss": 1.9433,
662
  "step": 218
663
  },
664
  {
665
  "epoch": 1.33,
666
+ "learning_rate": 0.0001533505713674509,
667
+ "loss": 2.0222,
668
  "step": 220
669
  },
670
  {
671
  "epoch": 1.34,
672
+ "learning_rate": 0.0001483339665412271,
673
+ "loss": 1.9898,
674
  "step": 222
675
  },
676
  {
677
  "epoch": 1.36,
678
+ "learning_rate": 0.00014337369945502084,
679
+ "loss": 2.0555,
680
  "step": 224
681
  },
682
  {
683
  "epoch": 1.37,
684
+ "learning_rate": 0.00013847161264556339,
685
+ "loss": 2.0082,
686
  "step": 226
687
  },
688
  {
689
  "epoch": 1.38,
690
+ "learning_rate": 0.000133629527037988,
691
+ "loss": 2.0081,
692
  "step": 228
693
  },
694
  {
695
  "epoch": 1.39,
696
+ "learning_rate": 0.00012884924126943122,
697
+ "loss": 2.0028,
698
  "step": 230
699
  },
700
  {
701
  "epoch": 1.4,
702
+ "learning_rate": 0.00012413253102091197,
703
+ "loss": 2.0535,
704
  "step": 232
705
  },
706
  {
707
  "epoch": 1.42,
708
+ "learning_rate": 0.00011948114835773868,
709
+ "loss": 1.9512,
710
  "step": 234
711
  },
712
  {
713
  "epoch": 1.43,
714
+ "learning_rate": 0.00011489682107868918,
715
+ "loss": 1.9141,
716
  "step": 236
717
  },
718
  {
719
  "epoch": 1.44,
720
+ "learning_rate": 0.00011038125207420298,
721
+ "loss": 2.0705,
722
  "step": 238
723
  },
724
  {
725
  "epoch": 1.45,
726
+ "learning_rate": 0.00010593611869382759,
727
+ "loss": 1.9869,
728
  "step": 240
729
  },
730
  {
731
  "epoch": 1.47,
732
+ "learning_rate": 0.0001015630721231517,
733
+ "loss": 1.9448,
734
  "step": 242
735
  },
736
  {
737
  "epoch": 1.48,
738
+ "learning_rate": 9.7263736770457e-05,
739
+ "loss": 1.9565,
740
  "step": 244
741
  },
742
  {
743
  "epoch": 1.49,
744
+ "learning_rate": 9.303970966331645e-05,
745
+ "loss": 1.9925,
746
  "step": 246
747
  },
748
  {
749
  "epoch": 1.5,
750
+ "learning_rate": 8.88925598553645e-05,
751
+ "loss": 1.995,
752
  "step": 248
753
  },
754
  {
755
  "epoch": 1.51,
756
+ "learning_rate": 8.482382784345695e-05,
757
+ "loss": 1.9627,
758
  "step": 250
759
  },
760
  {
761
  "epoch": 1.53,
762
+ "learning_rate": 8.083502499543967e-05,
763
+ "loss": 2.0096,
764
  "step": 252
765
  },
766
  {
767
  "epoch": 1.54,
768
+ "learning_rate": 7.692763298873725e-05,
769
+ "loss": 2.0731,
770
  "step": 254
771
  },
772
  {
773
  "epoch": 1.55,
774
+ "learning_rate": 7.310310325996986e-05,
775
+ "loss": 2.0134,
776
  "step": 256
777
  },
778
  {
779
  "epoch": 1.56,
780
+ "learning_rate": 6.936285646580441e-05,
781
+ "loss": 2.0516,
782
  "step": 258
783
  },
784
  {
785
  "epoch": 1.57,
786
+ "learning_rate": 6.570828195523786e-05,
787
+ "loss": 2.0061,
788
  "step": 260
789
  },
790
  {
791
  "epoch": 1.59,
792
+ "learning_rate": 6.214073725351162e-05,
793
+ "loss": 1.9709,
794
  "step": 262
795
  },
796
  {
797
  "epoch": 1.6,
798
+ "learning_rate": 5.8661547557845534e-05,
799
+ "loss": 2.03,
800
  "step": 264
801
  },
802
  {
803
  "epoch": 1.61,
804
+ "learning_rate": 5.5272005245181705e-05,
805
+ "loss": 1.9751,
806
  "step": 266
807
  },
808
  {
809
  "epoch": 1.62,
810
+ "learning_rate": 5.197336939211925e-05,
811
+ "loss": 2.0363,
812
  "step": 268
813
  },
814
  {
815
  "epoch": 1.63,
816
+ "learning_rate": 4.87668653072184e-05,
817
+ "loss": 2.0713,
818
  "step": 270
819
  },
820
  {
821
  "epoch": 1.65,
822
+ "learning_rate": 4.565368407584855e-05,
823
+ "loss": 2.021,
824
  "step": 272
825
  },
826
  {
827
  "epoch": 1.66,
828
+ "learning_rate": 4.2634982117748854e-05,
829
+ "loss": 1.9747,
830
  "step": 274
831
  },
832
  {
833
  "epoch": 1.67,
834
+ "learning_rate": 3.971188075746535e-05,
835
+ "loss": 2.0548,
836
  "step": 276
837
  },
838
  {
839
  "epoch": 1.68,
840
+ "learning_rate": 3.688546580782448e-05,
841
+ "loss": 1.9953,
842
  "step": 278
843
  },
844
  {
845
  "epoch": 1.69,
846
+ "learning_rate": 3.415678716659831e-05,
847
+ "loss": 1.9769,
848
  "step": 280
849
  },
850
  {
851
  "epoch": 1.71,
852
+ "learning_rate": 3.152685842651004e-05,
853
+ "loss": 2.0558,
854
  "step": 282
855
  },
856
  {
857
  "epoch": 1.72,
858
+ "learning_rate": 2.899665649872589e-05,
859
+ "loss": 2.047,
860
  "step": 284
861
  },
862
  {
863
  "epoch": 1.73,
864
+ "learning_rate": 2.6567121249972544e-05,
865
+ "loss": 2.0231,
866
  "step": 286
867
  },
868
  {
869
  "epoch": 1.74,
870
+ "learning_rate": 2.423915515341467e-05,
871
+ "loss": 2.0504,
872
  "step": 288
873
  },
874
  {
875
  "epoch": 1.76,
876
+ "learning_rate": 2.2013622953423405e-05,
877
+ "loss": 2.0075,
878
  "step": 290
879
  },
880
  {
881
  "epoch": 1.77,
882
+ "learning_rate": 1.9891351344358853e-05,
883
+ "loss": 2.0219,
884
  "step": 292
885
  },
886
  {
887
  "epoch": 1.78,
888
+ "learning_rate": 1.7873128663487047e-05,
889
+ "loss": 1.9923,
890
  "step": 294
891
  },
892
  {
893
  "epoch": 1.79,
894
+ "learning_rate": 1.5959704598144628e-05,
895
+ "loss": 2.0081,
896
  "step": 296
897
  },
898
  {
899
  "epoch": 1.8,
900
+ "learning_rate": 1.4151789907260846e-05,
901
+ "loss": 1.9971,
902
  "step": 298
903
  },
904
  {
905
  "epoch": 1.82,
906
+ "learning_rate": 1.2450056157339439e-05,
907
+ "loss": 1.9923,
908
  "step": 300
909
  },
910
  {
911
  "epoch": 1.83,
912
+ "learning_rate": 1.0855135472999155e-05,
913
+ "loss": 2.003,
914
  "step": 302
915
  },
916
  {
917
  "epoch": 1.84,
918
+ "learning_rate": 9.36762030216538e-06,
919
+ "loss": 2.0014,
920
  "step": 304
921
  },
922
  {
923
  "epoch": 1.85,
924
+ "learning_rate": 7.988063195999639e-06,
925
+ "loss": 2.0675,
926
  "step": 306
927
  },
928
  {
929
  "epoch": 1.86,
930
+ "learning_rate": 6.716976603649782e-06,
931
+ "loss": 1.9773,
932
  "step": 308
933
  },
934
  {
935
  "epoch": 1.88,
936
+ "learning_rate": 5.554832681895582e-06,
937
+ "loss": 1.997,
938
  "step": 310
939
  },
940
  {
941
  "epoch": 1.89,
942
+ "learning_rate": 4.5020631197623e-06,
943
+ "loss": 2.0323,
944
  "step": 312
945
  },
946
  {
947
  "epoch": 1.9,
948
+ "learning_rate": 3.559058978165319e-06,
949
+ "loss": 2.0221,
950
  "step": 314
951
  },
952
  {
953
  "epoch": 1.91,
954
+ "learning_rate": 2.7261705446473103e-06,
955
+ "loss": 1.9734,
956
  "step": 316
957
  },
958
  {
959
  "epoch": 1.92,
960
+ "learning_rate": 2.003707203260585e-06,
961
+ "loss": 1.9529,
962
  "step": 318
963
  },
964
  {
965
  "epoch": 1.94,
966
+ "learning_rate": 1.3919373196436213e-06,
967
+ "loss": 1.982,
968
  "step": 320
969
  },
970
  {
971
  "epoch": 1.95,
972
+ "learning_rate": 8.910881413340398e-07,
973
+ "loss": 2.0501,
974
  "step": 322
975
  },
976
  {
977
  "epoch": 1.96,
978
+ "learning_rate": 5.013457133556276e-07,
979
+ "loss": 2.0041,
980
  "step": 324
981
  },
982
  {
983
  "epoch": 1.97,
984
+ "learning_rate": 2.2285480911008457e-07,
985
+ "loss": 1.983,
986
  "step": 326
987
  },
988
  {
989
  "epoch": 1.99,
990
+ "learning_rate": 5.5718876599541995e-08,
991
+ "loss": 2.0002,
992
  "step": 328
993
  },
994
  {
995
  "epoch": 2.0,
996
  "learning_rate": 0.0,
997
+ "loss": 2.0595,
998
  "step": 330
999
  },
1000
  {
1001
  "epoch": 2.0,
1002
  "step": 330,
1003
+ "total_flos": 4.634629372945367e+17,
1004
+ "train_loss": 2.053801321260857,
1005
+ "train_runtime": 80029.5559,
1006
  "train_samples_per_second": 0.265,
1007
  "train_steps_per_second": 0.004
1008
  }
1009
  ],
1010
  "max_steps": 330,
1011
  "num_train_epochs": 2,
1012
+ "total_flos": 4.634629372945367e+17,
1013
  "trial_name": null,
1014
  "trial_params": null
1015
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4526ccf2486e6fb3048af4d26eb6228cf640199b02d5c9ab46e06e3bf549ec3a
3
  size 4527
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9abaea40950d2b896d2f741ace2caf6e83894992e5b5e580309396bb46b7b92c
3
  size 4527