hllj committed on
Commit
d3eab40
1 Parent(s): 8e8d6ab

Model save

Browse files
README.md CHANGED
@@ -1,10 +1,15 @@
1
  ---
2
  base_model: hllj/mistral-vi-math
3
  tags:
 
 
4
  - generated_from_trainer
 
 
5
  model-index:
6
  - name: sft-mistral-v1-clean-valid
7
  results: []
 
8
  ---
9
 
10
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -12,9 +17,9 @@ should probably proofread and complete it, then remove this comment. -->
12
 
13
  # sft-mistral-v1-clean-valid
14
 
15
- This model is a fine-tuned version of [hllj/mistral-vi-math](https://huggingface.co/hllj/mistral-vi-math) on an unknown dataset.
16
  It achieves the following results on the evaluation set:
17
- - Loss: 0.4106
18
 
19
  ## Model description
20
 
@@ -48,12 +53,13 @@ The following hyperparameters were used during training:
48
 
49
  | Training Loss | Epoch | Step | Validation Loss |
50
  |:-------------:|:-----:|:----:|:---------------:|
51
- | 0.2982 | 1.02 | 1000 | 0.4478 |
52
 
53
 
54
  ### Framework versions
55
 
56
- - Transformers 4.35.2
57
- - Pytorch 2.1.0
58
- - Datasets 2.15.0
 
59
  - Tokenizers 0.15.0
 
1
  ---
2
  base_model: hllj/mistral-vi-math
3
  tags:
4
+ - trl
5
+ - sft
6
  - generated_from_trainer
7
+ datasets:
8
+ - generator
9
  model-index:
10
  - name: sft-mistral-v1-clean-valid
11
  results: []
12
+ library_name: peft
13
  ---
14
 
15
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
17
 
18
  # sft-mistral-v1-clean-valid
19
 
20
+ This model is a fine-tuned version of [hllj/mistral-vi-math](https://huggingface.co/hllj/mistral-vi-math) on the generator dataset.
21
  It achieves the following results on the evaluation set:
22
+ - Loss: 0.5101
23
 
24
  ## Model description
25
 
 
53
 
54
  | Training Loss | Epoch | Step | Validation Loss |
55
  |:-------------:|:-----:|:----:|:---------------:|
56
+ | 0.2676 | 1.05 | 1000 | 0.5086 |
57
 
58
 
59
  ### Framework versions
60
 
61
+ - PEFT 0.5.0
62
+ - Transformers 4.36.2
63
+ - Pytorch 2.1.2+cu121
64
+ - Datasets 2.16.0
65
  - Tokenizers 0.15.0
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5dbea24cb8dace749af26b1a96e0523e56265ba990d3282f14d28792653e68b3
3
  size 872450448
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cde042ccf32ffa4b57b1b9c1dcb5e326931a6822190e834eea9e7bbedb9b468b
3
  size 872450448
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 1.44,
3
- "eval_loss": 0.41060009598731995,
4
- "eval_runtime": 6.7722,
5
  "eval_samples": 140,
6
- "eval_samples_per_second": 20.673,
7
- "eval_steps_per_second": 5.168,
8
- "train_loss": 0.3275021193364904,
9
- "train_runtime": 3966.2406,
10
- "train_samples": 8657,
11
- "train_samples_per_second": 4.365,
12
- "train_steps_per_second": 1.092
13
  }
 
1
  {
2
+ "epoch": 2.0,
3
+ "eval_loss": 0.5100870728492737,
4
+ "eval_runtime": 12.2379,
5
  "eval_samples": 140,
6
+ "eval_samples_per_second": 3.595,
7
+ "eval_steps_per_second": 0.899,
8
+ "train_loss": 0.3227183536165639,
9
+ "train_runtime": 6710.5312,
10
+ "train_samples": 8517,
11
+ "train_samples_per_second": 1.132,
12
+ "train_steps_per_second": 0.283
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 1.44,
3
- "eval_loss": 0.41060009598731995,
4
- "eval_runtime": 6.7722,
5
  "eval_samples": 140,
6
- "eval_samples_per_second": 20.673,
7
- "eval_steps_per_second": 5.168
8
  }
 
1
  {
2
+ "epoch": 2.0,
3
+ "eval_loss": 0.5100870728492737,
4
+ "eval_runtime": 12.2379,
5
  "eval_samples": 140,
6
+ "eval_samples_per_second": 3.595,
7
+ "eval_steps_per_second": 0.899
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 1.44,
3
- "train_loss": 0.3275021193364904,
4
- "train_runtime": 3966.2406,
5
- "train_samples": 8657,
6
- "train_samples_per_second": 4.365,
7
- "train_steps_per_second": 1.092
8
  }
 
1
  {
2
+ "epoch": 2.0,
3
+ "train_loss": 0.3227183536165639,
4
+ "train_runtime": 6710.5312,
5
+ "train_samples": 8517,
6
+ "train_samples_per_second": 1.132,
7
+ "train_steps_per_second": 0.283
8
  }
trainer_state.json CHANGED
@@ -1,1194 +1,1184 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.4438799076212472,
5
  "eval_steps": 1000,
6
- "global_step": 1922,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
- "learning_rate": 1.3824884792626728e-07,
14
- "loss": 0.7351,
15
  "step": 1
16
  },
17
  {
18
- "epoch": 0.0,
19
- "learning_rate": 1.3824884792626729e-06,
20
- "loss": 0.7455,
21
  "step": 10
22
  },
23
  {
24
- "epoch": 0.01,
25
- "learning_rate": 2.7649769585253458e-06,
26
- "loss": 0.7061,
27
  "step": 20
28
  },
29
  {
30
- "epoch": 0.01,
31
- "learning_rate": 4.147465437788019e-06,
32
- "loss": 0.6593,
33
  "step": 30
34
  },
35
  {
36
- "epoch": 0.02,
37
- "learning_rate": 5.5299539170506915e-06,
38
- "loss": 0.6177,
39
  "step": 40
40
  },
41
  {
42
- "epoch": 0.02,
43
- "learning_rate": 6.912442396313364e-06,
44
- "loss": 0.5817,
45
  "step": 50
46
  },
47
  {
48
- "epoch": 0.03,
49
- "learning_rate": 8.294930875576038e-06,
50
- "loss": 0.5095,
51
  "step": 60
52
  },
53
  {
54
- "epoch": 0.03,
55
- "learning_rate": 9.67741935483871e-06,
56
- "loss": 0.4615,
57
  "step": 70
58
  },
59
  {
60
- "epoch": 0.04,
61
- "learning_rate": 1.1059907834101383e-05,
62
- "loss": 0.4126,
63
  "step": 80
64
  },
65
  {
66
- "epoch": 0.04,
67
- "learning_rate": 1.2442396313364056e-05,
68
- "loss": 0.4067,
69
  "step": 90
70
  },
71
  {
72
- "epoch": 0.05,
73
- "learning_rate": 1.3824884792626728e-05,
74
- "loss": 0.3974,
75
  "step": 100
76
  },
77
  {
78
- "epoch": 0.05,
79
- "learning_rate": 1.5207373271889403e-05,
80
- "loss": 0.3864,
81
  "step": 110
82
  },
83
  {
84
- "epoch": 0.06,
85
- "learning_rate": 1.6589861751152075e-05,
86
- "loss": 0.4016,
87
  "step": 120
88
  },
89
  {
90
- "epoch": 0.06,
91
- "learning_rate": 1.7972350230414745e-05,
92
- "loss": 0.3873,
93
  "step": 130
94
  },
95
  {
96
- "epoch": 0.06,
97
- "learning_rate": 1.935483870967742e-05,
98
- "loss": 0.3837,
99
  "step": 140
100
  },
101
  {
102
- "epoch": 0.07,
103
- "learning_rate": 2.0737327188940094e-05,
104
- "loss": 0.3868,
105
  "step": 150
106
  },
107
  {
108
- "epoch": 0.07,
109
- "learning_rate": 2.2119815668202766e-05,
110
- "loss": 0.3817,
111
  "step": 160
112
  },
113
  {
114
- "epoch": 0.08,
115
- "learning_rate": 2.350230414746544e-05,
116
- "loss": 0.3593,
117
  "step": 170
118
  },
119
  {
120
- "epoch": 0.08,
121
- "learning_rate": 2.488479262672811e-05,
122
- "loss": 0.3641,
123
  "step": 180
124
  },
125
  {
126
- "epoch": 0.09,
127
- "learning_rate": 2.6267281105990784e-05,
128
- "loss": 0.3497,
129
  "step": 190
130
  },
131
  {
132
- "epoch": 0.09,
133
- "learning_rate": 2.7649769585253457e-05,
134
- "loss": 0.3679,
135
  "step": 200
136
  },
137
  {
138
- "epoch": 0.1,
139
- "learning_rate": 2.903225806451613e-05,
140
- "loss": 0.3751,
141
  "step": 210
142
  },
143
  {
144
- "epoch": 0.1,
145
- "learning_rate": 2.9999960619075335e-05,
146
- "loss": 0.3828,
147
  "step": 220
148
  },
149
  {
150
- "epoch": 0.11,
151
- "learning_rate": 2.9999260519500367e-05,
152
- "loss": 0.3763,
153
  "step": 230
154
  },
155
  {
156
- "epoch": 0.11,
157
- "learning_rate": 2.9997685335280646e-05,
158
- "loss": 0.3553,
159
  "step": 240
160
  },
161
  {
162
- "epoch": 0.12,
163
- "learning_rate": 2.9995235158315353e-05,
164
- "loss": 0.3589,
165
  "step": 250
166
  },
167
  {
168
- "epoch": 0.12,
169
- "learning_rate": 2.999191013155234e-05,
170
- "loss": 0.3585,
171
  "step": 260
172
  },
173
  {
174
- "epoch": 0.12,
175
- "learning_rate": 2.998771044897983e-05,
176
- "loss": 0.3529,
177
  "step": 270
178
  },
179
  {
180
- "epoch": 0.13,
181
- "learning_rate": 2.9982636355615092e-05,
182
- "loss": 0.3303,
183
  "step": 280
184
  },
185
  {
186
- "epoch": 0.13,
187
- "learning_rate": 2.997668814749012e-05,
188
- "loss": 0.3696,
189
  "step": 290
190
  },
191
  {
192
- "epoch": 0.14,
193
- "learning_rate": 2.99698661716344e-05,
194
- "loss": 0.3353,
195
  "step": 300
196
  },
197
  {
198
- "epoch": 0.14,
199
- "learning_rate": 2.9962170826054645e-05,
200
- "loss": 0.3562,
201
  "step": 310
202
  },
203
  {
204
- "epoch": 0.15,
205
- "learning_rate": 2.995360255971157e-05,
206
- "loss": 0.3652,
207
  "step": 320
208
  },
209
  {
210
- "epoch": 0.15,
211
- "learning_rate": 2.994416187249371e-05,
212
- "loss": 0.3522,
213
  "step": 330
214
  },
215
  {
216
- "epoch": 0.16,
217
- "learning_rate": 2.9933849315188233e-05,
218
- "loss": 0.3754,
219
  "step": 340
220
  },
221
  {
222
- "epoch": 0.16,
223
- "learning_rate": 2.992266548944885e-05,
224
- "loss": 0.3348,
225
  "step": 350
226
  },
227
  {
228
- "epoch": 0.17,
229
- "learning_rate": 2.991061104776067e-05,
230
- "loss": 0.3513,
231
  "step": 360
232
  },
233
  {
234
- "epoch": 0.17,
235
- "learning_rate": 2.9897686693402138e-05,
236
- "loss": 0.3214,
237
  "step": 370
238
  },
239
  {
240
- "epoch": 0.18,
241
- "learning_rate": 2.9883893180404046e-05,
242
- "loss": 0.3451,
243
  "step": 380
244
  },
245
  {
246
- "epoch": 0.18,
247
- "learning_rate": 2.986923131350549e-05,
248
- "loss": 0.3579,
249
  "step": 390
250
  },
251
  {
252
- "epoch": 0.18,
253
- "learning_rate": 2.9853701948106944e-05,
254
- "loss": 0.3353,
255
  "step": 400
256
  },
257
  {
258
- "epoch": 0.19,
259
- "learning_rate": 2.9837305990220357e-05,
260
- "loss": 0.3374,
261
  "step": 410
262
  },
263
  {
264
- "epoch": 0.19,
265
- "learning_rate": 2.982004439641628e-05,
266
- "loss": 0.3544,
267
  "step": 420
268
  },
269
  {
270
- "epoch": 0.2,
271
- "learning_rate": 2.980191817376808e-05,
272
- "loss": 0.3313,
273
  "step": 430
274
  },
275
  {
276
- "epoch": 0.2,
277
- "learning_rate": 2.9782928379793154e-05,
278
- "loss": 0.3488,
279
  "step": 440
280
  },
281
  {
282
- "epoch": 0.21,
283
- "learning_rate": 2.976307612239127e-05,
284
- "loss": 0.3384,
285
  "step": 450
286
  },
287
  {
288
- "epoch": 0.21,
289
- "learning_rate": 2.97423625597799e-05,
290
- "loss": 0.3359,
291
  "step": 460
292
  },
293
  {
294
- "epoch": 0.22,
295
- "learning_rate": 2.9720788900426657e-05,
296
- "loss": 0.3353,
297
  "step": 470
298
  },
299
  {
300
- "epoch": 0.22,
301
- "learning_rate": 2.969835640297879e-05,
302
- "loss": 0.356,
303
  "step": 480
304
  },
305
  {
306
- "epoch": 0.23,
307
- "learning_rate": 2.967506637618976e-05,
308
- "loss": 0.3575,
309
  "step": 490
310
  },
311
  {
312
- "epoch": 0.23,
313
- "learning_rate": 2.9650920178842874e-05,
314
- "loss": 0.3163,
315
  "step": 500
316
  },
317
  {
318
- "epoch": 0.24,
319
- "learning_rate": 2.9625919219672017e-05,
320
- "loss": 0.3277,
321
  "step": 510
322
  },
323
  {
324
- "epoch": 0.24,
325
- "learning_rate": 2.960006495727946e-05,
326
- "loss": 0.3627,
327
  "step": 520
328
  },
329
  {
330
- "epoch": 0.24,
331
- "learning_rate": 2.9573358900050764e-05,
332
- "loss": 0.3412,
333
  "step": 530
334
  },
335
  {
336
- "epoch": 0.25,
337
- "learning_rate": 2.9545802606066778e-05,
338
- "loss": 0.3464,
339
  "step": 540
340
  },
341
  {
342
- "epoch": 0.25,
343
- "learning_rate": 2.9517397683012747e-05,
344
- "loss": 0.3417,
345
  "step": 550
346
  },
347
  {
348
- "epoch": 0.26,
349
- "learning_rate": 2.9488145788084502e-05,
350
- "loss": 0.349,
351
  "step": 560
352
  },
353
  {
354
- "epoch": 0.26,
355
- "learning_rate": 2.945804862789178e-05,
356
- "loss": 0.3497,
357
  "step": 570
358
  },
359
  {
360
- "epoch": 0.27,
361
- "learning_rate": 2.942710795835866e-05,
362
- "loss": 0.3318,
363
  "step": 580
364
  },
365
  {
366
- "epoch": 0.27,
367
- "learning_rate": 2.9395325584621122e-05,
368
- "loss": 0.3249,
369
  "step": 590
370
  },
371
  {
372
- "epoch": 0.28,
373
- "learning_rate": 2.9362703360921722e-05,
374
- "loss": 0.3346,
375
  "step": 600
376
  },
377
  {
378
- "epoch": 0.28,
379
- "learning_rate": 2.932924319050143e-05,
380
- "loss": 0.3401,
381
  "step": 610
382
  },
383
  {
384
- "epoch": 0.29,
385
- "learning_rate": 2.9294947025488568e-05,
386
- "loss": 0.327,
387
  "step": 620
388
  },
389
  {
390
- "epoch": 0.29,
391
- "learning_rate": 2.925981686678494e-05,
392
- "loss": 0.3185,
393
  "step": 630
394
  },
395
  {
396
- "epoch": 0.3,
397
- "learning_rate": 2.9223854763949082e-05,
398
- "loss": 0.3411,
399
  "step": 640
400
  },
401
  {
402
- "epoch": 0.3,
403
- "learning_rate": 2.9187062815076688e-05,
404
- "loss": 0.3401,
405
  "step": 650
406
  },
407
  {
408
- "epoch": 0.3,
409
- "learning_rate": 2.914944316667822e-05,
410
- "loss": 0.3443,
411
  "step": 660
412
  },
413
  {
414
- "epoch": 0.31,
415
- "learning_rate": 2.9110998013553653e-05,
416
- "loss": 0.3336,
417
  "step": 670
418
  },
419
  {
420
- "epoch": 0.31,
421
- "learning_rate": 2.9071729598664433e-05,
422
- "loss": 0.3271,
423
  "step": 680
424
  },
425
  {
426
- "epoch": 0.32,
427
- "learning_rate": 2.9031640213002638e-05,
428
- "loss": 0.3233,
429
  "step": 690
430
  },
431
  {
432
- "epoch": 0.32,
433
- "learning_rate": 2.899073219545729e-05,
434
- "loss": 0.3548,
435
  "step": 700
436
  },
437
  {
438
- "epoch": 0.33,
439
- "learning_rate": 2.8949007932677915e-05,
440
- "loss": 0.3243,
441
  "step": 710
442
  },
443
  {
444
- "epoch": 0.33,
445
- "learning_rate": 2.89064698589353e-05,
446
- "loss": 0.3151,
447
  "step": 720
448
  },
449
  {
450
- "epoch": 0.34,
451
- "learning_rate": 2.8863120455979458e-05,
452
- "loss": 0.325,
453
  "step": 730
454
  },
455
  {
456
- "epoch": 0.34,
457
- "learning_rate": 2.8818962252894872e-05,
458
- "loss": 0.3406,
459
  "step": 740
460
  },
461
  {
462
- "epoch": 0.35,
463
- "learning_rate": 2.8773997825952914e-05,
464
- "loss": 0.3059,
465
  "step": 750
466
  },
467
  {
468
- "epoch": 0.35,
469
- "learning_rate": 2.872822979846154e-05,
470
- "loss": 0.3236,
471
  "step": 760
472
  },
473
  {
474
- "epoch": 0.36,
475
- "learning_rate": 2.8681660840612262e-05,
476
- "loss": 0.3465,
477
  "step": 770
478
  },
479
  {
480
- "epoch": 0.36,
481
- "learning_rate": 2.8634293669324353e-05,
482
- "loss": 0.34,
483
  "step": 780
484
  },
485
  {
486
- "epoch": 0.36,
487
- "learning_rate": 2.8586131048086334e-05,
488
- "loss": 0.3319,
489
  "step": 790
490
  },
491
  {
492
- "epoch": 0.37,
493
- "learning_rate": 2.853717578679474e-05,
494
- "loss": 0.3322,
495
  "step": 800
496
  },
497
  {
498
- "epoch": 0.37,
499
- "learning_rate": 2.848743074159021e-05,
500
- "loss": 0.3125,
501
  "step": 810
502
  },
503
  {
504
- "epoch": 0.38,
505
- "learning_rate": 2.8436898814690837e-05,
506
- "loss": 0.3411,
507
  "step": 820
508
  },
509
  {
510
- "epoch": 0.38,
511
- "learning_rate": 2.838558295422284e-05,
512
- "loss": 0.3371,
513
  "step": 830
514
  },
515
  {
516
- "epoch": 0.39,
517
- "learning_rate": 2.833348615404859e-05,
518
- "loss": 0.3336,
519
  "step": 840
520
  },
521
  {
522
- "epoch": 0.39,
523
- "learning_rate": 2.8280611453591908e-05,
524
- "loss": 0.3335,
525
  "step": 850
526
  },
527
  {
528
- "epoch": 0.4,
529
- "learning_rate": 2.8226961937660773e-05,
530
- "loss": 0.3131,
531
  "step": 860
532
  },
533
  {
534
- "epoch": 0.4,
535
- "learning_rate": 2.817254073626733e-05,
536
- "loss": 0.3202,
537
  "step": 870
538
  },
539
  {
540
- "epoch": 0.41,
541
- "learning_rate": 2.811735102444528e-05,
542
- "loss": 0.3265,
543
  "step": 880
544
  },
545
  {
546
- "epoch": 0.41,
547
- "learning_rate": 2.8061396022064657e-05,
548
- "loss": 0.3162,
549
  "step": 890
550
  },
551
  {
552
- "epoch": 0.42,
553
- "learning_rate": 2.8004678993643952e-05,
554
- "loss": 0.3124,
555
  "step": 900
556
  },
557
  {
558
- "epoch": 0.42,
559
- "learning_rate": 2.7947203248159665e-05,
560
- "loss": 0.3289,
561
  "step": 910
562
  },
563
  {
564
- "epoch": 0.42,
565
- "learning_rate": 2.788897213885327e-05,
566
- "loss": 0.3242,
567
  "step": 920
568
  },
569
  {
570
- "epoch": 0.43,
571
- "learning_rate": 2.782998906303555e-05,
572
- "loss": 0.3038,
573
  "step": 930
574
  },
575
  {
576
- "epoch": 0.43,
577
- "learning_rate": 2.777025746188842e-05,
578
- "loss": 0.3319,
579
  "step": 940
580
  },
581
  {
582
- "epoch": 0.44,
583
- "learning_rate": 2.7709780820264147e-05,
584
- "loss": 0.3372,
585
  "step": 950
586
  },
587
  {
588
- "epoch": 0.44,
589
- "learning_rate": 2.764856266648202e-05,
590
- "loss": 0.306,
591
  "step": 960
592
  },
593
  {
594
- "epoch": 1.0,
595
- "learning_rate": 2.758660657212255e-05,
596
- "loss": 0.3001,
597
  "step": 970
598
  },
599
  {
600
- "epoch": 1.01,
601
- "learning_rate": 2.7523916151819048e-05,
602
- "loss": 0.3044,
603
  "step": 980
604
  },
605
  {
606
- "epoch": 1.01,
607
- "learning_rate": 2.746049506304678e-05,
608
- "loss": 0.3114,
609
  "step": 990
610
  },
611
  {
612
- "epoch": 1.02,
613
- "learning_rate": 2.7396347005909535e-05,
614
- "loss": 0.2982,
615
  "step": 1000
616
  },
617
  {
618
- "epoch": 1.02,
619
- "eval_loss": 0.44782933592796326,
620
- "eval_runtime": 6.8987,
621
- "eval_samples_per_second": 20.294,
622
- "eval_steps_per_second": 5.073,
623
  "step": 1000
624
  },
625
  {
626
- "epoch": 1.02,
627
- "learning_rate": 2.733147572292381e-05,
628
- "loss": 0.3091,
629
  "step": 1010
630
  },
631
  {
632
- "epoch": 1.03,
633
- "learning_rate": 2.7265884998800434e-05,
634
- "loss": 0.3011,
635
  "step": 1020
636
  },
637
  {
638
- "epoch": 1.03,
639
- "learning_rate": 2.7199578660223743e-05,
640
- "loss": 0.2794,
641
  "step": 1030
642
  },
643
  {
644
- "epoch": 1.04,
645
- "learning_rate": 2.7132560575628377e-05,
646
- "loss": 0.2956,
647
  "step": 1040
648
  },
649
  {
650
- "epoch": 1.04,
651
- "learning_rate": 2.7064834654973534e-05,
652
- "loss": 0.3098,
653
  "step": 1050
654
  },
655
  {
656
- "epoch": 1.05,
657
- "learning_rate": 2.6996404849514885e-05,
658
- "loss": 0.3315,
659
  "step": 1060
660
  },
661
  {
662
- "epoch": 1.05,
663
- "learning_rate": 2.6927275151574053e-05,
664
- "loss": 0.305,
665
  "step": 1070
666
  },
667
  {
668
- "epoch": 1.05,
669
- "learning_rate": 2.6857449594305674e-05,
670
- "loss": 0.2986,
671
  "step": 1080
672
  },
673
  {
674
- "epoch": 1.06,
675
- "learning_rate": 2.678693225146211e-05,
676
- "loss": 0.269,
677
  "step": 1090
678
  },
679
  {
680
- "epoch": 1.06,
681
- "learning_rate": 2.6715727237155777e-05,
682
- "loss": 0.3124,
683
  "step": 1100
684
  },
685
  {
686
- "epoch": 1.07,
687
- "learning_rate": 2.6643838705619117e-05,
688
- "loss": 0.303,
689
  "step": 1110
690
  },
691
  {
692
- "epoch": 1.07,
693
- "learning_rate": 2.6571270850962234e-05,
694
- "loss": 0.3084,
695
  "step": 1120
696
  },
697
  {
698
- "epoch": 1.08,
699
- "learning_rate": 2.6498027906928195e-05,
700
- "loss": 0.31,
701
  "step": 1130
702
  },
703
  {
704
- "epoch": 1.08,
705
- "learning_rate": 2.6424114146646043e-05,
706
- "loss": 0.3203,
707
  "step": 1140
708
  },
709
  {
710
- "epoch": 1.09,
711
- "learning_rate": 2.6349533882381475e-05,
712
- "loss": 0.2963,
713
  "step": 1150
714
  },
715
  {
716
- "epoch": 1.09,
717
- "learning_rate": 2.6274291465285266e-05,
718
- "loss": 0.3042,
719
  "step": 1160
720
  },
721
  {
722
- "epoch": 1.1,
723
- "learning_rate": 2.6198391285139417e-05,
724
- "loss": 0.314,
725
  "step": 1170
726
  },
727
  {
728
- "epoch": 1.1,
729
- "learning_rate": 2.612183777010104e-05,
730
- "loss": 0.3144,
731
  "step": 1180
732
  },
733
  {
734
- "epoch": 1.11,
735
- "learning_rate": 2.6044635386444024e-05,
736
- "loss": 0.3157,
737
  "step": 1190
738
  },
739
  {
740
- "epoch": 1.11,
741
- "learning_rate": 2.5966788638298443e-05,
742
- "loss": 0.2958,
743
  "step": 1200
744
  },
745
  {
746
- "epoch": 1.12,
747
- "learning_rate": 2.5888302067387793e-05,
748
- "loss": 0.2867,
749
  "step": 1210
750
  },
751
  {
752
- "epoch": 1.12,
753
- "learning_rate": 2.5809180252764022e-05,
754
- "loss": 0.2999,
755
  "step": 1220
756
  },
757
  {
758
- "epoch": 1.12,
759
- "learning_rate": 2.572942781054036e-05,
760
- "loss": 0.2888,
761
  "step": 1230
762
  },
763
  {
764
- "epoch": 1.13,
765
- "learning_rate": 2.564904939362204e-05,
766
- "loss": 0.3036,
767
  "step": 1240
768
  },
769
  {
770
- "epoch": 1.13,
771
- "learning_rate": 2.5568049691434794e-05,
772
- "loss": 0.3127,
773
  "step": 1250
774
  },
775
  {
776
- "epoch": 1.14,
777
- "learning_rate": 2.5486433429651304e-05,
778
- "loss": 0.2939,
779
  "step": 1260
780
  },
781
  {
782
- "epoch": 1.14,
783
- "learning_rate": 2.5404205369915473e-05,
784
- "loss": 0.3096,
785
  "step": 1270
786
  },
787
  {
788
- "epoch": 1.15,
789
- "learning_rate": 2.532137030956464e-05,
790
- "loss": 0.3039,
791
  "step": 1280
792
  },
793
  {
794
- "epoch": 1.15,
795
- "learning_rate": 2.523793308134967e-05,
796
- "loss": 0.2948,
797
  "step": 1290
798
  },
799
  {
800
- "epoch": 1.16,
801
- "learning_rate": 2.5153898553153024e-05,
802
- "loss": 0.3053,
803
  "step": 1300
804
  },
805
  {
806
- "epoch": 1.16,
807
- "learning_rate": 2.506927162770475e-05,
808
- "loss": 0.2962,
809
  "step": 1310
810
  },
811
  {
812
- "epoch": 1.17,
813
- "learning_rate": 2.4984057242296464e-05,
814
- "loss": 0.3027,
815
  "step": 1320
816
  },
817
  {
818
- "epoch": 1.17,
819
- "learning_rate": 2.489826036849325e-05,
820
- "loss": 0.3051,
821
  "step": 1330
822
  },
823
  {
824
- "epoch": 1.18,
825
- "learning_rate": 2.4811886011843673e-05,
826
- "loss": 0.3214,
827
  "step": 1340
828
  },
829
  {
830
- "epoch": 1.18,
831
- "learning_rate": 2.4724939211587706e-05,
832
- "loss": 0.3002,
833
  "step": 1350
834
  },
835
  {
836
- "epoch": 1.18,
837
- "learning_rate": 2.4637425040362744e-05,
838
- "loss": 0.3177,
839
  "step": 1360
840
  },
841
  {
842
- "epoch": 1.19,
843
- "learning_rate": 2.4549348603907658e-05,
844
- "loss": 0.3141,
845
  "step": 1370
846
  },
847
  {
848
- "epoch": 1.19,
849
- "learning_rate": 2.4460715040764916e-05,
850
- "loss": 0.3191,
851
  "step": 1380
852
  },
853
  {
854
- "epoch": 1.2,
855
- "learning_rate": 2.4371529521980775e-05,
856
- "loss": 0.2736,
857
  "step": 1390
858
  },
859
  {
860
- "epoch": 1.2,
861
- "learning_rate": 2.428179725080362e-05,
862
- "loss": 0.3063,
863
  "step": 1400
864
  },
865
  {
866
- "epoch": 1.21,
867
- "learning_rate": 2.419152346238038e-05,
868
- "loss": 0.2965,
869
  "step": 1410
870
  },
871
  {
872
- "epoch": 1.21,
873
- "learning_rate": 2.410071342345111e-05,
874
- "loss": 0.2942,
875
  "step": 1420
876
  },
877
  {
878
- "epoch": 1.22,
879
- "learning_rate": 2.4009372432041702e-05,
880
- "loss": 0.2895,
881
  "step": 1430
882
  },
883
  {
884
- "epoch": 1.22,
885
- "learning_rate": 2.3917505817154795e-05,
886
- "loss": 0.3084,
887
  "step": 1440
888
  },
889
  {
890
- "epoch": 1.23,
891
- "learning_rate": 2.3825118938458894e-05,
892
- "loss": 0.2849,
893
  "step": 1450
894
  },
895
  {
896
- "epoch": 1.23,
897
- "learning_rate": 2.373221718597564e-05,
898
- "loss": 0.2923,
899
  "step": 1460
900
  },
901
  {
902
- "epoch": 1.24,
903
- "learning_rate": 2.3638805979765387e-05,
904
- "loss": 0.3062,
905
  "step": 1470
906
  },
907
  {
908
- "epoch": 1.24,
909
- "learning_rate": 2.3544890769610936e-05,
910
- "loss": 0.2832,
911
  "step": 1480
912
  },
913
  {
914
- "epoch": 1.24,
915
- "learning_rate": 2.3450477034699632e-05,
916
- "loss": 0.2823,
917
  "step": 1490
918
  },
919
  {
920
- "epoch": 1.25,
921
- "learning_rate": 2.335557028330366e-05,
922
- "loss": 0.2822,
923
  "step": 1500
924
  },
925
  {
926
- "epoch": 1.25,
927
- "learning_rate": 2.326017605245872e-05,
928
- "loss": 0.2885,
929
  "step": 1510
930
  },
931
  {
932
- "epoch": 1.26,
933
- "learning_rate": 2.3164299907640955e-05,
934
- "loss": 0.2852,
935
  "step": 1520
936
  },
937
  {
938
- "epoch": 1.26,
939
- "learning_rate": 2.3067947442442264e-05,
940
- "loss": 0.3022,
941
  "step": 1530
942
  },
943
  {
944
- "epoch": 1.27,
945
- "learning_rate": 2.2971124278243957e-05,
946
- "loss": 0.2666,
947
  "step": 1540
948
  },
949
  {
950
- "epoch": 1.27,
951
- "learning_rate": 2.28738360638888e-05,
952
- "loss": 0.2892,
953
  "step": 1550
954
  },
955
  {
956
- "epoch": 1.28,
957
- "learning_rate": 2.2776088475351445e-05,
958
- "loss": 0.2689,
959
  "step": 1560
960
  },
961
  {
962
- "epoch": 1.28,
963
- "learning_rate": 2.2677887215407278e-05,
964
- "loss": 0.2864,
965
  "step": 1570
966
  },
967
  {
968
- "epoch": 1.29,
969
- "learning_rate": 2.257923801329973e-05,
970
- "loss": 0.281,
971
  "step": 1580
972
  },
973
  {
974
- "epoch": 1.29,
975
- "learning_rate": 2.248014662440599e-05,
976
- "loss": 0.3068,
977
  "step": 1590
978
  },
979
  {
980
- "epoch": 1.3,
981
- "learning_rate": 2.238061882990126e-05,
982
- "loss": 0.2753,
983
  "step": 1600
984
  },
985
  {
986
- "epoch": 1.3,
987
- "learning_rate": 2.2280660436421443e-05,
988
- "loss": 0.2701,
989
  "step": 1610
990
  },
991
  {
992
- "epoch": 1.3,
993
- "learning_rate": 2.2180277275724385e-05,
994
- "loss": 0.2891,
995
  "step": 1620
996
  },
997
  {
998
- "epoch": 1.31,
999
- "learning_rate": 2.2079475204349645e-05,
1000
- "loss": 0.2691,
1001
  "step": 1630
1002
  },
1003
  {
1004
- "epoch": 1.31,
1005
- "learning_rate": 2.1978260103276796e-05,
1006
- "loss": 0.2969,
1007
  "step": 1640
1008
  },
1009
  {
1010
- "epoch": 1.32,
1011
- "learning_rate": 2.187663787758234e-05,
1012
- "loss": 0.2629,
1013
  "step": 1650
1014
  },
1015
  {
1016
- "epoch": 1.32,
1017
- "learning_rate": 2.177461445609518e-05,
1018
- "loss": 0.2746,
1019
  "step": 1660
1020
  },
1021
  {
1022
- "epoch": 1.33,
1023
- "learning_rate": 2.1672195791050712e-05,
1024
- "loss": 0.2775,
1025
  "step": 1670
1026
  },
1027
  {
1028
- "epoch": 1.33,
1029
- "learning_rate": 2.1569387857743596e-05,
1030
- "loss": 0.2654,
1031
  "step": 1680
1032
  },
1033
  {
1034
- "epoch": 1.34,
1035
- "learning_rate": 2.1466196654179107e-05,
1036
- "loss": 0.2678,
1037
  "step": 1690
1038
  },
1039
  {
1040
- "epoch": 1.34,
1041
- "learning_rate": 2.1362628200723228e-05,
1042
- "loss": 0.2606,
1043
  "step": 1700
1044
  },
1045
  {
1046
- "epoch": 1.35,
1047
- "learning_rate": 2.1258688539751387e-05,
1048
- "loss": 0.2886,
1049
  "step": 1710
1050
  },
1051
  {
1052
- "epoch": 1.35,
1053
- "learning_rate": 2.115438373529596e-05,
1054
- "loss": 0.2799,
1055
  "step": 1720
1056
  },
1057
  {
1058
- "epoch": 1.36,
1059
- "learning_rate": 2.104971987269245e-05,
1060
- "loss": 0.2804,
1061
  "step": 1730
1062
  },
1063
  {
1064
- "epoch": 1.36,
1065
- "learning_rate": 2.0944703058224504e-05,
1066
- "loss": 0.2583,
1067
  "step": 1740
1068
  },
1069
  {
1070
- "epoch": 1.36,
1071
- "learning_rate": 2.0839339418767616e-05,
1072
- "loss": 0.2857,
1073
  "step": 1750
1074
  },
1075
  {
1076
- "epoch": 1.37,
1077
- "learning_rate": 2.0733635101431694e-05,
1078
- "loss": 0.2692,
1079
  "step": 1760
1080
  },
1081
  {
1082
- "epoch": 1.37,
1083
- "learning_rate": 2.0627596273202435e-05,
1084
- "loss": 0.2759,
1085
  "step": 1770
1086
  },
1087
  {
1088
- "epoch": 1.38,
1089
- "learning_rate": 2.05212291205815e-05,
1090
- "loss": 0.2685,
1091
  "step": 1780
1092
  },
1093
  {
1094
- "epoch": 1.38,
1095
- "learning_rate": 2.0414539849225637e-05,
1096
- "loss": 0.2511,
1097
  "step": 1790
1098
  },
1099
  {
1100
- "epoch": 1.39,
1101
- "learning_rate": 2.0307534683584565e-05,
1102
- "loss": 0.2764,
1103
  "step": 1800
1104
  },
1105
  {
1106
- "epoch": 1.39,
1107
- "learning_rate": 2.0200219866537882e-05,
1108
- "loss": 0.2763,
1109
  "step": 1810
1110
  },
1111
  {
1112
- "epoch": 1.4,
1113
- "learning_rate": 2.0092601659030807e-05,
1114
- "loss": 0.2731,
1115
  "step": 1820
1116
  },
1117
  {
1118
- "epoch": 1.4,
1119
- "learning_rate": 1.9984686339708927e-05,
1120
- "loss": 0.2803,
1121
  "step": 1830
1122
  },
1123
  {
1124
- "epoch": 1.41,
1125
- "learning_rate": 1.9876480204551894e-05,
1126
- "loss": 0.281,
1127
  "step": 1840
1128
  },
1129
  {
1130
- "epoch": 1.41,
1131
- "learning_rate": 1.976798956650607e-05,
1132
- "loss": 0.2802,
1133
  "step": 1850
1134
  },
1135
  {
1136
- "epoch": 1.42,
1137
- "learning_rate": 1.9659220755116277e-05,
1138
- "loss": 0.2743,
1139
  "step": 1860
1140
  },
1141
  {
1142
- "epoch": 1.42,
1143
- "learning_rate": 1.9550180116156447e-05,
1144
- "loss": 0.2579,
1145
  "step": 1870
1146
  },
1147
  {
1148
- "epoch": 1.42,
1149
- "learning_rate": 1.9440874011259458e-05,
1150
- "loss": 0.2721,
1151
  "step": 1880
1152
  },
1153
  {
1154
- "epoch": 1.43,
1155
- "learning_rate": 1.9331308817545963e-05,
1156
- "loss": 0.2713,
1157
  "step": 1890
1158
  },
1159
  {
1160
- "epoch": 1.43,
1161
- "learning_rate": 1.922149092725233e-05,
1162
- "loss": 0.2874,
1163
  "step": 1900
1164
  },
1165
  {
1166
- "epoch": 1.44,
1167
- "learning_rate": 1.911142674735771e-05,
1168
- "loss": 0.2766,
1169
- "step": 1910
1170
- },
1171
- {
1172
- "epoch": 1.44,
1173
- "learning_rate": 1.900112269921026e-05,
1174
- "loss": 0.2695,
1175
- "step": 1920
1176
- },
1177
- {
1178
- "epoch": 1.44,
1179
- "step": 1922,
1180
- "total_flos": 3.4599459174219776e+17,
1181
- "train_loss": 0.3275021193364904,
1182
- "train_runtime": 3966.2406,
1183
- "train_samples_per_second": 4.365,
1184
- "train_steps_per_second": 1.092
1185
  }
1186
  ],
1187
  "logging_steps": 10,
1188
- "max_steps": 4330,
 
1189
  "num_train_epochs": 2,
1190
  "save_steps": 1000,
1191
- "total_flos": 3.4599459174219776e+17,
 
1192
  "trial_name": null,
1193
  "trial_params": null
1194
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.0,
5
  "eval_steps": 1000,
6
+ "global_step": 1900,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "learning_rate": 3.1578947368421055e-07,
14
+ "loss": 0.7568,
15
  "step": 1
16
  },
17
  {
18
+ "epoch": 0.01,
19
+ "learning_rate": 3.157894736842105e-06,
20
+ "loss": 0.7081,
21
  "step": 10
22
  },
23
  {
24
+ "epoch": 0.02,
25
+ "learning_rate": 6.31578947368421e-06,
26
+ "loss": 0.6806,
27
  "step": 20
28
  },
29
  {
30
+ "epoch": 0.03,
31
+ "learning_rate": 9.473684210526315e-06,
32
+ "loss": 0.6123,
33
  "step": 30
34
  },
35
  {
36
+ "epoch": 0.04,
37
+ "learning_rate": 1.263157894736842e-05,
38
+ "loss": 0.4982,
39
  "step": 40
40
  },
41
  {
42
+ "epoch": 0.05,
43
+ "learning_rate": 1.5789473684210526e-05,
44
+ "loss": 0.4448,
45
  "step": 50
46
  },
47
  {
48
+ "epoch": 0.06,
49
+ "learning_rate": 1.894736842105263e-05,
50
+ "loss": 0.4243,
51
  "step": 60
52
  },
53
  {
54
+ "epoch": 0.07,
55
+ "learning_rate": 2.2105263157894736e-05,
56
+ "loss": 0.3961,
57
  "step": 70
58
  },
59
  {
60
+ "epoch": 0.08,
61
+ "learning_rate": 2.526315789473684e-05,
62
+ "loss": 0.3963,
63
  "step": 80
64
  },
65
  {
66
+ "epoch": 0.09,
67
+ "learning_rate": 2.8421052631578946e-05,
68
+ "loss": 0.3795,
69
  "step": 90
70
  },
71
  {
72
+ "epoch": 0.11,
73
+ "learning_rate": 2.9999432005848255e-05,
74
+ "loss": 0.3786,
75
  "step": 100
76
  },
77
  {
78
+ "epoch": 0.12,
79
+ "learning_rate": 2.99948883107249e-05,
80
+ "loss": 0.3962,
81
  "step": 110
82
  },
83
  {
84
+ "epoch": 0.13,
85
+ "learning_rate": 2.9985802296874666e-05,
86
+ "loss": 0.3566,
87
  "step": 120
88
  },
89
  {
90
+ "epoch": 0.14,
91
+ "learning_rate": 2.9972176716673562e-05,
92
+ "loss": 0.3817,
93
  "step": 130
94
  },
95
  {
96
+ "epoch": 0.15,
97
+ "learning_rate": 2.9954015697643372e-05,
98
+ "loss": 0.3644,
99
  "step": 140
100
  },
101
  {
102
+ "epoch": 0.16,
103
+ "learning_rate": 2.9931324741201325e-05,
104
+ "loss": 0.3497,
105
  "step": 150
106
  },
107
  {
108
+ "epoch": 0.17,
109
+ "learning_rate": 2.9904110720993565e-05,
110
+ "loss": 0.3601,
111
  "step": 160
112
  },
113
  {
114
+ "epoch": 0.18,
115
+ "learning_rate": 2.987238188081299e-05,
116
+ "loss": 0.3877,
117
  "step": 170
118
  },
119
  {
120
+ "epoch": 0.19,
121
+ "learning_rate": 2.983614783210197e-05,
122
+ "loss": 0.3585,
123
  "step": 180
124
  },
125
  {
126
+ "epoch": 0.2,
127
+ "learning_rate": 2.9795419551040836e-05,
128
+ "loss": 0.3498,
129
  "step": 190
130
  },
131
  {
132
+ "epoch": 0.21,
133
+ "learning_rate": 2.9750209375222893e-05,
134
+ "loss": 0.3618,
135
  "step": 200
136
  },
137
  {
138
+ "epoch": 0.22,
139
+ "learning_rate": 2.97005309999171e-05,
140
+ "loss": 0.3496,
141
  "step": 210
142
  },
143
  {
144
+ "epoch": 0.23,
145
+ "learning_rate": 2.964639947391939e-05,
146
+ "loss": 0.3483,
147
  "step": 220
148
  },
149
  {
150
+ "epoch": 0.24,
151
+ "learning_rate": 2.958783119499408e-05,
152
+ "loss": 0.3488,
153
  "step": 230
154
  },
155
  {
156
+ "epoch": 0.25,
157
+ "learning_rate": 2.9524843904906528e-05,
158
+ "loss": 0.3405,
159
  "step": 240
160
  },
161
  {
162
+ "epoch": 0.26,
163
+ "learning_rate": 2.9457456684048772e-05,
164
+ "loss": 0.3441,
165
  "step": 250
166
  },
167
  {
168
+ "epoch": 0.27,
169
+ "learning_rate": 2.938568994565956e-05,
170
+ "loss": 0.3638,
171
  "step": 260
172
  },
173
  {
174
+ "epoch": 0.28,
175
+ "learning_rate": 2.9309565429640724e-05,
176
+ "loss": 0.3695,
177
  "step": 270
178
  },
179
  {
180
+ "epoch": 0.29,
181
+ "learning_rate": 2.9229106195971603e-05,
182
+ "loss": 0.3372,
183
  "step": 280
184
  },
185
  {
186
+ "epoch": 0.31,
187
+ "learning_rate": 2.9144336617723625e-05,
188
+ "loss": 0.354,
189
  "step": 290
190
  },
191
  {
192
+ "epoch": 0.32,
193
+ "learning_rate": 2.90552823736771e-05,
194
+ "loss": 0.358,
195
  "step": 300
196
  },
197
  {
198
+ "epoch": 0.33,
199
+ "learning_rate": 2.8961970440542496e-05,
200
+ "loss": 0.3473,
201
  "step": 310
202
  },
203
  {
204
+ "epoch": 0.34,
205
+ "learning_rate": 2.8864429084788534e-05,
206
+ "loss": 0.3379,
207
  "step": 320
208
  },
209
  {
210
+ "epoch": 0.35,
211
+ "learning_rate": 2.8762687854079563e-05,
212
+ "loss": 0.3794,
213
  "step": 330
214
  },
215
  {
216
+ "epoch": 0.36,
217
+ "learning_rate": 2.8656777568324878e-05,
218
+ "loss": 0.3475,
219
  "step": 340
220
  },
221
  {
222
+ "epoch": 0.37,
223
+ "learning_rate": 2.8546730310342593e-05,
224
+ "loss": 0.3655,
225
  "step": 350
226
  },
227
  {
228
+ "epoch": 0.38,
229
+ "learning_rate": 2.8432579416140984e-05,
230
+ "loss": 0.3354,
231
  "step": 360
232
  },
233
  {
234
+ "epoch": 0.39,
235
+ "learning_rate": 2.8314359464820184e-05,
236
+ "loss": 0.3448,
237
  "step": 370
238
  },
239
  {
240
+ "epoch": 0.4,
241
+ "learning_rate": 2.8192106268097336e-05,
242
+ "loss": 0.3565,
243
  "step": 380
244
  },
245
  {
246
+ "epoch": 0.41,
247
+ "learning_rate": 2.8065856859458346e-05,
248
+ "loss": 0.3436,
249
  "step": 390
250
  },
251
  {
252
+ "epoch": 0.42,
253
+ "learning_rate": 2.7935649482939533e-05,
254
+ "loss": 0.3509,
255
  "step": 400
256
  },
257
  {
258
+ "epoch": 0.43,
259
+ "learning_rate": 2.7801523581542563e-05,
260
+ "loss": 0.3312,
261
  "step": 410
262
  },
263
  {
264
+ "epoch": 0.44,
265
+ "learning_rate": 2.766351978528622e-05,
266
+ "loss": 0.3445,
267
  "step": 420
268
  },
269
  {
270
+ "epoch": 0.45,
271
+ "learning_rate": 2.7521679898898567e-05,
272
+ "loss": 0.3374,
273
  "step": 430
274
  },
275
  {
276
+ "epoch": 0.46,
277
+ "learning_rate": 2.737604688915327e-05,
278
+ "loss": 0.3438,
279
  "step": 440
280
  },
281
  {
282
+ "epoch": 0.47,
283
+ "learning_rate": 2.72266648718539e-05,
284
+ "loss": 0.3307,
285
  "step": 450
286
  },
287
  {
288
+ "epoch": 0.48,
289
+ "learning_rate": 2.7073579098470196e-05,
290
+ "loss": 0.3344,
291
  "step": 460
292
  },
293
  {
294
+ "epoch": 0.49,
295
+ "learning_rate": 2.6916835942430292e-05,
296
+ "loss": 0.325,
297
  "step": 470
298
  },
299
  {
300
+ "epoch": 0.51,
301
+ "learning_rate": 2.6756482885073032e-05,
302
+ "loss": 0.3296,
303
  "step": 480
304
  },
305
  {
306
+ "epoch": 0.52,
307
+ "learning_rate": 2.6592568501264746e-05,
308
+ "loss": 0.3536,
309
  "step": 490
310
  },
311
  {
312
+ "epoch": 0.53,
313
+ "learning_rate": 2.6425142444684735e-05,
314
+ "loss": 0.3272,
315
  "step": 500
316
  },
317
  {
318
+ "epoch": 0.54,
319
+ "learning_rate": 2.6254255432783933e-05,
320
+ "loss": 0.3547,
321
  "step": 510
322
  },
323
  {
324
+ "epoch": 0.55,
325
+ "learning_rate": 2.6079959231421347e-05,
326
+ "loss": 0.3313,
327
  "step": 520
328
  },
329
  {
330
+ "epoch": 0.56,
331
+ "learning_rate": 2.5902306639182952e-05,
332
+ "loss": 0.343,
333
  "step": 530
334
  },
335
  {
336
+ "epoch": 0.57,
337
+ "learning_rate": 2.5721351471387666e-05,
338
+ "loss": 0.3129,
339
  "step": 540
340
  },
341
  {
342
+ "epoch": 0.58,
343
+ "learning_rate": 2.5537148543785385e-05,
344
+ "loss": 0.3505,
345
  "step": 550
346
  },
347
  {
348
+ "epoch": 0.59,
349
+ "learning_rate": 2.534975365595196e-05,
350
+ "loss": 0.3523,
351
  "step": 560
352
  },
353
  {
354
+ "epoch": 0.6,
355
+ "learning_rate": 2.5159223574386117e-05,
356
+ "loss": 0.3699,
357
  "step": 570
358
  },
359
  {
360
+ "epoch": 0.61,
361
+ "learning_rate": 2.496561601531353e-05,
362
+ "loss": 0.315,
363
  "step": 580
364
  },
365
  {
366
+ "epoch": 0.62,
367
+ "learning_rate": 2.4768989627203123e-05,
368
+ "loss": 0.3382,
369
  "step": 590
370
  },
371
  {
372
+ "epoch": 0.63,
373
+ "learning_rate": 2.4569403973001045e-05,
374
+ "loss": 0.3387,
375
  "step": 600
376
  },
377
  {
378
+ "epoch": 0.64,
379
+ "learning_rate": 2.436691951208758e-05,
380
+ "loss": 0.3286,
381
  "step": 610
382
  },
383
  {
384
+ "epoch": 0.65,
385
+ "learning_rate": 2.4161597581962526e-05,
386
+ "loss": 0.3096,
387
  "step": 620
388
  },
389
  {
390
+ "epoch": 0.66,
391
+ "learning_rate": 2.395350037966456e-05,
392
+ "loss": 0.3384,
393
  "step": 630
394
  },
395
  {
396
+ "epoch": 0.67,
397
+ "learning_rate": 2.3742690942930235e-05,
398
+ "loss": 0.3284,
399
  "step": 640
400
  },
401
  {
402
+ "epoch": 0.68,
403
+ "learning_rate": 2.3529233131098313e-05,
404
+ "loss": 0.3295,
405
  "step": 650
406
  },
407
  {
408
+ "epoch": 0.69,
409
+ "learning_rate": 2.33131916057652e-05,
410
+ "loss": 0.3164,
411
  "step": 660
412
  },
413
  {
414
+ "epoch": 0.71,
415
+ "learning_rate": 2.309463181119736e-05,
416
+ "loss": 0.3597,
417
  "step": 670
418
  },
419
  {
420
+ "epoch": 0.72,
421
+ "learning_rate": 2.287361995450667e-05,
422
+ "loss": 0.3239,
423
  "step": 680
424
  },
425
  {
426
+ "epoch": 0.73,
427
+ "learning_rate": 2.2650222985594634e-05,
428
+ "loss": 0.3298,
429
  "step": 690
430
  },
431
  {
432
+ "epoch": 0.74,
433
+ "learning_rate": 2.2424508576871623e-05,
434
+ "loss": 0.3176,
435
  "step": 700
436
  },
437
  {
438
+ "epoch": 0.75,
439
+ "learning_rate": 2.219654510275728e-05,
440
+ "loss": 0.3344,
441
  "step": 710
442
  },
443
  {
444
+ "epoch": 0.76,
445
+ "learning_rate": 2.1966401618968194e-05,
446
+ "loss": 0.3518,
447
  "step": 720
448
  },
449
  {
450
+ "epoch": 0.77,
451
+ "learning_rate": 2.173414784159925e-05,
452
+ "loss": 0.3323,
453
  "step": 730
454
  },
455
  {
456
+ "epoch": 0.78,
457
+ "learning_rate": 2.149985412600492e-05,
458
+ "loss": 0.3343,
459
  "step": 740
460
  },
461
  {
462
+ "epoch": 0.79,
463
+ "learning_rate": 2.1263591445486895e-05,
464
+ "loss": 0.3383,
465
  "step": 750
466
  },
467
  {
468
+ "epoch": 0.8,
469
+ "learning_rate": 2.1025431369794546e-05,
470
+ "loss": 0.3135,
471
  "step": 760
472
  },
473
  {
474
+ "epoch": 0.81,
475
+ "learning_rate": 2.0785446043444677e-05,
476
+ "loss": 0.3278,
477
  "step": 770
478
  },
479
  {
480
+ "epoch": 0.82,
481
+ "learning_rate": 2.0543708163867204e-05,
482
+ "loss": 0.3148,
483
  "step": 780
484
  },
485
  {
486
+ "epoch": 0.83,
487
+ "learning_rate": 2.0300290959383318e-05,
488
+ "loss": 0.3511,
489
  "step": 790
490
  },
491
  {
492
+ "epoch": 0.84,
493
+ "learning_rate": 2.0055268167022835e-05,
494
+ "loss": 0.3146,
495
  "step": 800
496
  },
497
  {
498
+ "epoch": 0.85,
499
+ "learning_rate": 1.9808714010187425e-05,
500
+ "loss": 0.3321,
501
  "step": 810
502
  },
503
  {
504
+ "epoch": 0.86,
505
+ "learning_rate": 1.9560703176166565e-05,
506
+ "loss": 0.3373,
507
  "step": 820
508
  },
509
  {
510
+ "epoch": 0.87,
511
+ "learning_rate": 1.931131079351289e-05,
512
+ "loss": 0.32,
513
  "step": 830
514
  },
515
  {
516
+ "epoch": 0.88,
517
+ "learning_rate": 1.9060612409283946e-05,
518
+ "loss": 0.3106,
519
  "step": 840
520
  },
521
  {
522
+ "epoch": 0.89,
523
+ "learning_rate": 1.8808683966157132e-05,
524
+ "loss": 0.3167,
525
  "step": 850
526
  },
527
  {
528
+ "epoch": 0.91,
529
+ "learning_rate": 1.8555601779424778e-05,
530
+ "loss": 0.2993,
531
  "step": 860
532
  },
533
  {
534
+ "epoch": 0.92,
535
+ "learning_rate": 1.8301442513876406e-05,
536
+ "loss": 0.3405,
537
  "step": 870
538
  },
539
  {
540
+ "epoch": 0.93,
541
+ "learning_rate": 1.804628316057508e-05,
542
+ "loss": 0.3346,
543
  "step": 880
544
  },
545
  {
546
+ "epoch": 0.94,
547
+ "learning_rate": 1.779020101353492e-05,
548
+ "loss": 0.3319,
549
  "step": 890
550
  },
551
  {
552
+ "epoch": 0.95,
553
+ "learning_rate": 1.7533273646306857e-05,
554
+ "loss": 0.3087,
555
  "step": 900
556
  },
557
  {
558
+ "epoch": 0.96,
559
+ "learning_rate": 1.7275578888479714e-05,
560
+ "loss": 0.316,
561
  "step": 910
562
  },
563
  {
564
+ "epoch": 0.97,
565
+ "learning_rate": 1.7017194802103705e-05,
566
+ "loss": 0.3054,
567
  "step": 920
568
  },
569
  {
570
+ "epoch": 0.98,
571
+ "learning_rate": 1.6758199658043538e-05,
572
+ "loss": 0.3255,
573
  "step": 930
574
  },
575
  {
576
+ "epoch": 0.99,
577
+ "learning_rate": 1.6498671912268256e-05,
578
+ "loss": 0.3175,
579
  "step": 940
580
  },
581
  {
582
+ "epoch": 1.0,
583
+ "learning_rate": 1.623869018208499e-05,
584
+ "loss": 0.3424,
585
  "step": 950
586
  },
587
  {
588
+ "epoch": 1.01,
589
+ "learning_rate": 1.5978333222323858e-05,
590
+ "loss": 0.2825,
591
  "step": 960
592
  },
593
  {
594
+ "epoch": 1.02,
595
+ "learning_rate": 1.571767990148122e-05,
596
+ "loss": 0.2886,
597
  "step": 970
598
  },
599
  {
600
+ "epoch": 1.03,
601
+ "learning_rate": 1.5456809177828444e-05,
602
+ "loss": 0.3196,
603
  "step": 980
604
  },
605
  {
606
+ "epoch": 1.04,
607
+ "learning_rate": 1.5195800075493542e-05,
608
+ "loss": 0.3178,
609
  "step": 990
610
  },
611
  {
612
+ "epoch": 1.05,
613
+ "learning_rate": 1.4934731660522817e-05,
614
+ "loss": 0.2676,
615
  "step": 1000
616
  },
617
  {
618
+ "epoch": 1.05,
619
+ "eval_loss": 0.508576512336731,
620
+ "eval_runtime": 12.2399,
621
+ "eval_samples_per_second": 3.595,
622
+ "eval_steps_per_second": 0.899,
623
  "step": 1000
624
  },
625
  {
626
+ "epoch": 1.06,
627
+ "learning_rate": 1.4673683016929805e-05,
628
+ "loss": 0.2803,
629
  "step": 1010
630
  },
631
  {
632
+ "epoch": 1.07,
633
+ "learning_rate": 1.441273322273884e-05,
634
+ "loss": 0.319,
635
  "step": 1020
636
  },
637
  {
638
+ "epoch": 1.08,
639
+ "learning_rate": 1.4151961326030314e-05,
640
+ "loss": 0.2965,
641
  "step": 1030
642
  },
643
  {
644
+ "epoch": 1.09,
645
+ "learning_rate": 1.3891446320995143e-05,
646
+ "loss": 0.3067,
647
  "step": 1040
648
  },
649
  {
650
+ "epoch": 1.11,
651
+ "learning_rate": 1.3631267124005453e-05,
652
+ "loss": 0.3072,
653
  "step": 1050
654
  },
655
  {
656
+ "epoch": 1.12,
657
+ "learning_rate": 1.337150254970891e-05,
658
+ "loss": 0.2963,
659
  "step": 1060
660
  },
661
  {
662
+ "epoch": 1.13,
663
+ "learning_rate": 1.3112231287153798e-05,
664
+ "loss": 0.3063,
665
  "step": 1070
666
  },
667
  {
668
+ "epoch": 1.14,
669
+ "learning_rate": 1.28535318759522e-05,
670
+ "loss": 0.2732,
671
  "step": 1080
672
  },
673
  {
674
+ "epoch": 1.15,
675
+ "learning_rate": 1.2595482682488443e-05,
676
+ "loss": 0.2874,
677
  "step": 1090
678
  },
679
  {
680
+ "epoch": 1.16,
681
+ "learning_rate": 1.2338161876179964e-05,
682
+ "loss": 0.2998,
683
  "step": 1100
684
  },
685
  {
686
+ "epoch": 1.17,
687
+ "learning_rate": 1.2081647405797923e-05,
688
+ "loss": 0.2849,
689
  "step": 1110
690
  },
691
  {
692
+ "epoch": 1.18,
693
+ "learning_rate": 1.1826016975854563e-05,
694
+ "loss": 0.2735,
695
  "step": 1120
696
  },
697
  {
698
+ "epoch": 1.19,
699
+ "learning_rate": 1.1571348023064662e-05,
700
+ "loss": 0.2937,
701
  "step": 1130
702
  },
703
  {
704
+ "epoch": 1.2,
705
+ "learning_rate": 1.1317717692888014e-05,
706
+ "loss": 0.2993,
707
  "step": 1140
708
  },
709
  {
710
+ "epoch": 1.21,
711
+ "learning_rate": 1.1065202816160213e-05,
712
+ "loss": 0.2933,
713
  "step": 1150
714
  },
715
  {
716
+ "epoch": 1.22,
717
+ "learning_rate": 1.081387988581869e-05,
718
+ "loss": 0.285,
719
  "step": 1160
720
  },
721
  {
722
+ "epoch": 1.23,
723
+ "learning_rate": 1.0563825033731146e-05,
724
+ "loss": 0.2896,
725
  "step": 1170
726
  },
727
  {
728
+ "epoch": 1.24,
729
+ "learning_rate": 1.031511400763332e-05,
730
+ "loss": 0.2874,
731
  "step": 1180
732
  },
733
  {
734
+ "epoch": 1.25,
735
+ "learning_rate": 1.0067822148183194e-05,
736
+ "loss": 0.2851,
737
  "step": 1190
738
  },
739
  {
740
+ "epoch": 1.26,
741
+ "learning_rate": 9.822024366138397e-06,
742
+ "loss": 0.2925,
743
  "step": 1200
744
  },
745
  {
746
+ "epoch": 1.27,
747
+ "learning_rate": 9.577795119663966e-06,
748
+ "loss": 0.2843,
749
  "step": 1210
750
  },
751
  {
752
+ "epoch": 1.28,
753
+ "learning_rate": 9.335208391777106e-06,
754
+ "loss": 0.2884,
755
  "step": 1220
756
  },
757
  {
758
+ "epoch": 1.29,
759
+ "learning_rate": 9.094337667935942e-06,
760
+ "loss": 0.269,
761
  "step": 1230
762
  },
763
  {
764
+ "epoch": 1.31,
765
+ "learning_rate": 8.855255913778949e-06,
766
+ "loss": 0.2849,
767
  "step": 1240
768
  },
769
  {
770
+ "epoch": 1.32,
771
+ "learning_rate": 8.618035553021925e-06,
772
+ "loss": 0.304,
773
  "step": 1250
774
  },
775
  {
776
+ "epoch": 1.33,
777
+ "learning_rate": 8.382748445519008e-06,
778
+ "loss": 0.3049,
779
  "step": 1260
780
  },
781
  {
782
+ "epoch": 1.34,
783
+ "learning_rate": 8.149465865494633e-06,
784
+ "loss": 0.2999,
785
  "step": 1270
786
  },
787
  {
788
+ "epoch": 1.35,
789
+ "learning_rate": 7.918258479952763e-06,
790
+ "loss": 0.2835,
791
  "step": 1280
792
  },
793
  {
794
+ "epoch": 1.36,
795
+ "learning_rate": 7.689196327270171e-06,
796
+ "loss": 0.3167,
797
  "step": 1290
798
  },
799
  {
800
+ "epoch": 1.37,
801
+ "learning_rate": 7.462348795980088e-06,
802
+ "loss": 0.2842,
803
  "step": 1300
804
  },
805
  {
806
+ "epoch": 1.38,
807
+ "learning_rate": 7.237784603752705e-06,
808
+ "loss": 0.2909,
809
  "step": 1310
810
  },
811
  {
812
+ "epoch": 1.39,
813
+ "learning_rate": 7.015571776578922e-06,
814
+ "loss": 0.2881,
815
  "step": 1320
816
  },
817
  {
818
+ "epoch": 1.4,
819
+ "learning_rate": 6.795777628163599e-06,
820
+ "loss": 0.2796,
821
  "step": 1330
822
  },
823
  {
824
+ "epoch": 1.41,
825
+ "learning_rate": 6.578468739534602e-06,
826
+ "loss": 0.3056,
827
  "step": 1340
828
  },
829
  {
830
+ "epoch": 1.42,
831
+ "learning_rate": 6.363710938873759e-06,
832
+ "loss": 0.2987,
833
  "step": 1350
834
  },
835
  {
836
+ "epoch": 1.43,
837
+ "learning_rate": 6.151569281575925e-06,
838
+ "loss": 0.288,
839
  "step": 1360
840
  },
841
  {
842
+ "epoch": 1.44,
843
+ "learning_rate": 5.942108030542074e-06,
844
+ "loss": 0.2954,
845
  "step": 1370
846
  },
847
  {
848
+ "epoch": 1.45,
849
+ "learning_rate": 5.735390636712514e-06,
850
+ "loss": 0.2837,
851
  "step": 1380
852
  },
853
  {
854
+ "epoch": 1.46,
855
+ "learning_rate": 5.531479719846038e-06,
856
+ "loss": 0.3055,
857
  "step": 1390
858
  },
859
  {
860
+ "epoch": 1.47,
861
+ "learning_rate": 5.330437049550868e-06,
862
+ "loss": 0.313,
863
  "step": 1400
864
  },
865
  {
866
+ "epoch": 1.48,
867
+ "learning_rate": 5.132323526573126e-06,
868
+ "loss": 0.2966,
869
  "step": 1410
870
  },
871
  {
872
+ "epoch": 1.49,
873
+ "learning_rate": 4.937199164348521e-06,
874
+ "loss": 0.2741,
875
  "step": 1420
876
  },
877
  {
878
+ "epoch": 1.51,
879
+ "learning_rate": 4.745123070822786e-06,
880
+ "loss": 0.2973,
881
  "step": 1430
882
  },
883
  {
884
+ "epoch": 1.52,
885
+ "learning_rate": 4.556153430546451e-06,
886
+ "loss": 0.281,
887
  "step": 1440
888
  },
889
  {
890
+ "epoch": 1.53,
891
+ "learning_rate": 4.370347487049313e-06,
892
+ "loss": 0.2905,
893
  "step": 1450
894
  },
895
  {
896
+ "epoch": 1.54,
897
+ "learning_rate": 4.187761525499973e-06,
898
+ "loss": 0.2806,
899
  "step": 1460
900
  },
901
  {
902
+ "epoch": 1.55,
903
+ "learning_rate": 4.008450855655675e-06,
904
+ "loss": 0.2716,
905
  "step": 1470
906
  },
907
  {
908
+ "epoch": 1.56,
909
+ "learning_rate": 3.83246979510764e-06,
910
+ "loss": 0.2834,
911
  "step": 1480
912
  },
913
  {
914
+ "epoch": 1.57,
915
+ "learning_rate": 3.676977737529078e-06,
916
+ "loss": 0.2809,
917
  "step": 1490
918
  },
919
  {
920
+ "epoch": 1.58,
921
+ "learning_rate": 3.5074689542164895e-06,
922
+ "loss": 0.2845,
923
  "step": 1500
924
  },
925
  {
926
+ "epoch": 1.59,
927
+ "learning_rate": 3.341441539881574e-06,
928
+ "loss": 0.3001,
929
  "step": 1510
930
  },
931
  {
932
+ "epoch": 1.6,
933
+ "learning_rate": 3.1789457882922753e-06,
934
+ "loss": 0.2941,
935
  "step": 1520
936
  },
937
  {
938
+ "epoch": 1.61,
939
+ "learning_rate": 3.020030923389471e-06,
940
+ "loss": 0.2917,
941
  "step": 1530
942
  },
943
  {
944
+ "epoch": 1.62,
945
+ "learning_rate": 2.86474508437579e-06,
946
+ "loss": 0.293,
947
  "step": 1540
948
  },
949
  {
950
+ "epoch": 1.63,
951
+ "learning_rate": 2.7131353111330843e-06,
952
+ "loss": 0.2941,
953
  "step": 1550
954
  },
955
  {
956
+ "epoch": 1.64,
957
+ "learning_rate": 2.565247529972901e-06,
958
+ "loss": 0.2787,
959
  "step": 1560
960
  },
961
  {
962
+ "epoch": 1.65,
963
+ "learning_rate": 2.4211265397242854e-06,
964
+ "loss": 0.2899,
965
  "step": 1570
966
  },
967
  {
968
+ "epoch": 1.66,
969
+ "learning_rate": 2.280815998163083e-06,
970
+ "loss": 0.2814,
971
  "step": 1580
972
  },
973
  {
974
+ "epoch": 1.67,
975
+ "learning_rate": 2.144358408786986e-06,
976
+ "loss": 0.2786,
977
  "step": 1590
978
  },
979
  {
980
+ "epoch": 1.68,
981
+ "learning_rate": 2.011795107940138e-06,
982
+ "loss": 0.2916,
983
  "step": 1600
984
  },
985
  {
986
+ "epoch": 1.69,
987
+ "learning_rate": 1.8831662522913594e-06,
988
+ "loss": 0.2834,
989
  "step": 1610
990
  },
991
  {
992
+ "epoch": 1.71,
993
+ "learning_rate": 1.7585108066697136e-06,
994
+ "loss": 0.2735,
995
  "step": 1620
996
  },
997
  {
998
+ "epoch": 1.72,
999
+ "learning_rate": 1.6378665322611002e-06,
1000
+ "loss": 0.3039,
1001
  "step": 1630
1002
  },
1003
  {
1004
+ "epoch": 1.73,
1005
+ "learning_rate": 1.521269975169471e-06,
1006
+ "loss": 0.2872,
1007
  "step": 1640
1008
  },
1009
  {
1010
+ "epoch": 1.74,
1011
+ "learning_rate": 1.408756455346114e-06,
1012
+ "loss": 0.2863,
1013
  "step": 1650
1014
  },
1015
  {
1016
+ "epoch": 1.75,
1017
+ "learning_rate": 1.3003600558903927e-06,
1018
+ "loss": 0.2854,
1019
  "step": 1660
1020
  },
1021
  {
1022
+ "epoch": 1.76,
1023
+ "learning_rate": 1.196113612725116e-06,
1024
+ "loss": 0.2727,
1025
  "step": 1670
1026
  },
1027
  {
1028
+ "epoch": 1.77,
1029
+ "learning_rate": 1.0960487046497524e-06,
1030
+ "loss": 0.2763,
1031
  "step": 1680
1032
  },
1033
  {
1034
+ "epoch": 1.78,
1035
+ "learning_rate": 1.000195643774431e-06,
1036
+ "loss": 0.2962,
1037
  "step": 1690
1038
  },
1039
  {
1040
+ "epoch": 1.79,
1041
+ "learning_rate": 9.085834663376629e-07,
1042
+ "loss": 0.2888,
1043
  "step": 1700
1044
  },
1045
  {
1046
+ "epoch": 1.8,
1047
+ "learning_rate": 8.212399239105534e-07,
1048
+ "loss": 0.2839,
1049
  "step": 1710
1050
  },
1051
  {
1052
+ "epoch": 1.81,
1053
+ "learning_rate": 7.381914749901752e-07,
1054
+ "loss": 0.2789,
1055
  "step": 1720
1056
  },
1057
  {
1058
+ "epoch": 1.82,
1059
+ "learning_rate": 6.594632769846353e-07,
1060
+ "loss": 0.2772,
1061
  "step": 1730
1062
  },
1063
  {
1064
+ "epoch": 1.83,
1065
+ "learning_rate": 5.850791785922849e-07,
1066
+ "loss": 0.278,
1067
  "step": 1740
1068
  },
1069
  {
1070
+ "epoch": 1.84,
1071
+ "learning_rate": 5.150617125773633e-07,
1072
+ "loss": 0.2878,
1073
  "step": 1750
1074
  },
1075
  {
1076
+ "epoch": 1.85,
1077
+ "learning_rate": 4.494320889442749e-07,
1078
+ "loss": 0.2734,
1079
  "step": 1760
1080
  },
1081
  {
1082
+ "epoch": 1.86,
1083
+ "learning_rate": 3.882101885125539e-07,
1084
+ "loss": 0.2826,
1085
  "step": 1770
1086
  },
1087
  {
1088
+ "epoch": 1.87,
1089
+ "learning_rate": 3.3141455689448266e-07,
1090
+ "loss": 0.2875,
1091
  "step": 1780
1092
  },
1093
  {
1094
+ "epoch": 1.88,
1095
+ "learning_rate": 2.790623988771712e-07,
1096
+ "loss": 0.2898,
1097
  "step": 1790
1098
  },
1099
  {
1100
+ "epoch": 1.89,
1101
+ "learning_rate": 2.3116957321080102e-07,
1102
+ "loss": 0.2919,
1103
  "step": 1800
1104
  },
1105
  {
1106
+ "epoch": 1.91,
1107
+ "learning_rate": 1.8775058780463094e-07,
1108
+ "loss": 0.2778,
1109
  "step": 1810
1110
  },
1111
  {
1112
+ "epoch": 1.92,
1113
+ "learning_rate": 1.4881859533218466e-07,
1114
+ "loss": 0.3179,
1115
  "step": 1820
1116
  },
1117
  {
1118
+ "epoch": 1.93,
1119
+ "learning_rate": 1.1438538924699094e-07,
1120
+ "loss": 0.2771,
1121
  "step": 1830
1122
  },
1123
  {
1124
+ "epoch": 1.94,
1125
+ "learning_rate": 8.446140021006132e-08,
1126
+ "loss": 0.2817,
1127
  "step": 1840
1128
  },
1129
  {
1130
+ "epoch": 1.95,
1131
+ "learning_rate": 5.9055692930179426e-08,
1132
+ "loss": 0.2903,
1133
  "step": 1850
1134
  },
1135
  {
1136
+ "epoch": 1.96,
1137
+ "learning_rate": 3.8175963417980685e-08,
1138
+ "loss": 0.2934,
1139
  "step": 1860
1140
  },
1141
  {
1142
+ "epoch": 1.97,
1143
+ "learning_rate": 2.1828536654647235e-08,
1144
+ "loss": 0.2952,
1145
  "step": 1870
1146
  },
1147
  {
1148
+ "epoch": 1.98,
1149
+ "learning_rate": 1.0018364675912217e-08,
1150
+ "loss": 0.2881,
1151
  "step": 1880
1152
  },
1153
  {
1154
+ "epoch": 1.99,
1155
+ "learning_rate": 2.7490250719663933e-09,
1156
+ "loss": 0.2701,
1157
  "step": 1890
1158
  },
1159
  {
1160
+ "epoch": 2.0,
1161
+ "learning_rate": 2.2719903721712954e-11,
1162
+ "loss": 0.2955,
1163
  "step": 1900
1164
  },
1165
  {
1166
+ "epoch": 2.0,
1167
+ "step": 1900,
1168
+ "total_flos": 3.4203213408659046e+17,
1169
+ "train_loss": 0.3227183536165639,
1170
+ "train_runtime": 6710.5312,
1171
+ "train_samples_per_second": 1.132,
1172
+ "train_steps_per_second": 0.283
 
 
 
 
 
 
 
 
 
 
 
 
1173
  }
1174
  ],
1175
  "logging_steps": 10,
1176
+ "max_steps": 1900,
1177
+ "num_input_tokens_seen": 0,
1178
  "num_train_epochs": 2,
1179
  "save_steps": 1000,
1180
+ "total_flos": 3.4203213408659046e+17,
1181
+ "train_batch_size": 4,
1182
  "trial_name": null,
1183
  "trial_params": null
1184
  }