jacpacd commited on
Commit
d03f590
·
verified ·
1 Parent(s): 958559d

Upload checkpoint-3000\trainer_state.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. checkpoint-3000//trainer_state.json +724 -0
checkpoint-3000//trainer_state.json ADDED
@@ -0,0 +1,724 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 3000,
3
+ "best_metric": 0.8622779005977237,
4
+ "best_model_checkpoint": "./model_checkpoint\\checkpoint-3000",
5
+ "epoch": 3.927308447937132,
6
+ "eval_steps": 100,
7
+ "global_step": 3000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.06548788474132286,
14
+ "grad_norm": 0.932673454284668,
15
+ "learning_rate": 4.9000000000000005e-06,
16
+ "loss": 0.6777,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.13097576948264572,
21
+ "grad_norm": 1.4263807535171509,
22
+ "learning_rate": 9.900000000000002e-06,
23
+ "loss": 0.584,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 0.13097576948264572,
28
+ "eval_accuracy": 0.7383116351428806,
29
+ "eval_loss": 0.4755142033100128,
30
+ "eval_runtime": 16.8317,
31
+ "eval_samples_per_second": 725.593,
32
+ "eval_steps_per_second": 45.39,
33
+ "step": 100
34
+ },
35
+ {
36
+ "epoch": 0.19646365422396855,
37
+ "grad_norm": 2.0286386013031006,
38
+ "learning_rate": 1.49e-05,
39
+ "loss": 0.4077,
40
+ "step": 150
41
+ },
42
+ {
43
+ "epoch": 0.26195153896529144,
44
+ "grad_norm": 1.7711361646652222,
45
+ "learning_rate": 1.9900000000000003e-05,
46
+ "loss": 0.2967,
47
+ "step": 200
48
+ },
49
+ {
50
+ "epoch": 0.26195153896529144,
51
+ "eval_accuracy": 0.8327192336035372,
52
+ "eval_loss": 0.2789490818977356,
53
+ "eval_runtime": 17.0163,
54
+ "eval_samples_per_second": 717.724,
55
+ "eval_steps_per_second": 44.898,
56
+ "step": 200
57
+ },
58
+ {
59
+ "epoch": 0.3274394237066143,
60
+ "grad_norm": 1.2721681594848633,
61
+ "learning_rate": 2.4900000000000002e-05,
62
+ "loss": 0.2954,
63
+ "step": 250
64
+ },
65
+ {
66
+ "epoch": 0.3929273084479371,
67
+ "grad_norm": 0.7673302292823792,
68
+ "learning_rate": 2.9900000000000002e-05,
69
+ "loss": 0.2724,
70
+ "step": 300
71
+ },
72
+ {
73
+ "epoch": 0.3929273084479371,
74
+ "eval_accuracy": 0.8397609105051994,
75
+ "eval_loss": 0.2580530345439911,
76
+ "eval_runtime": 17.1319,
77
+ "eval_samples_per_second": 712.879,
78
+ "eval_steps_per_second": 44.595,
79
+ "step": 300
80
+ },
81
+ {
82
+ "epoch": 0.45841519318926,
83
+ "grad_norm": 0.6606993079185486,
84
+ "learning_rate": 3.49e-05,
85
+ "loss": 0.2524,
86
+ "step": 350
87
+ },
88
+ {
89
+ "epoch": 0.5239030779305829,
90
+ "grad_norm": 0.9855170249938965,
91
+ "learning_rate": 3.9800000000000005e-05,
92
+ "loss": 0.246,
93
+ "step": 400
94
+ },
95
+ {
96
+ "epoch": 0.5239030779305829,
97
+ "eval_accuracy": 0.8489314664701547,
98
+ "eval_loss": 0.24741230905056,
99
+ "eval_runtime": 17.2565,
100
+ "eval_samples_per_second": 707.732,
101
+ "eval_steps_per_second": 44.273,
102
+ "step": 400
103
+ },
104
+ {
105
+ "epoch": 0.5893909626719057,
106
+ "grad_norm": 0.863400936126709,
107
+ "learning_rate": 4.4800000000000005e-05,
108
+ "loss": 0.2509,
109
+ "step": 450
110
+ },
111
+ {
112
+ "epoch": 0.6548788474132285,
113
+ "grad_norm": 0.2728968858718872,
114
+ "learning_rate": 4.9800000000000004e-05,
115
+ "loss": 0.2502,
116
+ "step": 500
117
+ },
118
+ {
119
+ "epoch": 0.6548788474132285,
120
+ "eval_accuracy": 0.8481126668304266,
121
+ "eval_loss": 0.24498052895069122,
122
+ "eval_runtime": 17.4414,
123
+ "eval_samples_per_second": 700.229,
124
+ "eval_steps_per_second": 43.804,
125
+ "step": 500
126
+ },
127
+ {
128
+ "epoch": 0.7203667321545514,
129
+ "grad_norm": 0.5378761291503906,
130
+ "learning_rate": 4.906103286384977e-05,
131
+ "loss": 0.2579,
132
+ "step": 550
133
+ },
134
+ {
135
+ "epoch": 0.7858546168958742,
136
+ "grad_norm": 0.2830002009868622,
137
+ "learning_rate": 4.808294209702661e-05,
138
+ "loss": 0.2367,
139
+ "step": 600
140
+ },
141
+ {
142
+ "epoch": 0.7858546168958742,
143
+ "eval_accuracy": 0.8522066650290674,
144
+ "eval_loss": 0.23902325332164764,
145
+ "eval_runtime": 17.3908,
146
+ "eval_samples_per_second": 702.27,
147
+ "eval_steps_per_second": 43.931,
148
+ "step": 600
149
+ },
150
+ {
151
+ "epoch": 0.8513425016371972,
152
+ "grad_norm": 0.5070816874504089,
153
+ "learning_rate": 4.710485133020345e-05,
154
+ "loss": 0.2405,
155
+ "step": 650
156
+ },
157
+ {
158
+ "epoch": 0.91683038637852,
159
+ "grad_norm": 6.335970401763916,
160
+ "learning_rate": 4.6126760563380286e-05,
161
+ "loss": 0.2453,
162
+ "step": 700
163
+ },
164
+ {
165
+ "epoch": 0.91683038637852,
166
+ "eval_accuracy": 0.850159665929747,
167
+ "eval_loss": 0.2547371983528137,
168
+ "eval_runtime": 17.4528,
169
+ "eval_samples_per_second": 699.773,
170
+ "eval_steps_per_second": 43.775,
171
+ "step": 700
172
+ },
173
+ {
174
+ "epoch": 0.9823182711198428,
175
+ "grad_norm": 0.29032933712005615,
176
+ "learning_rate": 4.514866979655713e-05,
177
+ "loss": 0.2473,
178
+ "step": 750
179
+ },
180
+ {
181
+ "epoch": 1.0471512770137525,
182
+ "grad_norm": 0.278637558221817,
183
+ "learning_rate": 4.417057902973396e-05,
184
+ "loss": 0.2596,
185
+ "step": 800
186
+ },
187
+ {
188
+ "epoch": 1.0471512770137525,
189
+ "eval_accuracy": 0.8526979448129043,
190
+ "eval_loss": 0.23811034858226776,
191
+ "eval_runtime": 17.4788,
192
+ "eval_samples_per_second": 698.732,
193
+ "eval_steps_per_second": 43.71,
194
+ "step": 800
195
+ },
196
+ {
197
+ "epoch": 1.1126391617550753,
198
+ "grad_norm": 2.2316906452178955,
199
+ "learning_rate": 4.3192488262910805e-05,
200
+ "loss": 0.2351,
201
+ "step": 850
202
+ },
203
+ {
204
+ "epoch": 1.1781270464963982,
205
+ "grad_norm": 6.879007339477539,
206
+ "learning_rate": 4.221439749608764e-05,
207
+ "loss": 0.2388,
208
+ "step": 900
209
+ },
210
+ {
211
+ "epoch": 1.1781270464963982,
212
+ "eval_accuracy": 0.8549905838041432,
213
+ "eval_loss": 0.2344694286584854,
214
+ "eval_runtime": 17.4998,
215
+ "eval_samples_per_second": 697.893,
216
+ "eval_steps_per_second": 43.658,
217
+ "step": 900
218
+ },
219
+ {
220
+ "epoch": 1.243614931237721,
221
+ "grad_norm": 0.19675926864147186,
222
+ "learning_rate": 4.123630672926448e-05,
223
+ "loss": 0.242,
224
+ "step": 950
225
+ },
226
+ {
227
+ "epoch": 1.3091028159790439,
228
+ "grad_norm": 0.2648178040981293,
229
+ "learning_rate": 4.0258215962441316e-05,
230
+ "loss": 0.2267,
231
+ "step": 1000
232
+ },
233
+ {
234
+ "epoch": 1.3091028159790439,
235
+ "eval_accuracy": 0.8549905838041432,
236
+ "eval_loss": 0.23283295333385468,
237
+ "eval_runtime": 17.5288,
238
+ "eval_samples_per_second": 696.739,
239
+ "eval_steps_per_second": 43.585,
240
+ "step": 1000
241
+ },
242
+ {
243
+ "epoch": 1.3745907007203666,
244
+ "grad_norm": 0.2951018214225769,
245
+ "learning_rate": 3.928012519561816e-05,
246
+ "loss": 0.2347,
247
+ "step": 1050
248
+ },
249
+ {
250
+ "epoch": 1.4400785854616895,
251
+ "grad_norm": 0.9093786478042603,
252
+ "learning_rate": 3.830203442879499e-05,
253
+ "loss": 0.2381,
254
+ "step": 1100
255
+ },
256
+ {
257
+ "epoch": 1.4400785854616895,
258
+ "eval_accuracy": 0.8542536641283878,
259
+ "eval_loss": 0.2354062795639038,
260
+ "eval_runtime": 17.5883,
261
+ "eval_samples_per_second": 694.384,
262
+ "eval_steps_per_second": 43.438,
263
+ "step": 1100
264
+ },
265
+ {
266
+ "epoch": 1.5055664702030125,
267
+ "grad_norm": 0.43629541993141174,
268
+ "learning_rate": 3.7323943661971835e-05,
269
+ "loss": 0.2343,
270
+ "step": 1150
271
+ },
272
+ {
273
+ "epoch": 1.5710543549443354,
274
+ "grad_norm": 0.46896249055862427,
275
+ "learning_rate": 3.634585289514868e-05,
276
+ "loss": 0.2448,
277
+ "step": 1200
278
+ },
279
+ {
280
+ "epoch": 1.5710543549443354,
281
+ "eval_accuracy": 0.8555637435519529,
282
+ "eval_loss": 0.231441468000412,
283
+ "eval_runtime": 17.7028,
284
+ "eval_samples_per_second": 689.891,
285
+ "eval_steps_per_second": 43.157,
286
+ "step": 1200
287
+ },
288
+ {
289
+ "epoch": 1.6365422396856582,
290
+ "grad_norm": 3.0710296630859375,
291
+ "learning_rate": 3.536776212832551e-05,
292
+ "loss": 0.215,
293
+ "step": 1250
294
+ },
295
+ {
296
+ "epoch": 1.7020301244269809,
297
+ "grad_norm": 0.28947174549102783,
298
+ "learning_rate": 3.4389671361502353e-05,
299
+ "loss": 0.2341,
300
+ "step": 1300
301
+ },
302
+ {
303
+ "epoch": 1.7020301244269809,
304
+ "eval_accuracy": 0.8565463031196267,
305
+ "eval_loss": 0.22934912145137787,
306
+ "eval_runtime": 17.7508,
307
+ "eval_samples_per_second": 688.025,
308
+ "eval_steps_per_second": 43.04,
309
+ "step": 1300
310
+ },
311
+ {
312
+ "epoch": 1.7675180091683038,
313
+ "grad_norm": 0.916119396686554,
314
+ "learning_rate": 3.341158059467919e-05,
315
+ "loss": 0.2291,
316
+ "step": 1350
317
+ },
318
+ {
319
+ "epoch": 1.8330058939096268,
320
+ "grad_norm": 0.46822306513786316,
321
+ "learning_rate": 3.243348982785603e-05,
322
+ "loss": 0.2289,
323
+ "step": 1400
324
+ },
325
+ {
326
+ "epoch": 1.8330058939096268,
327
+ "eval_accuracy": 0.8552362236960616,
328
+ "eval_loss": 0.23168529570102692,
329
+ "eval_runtime": 17.7733,
330
+ "eval_samples_per_second": 687.153,
331
+ "eval_steps_per_second": 42.986,
332
+ "step": 1400
333
+ },
334
+ {
335
+ "epoch": 1.8984937786509497,
336
+ "grad_norm": 0.5012661218643188,
337
+ "learning_rate": 3.1455399061032865e-05,
338
+ "loss": 0.2325,
339
+ "step": 1450
340
+ },
341
+ {
342
+ "epoch": 1.9639816633922724,
343
+ "grad_norm": 1.2745088338851929,
344
+ "learning_rate": 3.0477308294209707e-05,
345
+ "loss": 0.24,
346
+ "step": 1500
347
+ },
348
+ {
349
+ "epoch": 1.9639816633922724,
350
+ "eval_accuracy": 0.8562187832637354,
351
+ "eval_loss": 0.22923418879508972,
352
+ "eval_runtime": 17.8388,
353
+ "eval_samples_per_second": 684.631,
354
+ "eval_steps_per_second": 42.828,
355
+ "step": 1500
356
+ },
357
+ {
358
+ "epoch": 2.028814669286182,
359
+ "grad_norm": 0.3664182722568512,
360
+ "learning_rate": 2.9499217527386542e-05,
361
+ "loss": 0.2262,
362
+ "step": 1550
363
+ },
364
+ {
365
+ "epoch": 2.094302554027505,
366
+ "grad_norm": 0.544244647026062,
367
+ "learning_rate": 2.8521126760563384e-05,
368
+ "loss": 0.229,
369
+ "step": 1600
370
+ },
371
+ {
372
+ "epoch": 2.094302554027505,
373
+ "eval_accuracy": 0.8558093834438713,
374
+ "eval_loss": 0.23112046718597412,
375
+ "eval_runtime": 17.8003,
376
+ "eval_samples_per_second": 686.112,
377
+ "eval_steps_per_second": 42.921,
378
+ "step": 1600
379
+ },
380
+ {
381
+ "epoch": 2.1597904387688276,
382
+ "grad_norm": 0.7998932600021362,
383
+ "learning_rate": 2.754303599374022e-05,
384
+ "loss": 0.2269,
385
+ "step": 1650
386
+ },
387
+ {
388
+ "epoch": 2.2252783235101505,
389
+ "grad_norm": 1.73899245262146,
390
+ "learning_rate": 2.656494522691706e-05,
391
+ "loss": 0.2266,
392
+ "step": 1700
393
+ },
394
+ {
395
+ "epoch": 2.2252783235101505,
396
+ "eval_accuracy": 0.8564644231556539,
397
+ "eval_loss": 0.23248396813869476,
398
+ "eval_runtime": 17.8458,
399
+ "eval_samples_per_second": 684.363,
400
+ "eval_steps_per_second": 42.811,
401
+ "step": 1700
402
+ },
403
+ {
404
+ "epoch": 2.2907662082514735,
405
+ "grad_norm": 0.7401416301727295,
406
+ "learning_rate": 2.5586854460093895e-05,
407
+ "loss": 0.2304,
408
+ "step": 1750
409
+ },
410
+ {
411
+ "epoch": 2.3562540929927964,
412
+ "grad_norm": 1.3307729959487915,
413
+ "learning_rate": 2.4608763693270737e-05,
414
+ "loss": 0.2229,
415
+ "step": 1800
416
+ },
417
+ {
418
+ "epoch": 2.3562540929927964,
419
+ "eval_accuracy": 0.8578563825431916,
420
+ "eval_loss": 0.22864140570163727,
421
+ "eval_runtime": 17.9078,
422
+ "eval_samples_per_second": 681.993,
423
+ "eval_steps_per_second": 42.663,
424
+ "step": 1800
425
+ },
426
+ {
427
+ "epoch": 2.4217419777341194,
428
+ "grad_norm": 0.16573481261730194,
429
+ "learning_rate": 2.3630672926447576e-05,
430
+ "loss": 0.2141,
431
+ "step": 1850
432
+ },
433
+ {
434
+ "epoch": 2.487229862475442,
435
+ "grad_norm": 0.6072602272033691,
436
+ "learning_rate": 2.2652582159624414e-05,
437
+ "loss": 0.2171,
438
+ "step": 1900
439
+ },
440
+ {
441
+ "epoch": 2.487229862475442,
442
+ "eval_accuracy": 0.856791943011545,
443
+ "eval_loss": 0.2283138483762741,
444
+ "eval_runtime": 17.9011,
445
+ "eval_samples_per_second": 682.247,
446
+ "eval_steps_per_second": 42.679,
447
+ "step": 1900
448
+ },
449
+ {
450
+ "epoch": 2.552717747216765,
451
+ "grad_norm": 0.3190251588821411,
452
+ "learning_rate": 2.1674491392801252e-05,
453
+ "loss": 0.2173,
454
+ "step": 1950
455
+ },
456
+ {
457
+ "epoch": 2.6182056319580878,
458
+ "grad_norm": 0.4229159355163574,
459
+ "learning_rate": 2.069640062597809e-05,
460
+ "loss": 0.2267,
461
+ "step": 2000
462
+ },
463
+ {
464
+ "epoch": 2.6182056319580878,
465
+ "eval_accuracy": 0.8599033816425121,
466
+ "eval_loss": 0.22846245765686035,
467
+ "eval_runtime": 17.8748,
468
+ "eval_samples_per_second": 683.254,
469
+ "eval_steps_per_second": 42.742,
470
+ "step": 2000
471
+ },
472
+ {
473
+ "epoch": 2.6836935166994107,
474
+ "grad_norm": 0.6909223198890686,
475
+ "learning_rate": 1.971830985915493e-05,
476
+ "loss": 0.2287,
477
+ "step": 2050
478
+ },
479
+ {
480
+ "epoch": 2.749181401440733,
481
+ "grad_norm": 0.530602216720581,
482
+ "learning_rate": 1.874021909233177e-05,
483
+ "loss": 0.2332,
484
+ "step": 2100
485
+ },
486
+ {
487
+ "epoch": 2.749181401440733,
488
+ "eval_accuracy": 0.8573651027593547,
489
+ "eval_loss": 0.22631122171878815,
490
+ "eval_runtime": 17.8614,
491
+ "eval_samples_per_second": 683.766,
492
+ "eval_steps_per_second": 42.774,
493
+ "step": 2100
494
+ },
495
+ {
496
+ "epoch": 2.814669286182056,
497
+ "grad_norm": 0.4919154942035675,
498
+ "learning_rate": 1.776212832550861e-05,
499
+ "loss": 0.2116,
500
+ "step": 2150
501
+ },
502
+ {
503
+ "epoch": 2.880157170923379,
504
+ "grad_norm": 1.3828797340393066,
505
+ "learning_rate": 1.6784037558685448e-05,
506
+ "loss": 0.2218,
507
+ "step": 2200
508
+ },
509
+ {
510
+ "epoch": 2.880157170923379,
511
+ "eval_accuracy": 0.8577745025792188,
512
+ "eval_loss": 0.2293933629989624,
513
+ "eval_runtime": 17.8541,
514
+ "eval_samples_per_second": 684.044,
515
+ "eval_steps_per_second": 42.791,
516
+ "step": 2200
517
+ },
518
+ {
519
+ "epoch": 2.945645055664702,
520
+ "grad_norm": 0.24504908919334412,
521
+ "learning_rate": 1.5805946791862286e-05,
522
+ "loss": 0.2287,
523
+ "step": 2250
524
+ },
525
+ {
526
+ "epoch": 3.0104780615586115,
527
+ "grad_norm": 0.5326477885246277,
528
+ "learning_rate": 1.4827856025039124e-05,
529
+ "loss": 0.2272,
530
+ "step": 2300
531
+ },
532
+ {
533
+ "epoch": 3.0104780615586115,
534
+ "eval_accuracy": 0.8573651027593547,
535
+ "eval_loss": 0.2295144498348236,
536
+ "eval_runtime": 17.8719,
537
+ "eval_samples_per_second": 683.363,
538
+ "eval_steps_per_second": 42.749,
539
+ "step": 2300
540
+ },
541
+ {
542
+ "epoch": 3.0759659462999345,
543
+ "grad_norm": 0.6676012277603149,
544
+ "learning_rate": 1.3849765258215963e-05,
545
+ "loss": 0.2126,
546
+ "step": 2350
547
+ },
548
+ {
549
+ "epoch": 3.1414538310412574,
550
+ "grad_norm": 0.3665286898612976,
551
+ "learning_rate": 1.2871674491392801e-05,
552
+ "loss": 0.2173,
553
+ "step": 2400
554
+ },
555
+ {
556
+ "epoch": 3.1414538310412574,
557
+ "eval_accuracy": 0.8602309014984033,
558
+ "eval_loss": 0.23410645127296448,
559
+ "eval_runtime": 17.8958,
560
+ "eval_samples_per_second": 682.449,
561
+ "eval_steps_per_second": 42.691,
562
+ "step": 2400
563
+ },
564
+ {
565
+ "epoch": 3.2069417157825804,
566
+ "grad_norm": 0.44082948565483093,
567
+ "learning_rate": 1.189358372456964e-05,
568
+ "loss": 0.2117,
569
+ "step": 2450
570
+ },
571
+ {
572
+ "epoch": 3.2724296005239033,
573
+ "grad_norm": 1.7528928518295288,
574
+ "learning_rate": 1.0915492957746478e-05,
575
+ "loss": 0.2176,
576
+ "step": 2500
577
+ },
578
+ {
579
+ "epoch": 3.2724296005239033,
580
+ "eval_accuracy": 0.8612134610660771,
581
+ "eval_loss": 0.22918100655078888,
582
+ "eval_runtime": 17.8879,
583
+ "eval_samples_per_second": 682.753,
584
+ "eval_steps_per_second": 42.71,
585
+ "step": 2500
586
+ },
587
+ {
588
+ "epoch": 3.337917485265226,
589
+ "grad_norm": 1.001876711845398,
590
+ "learning_rate": 9.937402190923318e-06,
591
+ "loss": 0.2141,
592
+ "step": 2550
593
+ },
594
+ {
595
+ "epoch": 3.4034053700065487,
596
+ "grad_norm": 0.3455939292907715,
597
+ "learning_rate": 8.959311424100156e-06,
598
+ "loss": 0.2147,
599
+ "step": 2600
600
+ },
601
+ {
602
+ "epoch": 3.4034053700065487,
603
+ "eval_accuracy": 0.8619503807418325,
604
+ "eval_loss": 0.2289513647556305,
605
+ "eval_runtime": 17.8583,
606
+ "eval_samples_per_second": 683.883,
607
+ "eval_steps_per_second": 42.781,
608
+ "step": 2600
609
+ },
610
+ {
611
+ "epoch": 3.4688932547478717,
612
+ "grad_norm": 0.39306333661079407,
613
+ "learning_rate": 7.981220657276996e-06,
614
+ "loss": 0.2212,
615
+ "step": 2650
616
+ },
617
+ {
618
+ "epoch": 3.5343811394891946,
619
+ "grad_norm": 0.29782700538635254,
620
+ "learning_rate": 7.003129890453834e-06,
621
+ "loss": 0.2225,
622
+ "step": 2700
623
+ },
624
+ {
625
+ "epoch": 3.5343811394891946,
626
+ "eval_accuracy": 0.8574469827233276,
627
+ "eval_loss": 0.2323383092880249,
628
+ "eval_runtime": 17.8359,
629
+ "eval_samples_per_second": 684.744,
630
+ "eval_steps_per_second": 42.835,
631
+ "step": 2700
632
+ },
633
+ {
634
+ "epoch": 3.599869024230517,
635
+ "grad_norm": 0.5452626943588257,
636
+ "learning_rate": 6.025039123630673e-06,
637
+ "loss": 0.2121,
638
+ "step": 2750
639
+ },
640
+ {
641
+ "epoch": 3.66535690897184,
642
+ "grad_norm": 3.8590140342712402,
643
+ "learning_rate": 5.046948356807512e-06,
644
+ "loss": 0.2208,
645
+ "step": 2800
646
+ },
647
+ {
648
+ "epoch": 3.66535690897184,
649
+ "eval_accuracy": 0.8591664619667567,
650
+ "eval_loss": 0.23002713918685913,
651
+ "eval_runtime": 17.874,
652
+ "eval_samples_per_second": 683.285,
653
+ "eval_steps_per_second": 42.744,
654
+ "step": 2800
655
+ },
656
+ {
657
+ "epoch": 3.730844793713163,
658
+ "grad_norm": 0.6683679223060608,
659
+ "learning_rate": 4.068857589984351e-06,
660
+ "loss": 0.2273,
661
+ "step": 2850
662
+ },
663
+ {
664
+ "epoch": 3.796332678454486,
665
+ "grad_norm": 0.24069799482822418,
666
+ "learning_rate": 3.0907668231611893e-06,
667
+ "loss": 0.2084,
668
+ "step": 2900
669
+ },
670
+ {
671
+ "epoch": 3.796332678454486,
672
+ "eval_accuracy": 0.8621960206337509,
673
+ "eval_loss": 0.22760269045829773,
674
+ "eval_runtime": 17.8864,
675
+ "eval_samples_per_second": 682.809,
676
+ "eval_steps_per_second": 42.714,
677
+ "step": 2900
678
+ },
679
+ {
680
+ "epoch": 3.861820563195809,
681
+ "grad_norm": 1.0248568058013916,
682
+ "learning_rate": 2.112676056338028e-06,
683
+ "loss": 0.2082,
684
+ "step": 2950
685
+ },
686
+ {
687
+ "epoch": 3.927308447937132,
688
+ "grad_norm": 1.0928360223770142,
689
+ "learning_rate": 1.1541471048513302e-06,
690
+ "loss": 0.2159,
691
+ "step": 3000
692
+ },
693
+ {
694
+ "epoch": 3.927308447937132,
695
+ "eval_accuracy": 0.8622779005977237,
696
+ "eval_loss": 0.22847115993499756,
697
+ "eval_runtime": 17.9258,
698
+ "eval_samples_per_second": 681.307,
699
+ "eval_steps_per_second": 42.62,
700
+ "step": 3000
701
+ }
702
+ ],
703
+ "logging_steps": 50,
704
+ "max_steps": 3056,
705
+ "num_input_tokens_seen": 0,
706
+ "num_train_epochs": 4,
707
+ "save_steps": 500,
708
+ "stateful_callbacks": {
709
+ "TrainerControl": {
710
+ "args": {
711
+ "should_epoch_stop": false,
712
+ "should_evaluate": false,
713
+ "should_log": false,
714
+ "should_save": true,
715
+ "should_training_stop": false
716
+ },
717
+ "attributes": {}
718
+ }
719
+ },
720
+ "total_flos": 6354063711332352.0,
721
+ "train_batch_size": 16,
722
+ "trial_name": null,
723
+ "trial_params": null
724
+ }