morganchen1007 commited on
Commit
61e3a42
1 Parent(s): 618c6d8

Upload trainer_state.json

Browse files
Files changed (1) hide show
  1. trainer_state.json +1240 -0
trainer_state.json ADDED
@@ -0,0 +1,1240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.9638336347197106,
3
+ "best_model_checkpoint": "resnet-50-finetuned-resnet50/checkpoint-1705",
4
+ "epoch": 11.996784565916398,
5
+ "global_step": 1860,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.06,
12
+ "learning_rate": 2.688172043010753e-06,
13
+ "loss": 1.0948,
14
+ "step": 10
15
+ },
16
+ {
17
+ "epoch": 0.13,
18
+ "learning_rate": 5.376344086021506e-06,
19
+ "loss": 1.0931,
20
+ "step": 20
21
+ },
22
+ {
23
+ "epoch": 0.19,
24
+ "learning_rate": 8.064516129032258e-06,
25
+ "loss": 1.0889,
26
+ "step": 30
27
+ },
28
+ {
29
+ "epoch": 0.26,
30
+ "learning_rate": 1.0752688172043012e-05,
31
+ "loss": 1.0861,
32
+ "step": 40
33
+ },
34
+ {
35
+ "epoch": 0.32,
36
+ "learning_rate": 1.3440860215053763e-05,
37
+ "loss": 1.0796,
38
+ "step": 50
39
+ },
40
+ {
41
+ "epoch": 0.39,
42
+ "learning_rate": 1.6129032258064517e-05,
43
+ "loss": 1.0725,
44
+ "step": 60
45
+ },
46
+ {
47
+ "epoch": 0.45,
48
+ "learning_rate": 1.881720430107527e-05,
49
+ "loss": 1.0602,
50
+ "step": 70
51
+ },
52
+ {
53
+ "epoch": 0.51,
54
+ "learning_rate": 2.1505376344086024e-05,
55
+ "loss": 1.0512,
56
+ "step": 80
57
+ },
58
+ {
59
+ "epoch": 0.58,
60
+ "learning_rate": 2.4193548387096777e-05,
61
+ "loss": 1.0386,
62
+ "step": 90
63
+ },
64
+ {
65
+ "epoch": 0.64,
66
+ "learning_rate": 2.6881720430107527e-05,
67
+ "loss": 1.0228,
68
+ "step": 100
69
+ },
70
+ {
71
+ "epoch": 0.71,
72
+ "learning_rate": 2.9569892473118284e-05,
73
+ "loss": 1.0067,
74
+ "step": 110
75
+ },
76
+ {
77
+ "epoch": 0.77,
78
+ "learning_rate": 3.2258064516129034e-05,
79
+ "loss": 0.981,
80
+ "step": 120
81
+ },
82
+ {
83
+ "epoch": 0.84,
84
+ "learning_rate": 3.494623655913979e-05,
85
+ "loss": 0.9705,
86
+ "step": 130
87
+ },
88
+ {
89
+ "epoch": 0.9,
90
+ "learning_rate": 3.763440860215054e-05,
91
+ "loss": 0.942,
92
+ "step": 140
93
+ },
94
+ {
95
+ "epoch": 0.96,
96
+ "learning_rate": 4.032258064516129e-05,
97
+ "loss": 0.9229,
98
+ "step": 150
99
+ },
100
+ {
101
+ "epoch": 1.0,
102
+ "eval_accuracy": 0.6424050632911392,
103
+ "eval_loss": 0.8797757625579834,
104
+ "eval_runtime": 237.5636,
105
+ "eval_samples_per_second": 9.311,
106
+ "eval_steps_per_second": 0.295,
107
+ "step": 155
108
+ },
109
+ {
110
+ "epoch": 1.03,
111
+ "learning_rate": 4.301075268817205e-05,
112
+ "loss": 0.9396,
113
+ "step": 160
114
+ },
115
+ {
116
+ "epoch": 1.1,
117
+ "learning_rate": 4.56989247311828e-05,
118
+ "loss": 0.8696,
119
+ "step": 170
120
+ },
121
+ {
122
+ "epoch": 1.16,
123
+ "learning_rate": 4.8387096774193554e-05,
124
+ "loss": 0.8369,
125
+ "step": 180
126
+ },
127
+ {
128
+ "epoch": 1.23,
129
+ "learning_rate": 4.98805256869773e-05,
130
+ "loss": 0.8137,
131
+ "step": 190
132
+ },
133
+ {
134
+ "epoch": 1.29,
135
+ "learning_rate": 4.9581839904420555e-05,
136
+ "loss": 0.7895,
137
+ "step": 200
138
+ },
139
+ {
140
+ "epoch": 1.35,
141
+ "learning_rate": 4.92831541218638e-05,
142
+ "loss": 0.7494,
143
+ "step": 210
144
+ },
145
+ {
146
+ "epoch": 1.42,
147
+ "learning_rate": 4.898446833930705e-05,
148
+ "loss": 0.7313,
149
+ "step": 220
150
+ },
151
+ {
152
+ "epoch": 1.48,
153
+ "learning_rate": 4.86857825567503e-05,
154
+ "loss": 0.6833,
155
+ "step": 230
156
+ },
157
+ {
158
+ "epoch": 1.55,
159
+ "learning_rate": 4.8387096774193554e-05,
160
+ "loss": 0.6755,
161
+ "step": 240
162
+ },
163
+ {
164
+ "epoch": 1.61,
165
+ "learning_rate": 4.80884109916368e-05,
166
+ "loss": 0.6313,
167
+ "step": 250
168
+ },
169
+ {
170
+ "epoch": 1.68,
171
+ "learning_rate": 4.778972520908005e-05,
172
+ "loss": 0.6044,
173
+ "step": 260
174
+ },
175
+ {
176
+ "epoch": 1.74,
177
+ "learning_rate": 4.74910394265233e-05,
178
+ "loss": 0.5762,
179
+ "step": 270
180
+ },
181
+ {
182
+ "epoch": 1.8,
183
+ "learning_rate": 4.7192353643966546e-05,
184
+ "loss": 0.5494,
185
+ "step": 280
186
+ },
187
+ {
188
+ "epoch": 1.87,
189
+ "learning_rate": 4.6893667861409805e-05,
190
+ "loss": 0.5436,
191
+ "step": 290
192
+ },
193
+ {
194
+ "epoch": 1.93,
195
+ "learning_rate": 4.659498207885305e-05,
196
+ "loss": 0.5149,
197
+ "step": 300
198
+ },
199
+ {
200
+ "epoch": 2.0,
201
+ "learning_rate": 4.62962962962963e-05,
202
+ "loss": 0.4797,
203
+ "step": 310
204
+ },
205
+ {
206
+ "epoch": 2.0,
207
+ "eval_accuracy": 0.8747739602169982,
208
+ "eval_loss": 0.37069129943847656,
209
+ "eval_runtime": 244.0414,
210
+ "eval_samples_per_second": 9.064,
211
+ "eval_steps_per_second": 0.287,
212
+ "step": 310
213
+ },
214
+ {
215
+ "epoch": 2.06,
216
+ "learning_rate": 4.5997610513739546e-05,
217
+ "loss": 0.4886,
218
+ "step": 320
219
+ },
220
+ {
221
+ "epoch": 2.13,
222
+ "learning_rate": 4.56989247311828e-05,
223
+ "loss": 0.45,
224
+ "step": 330
225
+ },
226
+ {
227
+ "epoch": 2.19,
228
+ "learning_rate": 4.540023894862604e-05,
229
+ "loss": 0.4523,
230
+ "step": 340
231
+ },
232
+ {
233
+ "epoch": 2.26,
234
+ "learning_rate": 4.51015531660693e-05,
235
+ "loss": 0.4588,
236
+ "step": 350
237
+ },
238
+ {
239
+ "epoch": 2.32,
240
+ "learning_rate": 4.4802867383512545e-05,
241
+ "loss": 0.4219,
242
+ "step": 360
243
+ },
244
+ {
245
+ "epoch": 2.39,
246
+ "learning_rate": 4.4504181600955796e-05,
247
+ "loss": 0.4036,
248
+ "step": 370
249
+ },
250
+ {
251
+ "epoch": 2.45,
252
+ "learning_rate": 4.420549581839905e-05,
253
+ "loss": 0.3875,
254
+ "step": 380
255
+ },
256
+ {
257
+ "epoch": 2.51,
258
+ "learning_rate": 4.390681003584229e-05,
259
+ "loss": 0.3735,
260
+ "step": 390
261
+ },
262
+ {
263
+ "epoch": 2.58,
264
+ "learning_rate": 4.360812425328555e-05,
265
+ "loss": 0.3719,
266
+ "step": 400
267
+ },
268
+ {
269
+ "epoch": 2.64,
270
+ "learning_rate": 4.3309438470728796e-05,
271
+ "loss": 0.3958,
272
+ "step": 410
273
+ },
274
+ {
275
+ "epoch": 2.71,
276
+ "learning_rate": 4.301075268817205e-05,
277
+ "loss": 0.3896,
278
+ "step": 420
279
+ },
280
+ {
281
+ "epoch": 2.77,
282
+ "learning_rate": 4.271206690561529e-05,
283
+ "loss": 0.38,
284
+ "step": 430
285
+ },
286
+ {
287
+ "epoch": 2.84,
288
+ "learning_rate": 4.241338112305854e-05,
289
+ "loss": 0.3629,
290
+ "step": 440
291
+ },
292
+ {
293
+ "epoch": 2.9,
294
+ "learning_rate": 4.2114695340501795e-05,
295
+ "loss": 0.3358,
296
+ "step": 450
297
+ },
298
+ {
299
+ "epoch": 2.96,
300
+ "learning_rate": 4.1816009557945046e-05,
301
+ "loss": 0.3201,
302
+ "step": 460
303
+ },
304
+ {
305
+ "epoch": 3.0,
306
+ "eval_accuracy": 0.9231464737793852,
307
+ "eval_loss": 0.21964775025844574,
308
+ "eval_runtime": 228.0728,
309
+ "eval_samples_per_second": 9.699,
310
+ "eval_steps_per_second": 0.307,
311
+ "step": 465
312
+ },
313
+ {
314
+ "epoch": 3.03,
315
+ "learning_rate": 4.15173237753883e-05,
316
+ "loss": 0.367,
317
+ "step": 470
318
+ },
319
+ {
320
+ "epoch": 3.1,
321
+ "learning_rate": 4.121863799283154e-05,
322
+ "loss": 0.3273,
323
+ "step": 480
324
+ },
325
+ {
326
+ "epoch": 3.16,
327
+ "learning_rate": 4.0919952210274794e-05,
328
+ "loss": 0.3418,
329
+ "step": 490
330
+ },
331
+ {
332
+ "epoch": 3.23,
333
+ "learning_rate": 4.062126642771804e-05,
334
+ "loss": 0.3462,
335
+ "step": 500
336
+ },
337
+ {
338
+ "epoch": 3.29,
339
+ "learning_rate": 4.032258064516129e-05,
340
+ "loss": 0.3211,
341
+ "step": 510
342
+ },
343
+ {
344
+ "epoch": 3.35,
345
+ "learning_rate": 4.002389486260454e-05,
346
+ "loss": 0.3173,
347
+ "step": 520
348
+ },
349
+ {
350
+ "epoch": 3.42,
351
+ "learning_rate": 3.972520908004779e-05,
352
+ "loss": 0.3148,
353
+ "step": 530
354
+ },
355
+ {
356
+ "epoch": 3.48,
357
+ "learning_rate": 3.9426523297491045e-05,
358
+ "loss": 0.3089,
359
+ "step": 540
360
+ },
361
+ {
362
+ "epoch": 3.55,
363
+ "learning_rate": 3.912783751493429e-05,
364
+ "loss": 0.3132,
365
+ "step": 550
366
+ },
367
+ {
368
+ "epoch": 3.61,
369
+ "learning_rate": 3.882915173237754e-05,
370
+ "loss": 0.3126,
371
+ "step": 560
372
+ },
373
+ {
374
+ "epoch": 3.68,
375
+ "learning_rate": 3.8530465949820786e-05,
376
+ "loss": 0.3335,
377
+ "step": 570
378
+ },
379
+ {
380
+ "epoch": 3.74,
381
+ "learning_rate": 3.8231780167264044e-05,
382
+ "loss": 0.3107,
383
+ "step": 580
384
+ },
385
+ {
386
+ "epoch": 3.8,
387
+ "learning_rate": 3.793309438470729e-05,
388
+ "loss": 0.3289,
389
+ "step": 590
390
+ },
391
+ {
392
+ "epoch": 3.87,
393
+ "learning_rate": 3.763440860215054e-05,
394
+ "loss": 0.3225,
395
+ "step": 600
396
+ },
397
+ {
398
+ "epoch": 3.93,
399
+ "learning_rate": 3.7335722819593785e-05,
400
+ "loss": 0.3188,
401
+ "step": 610
402
+ },
403
+ {
404
+ "epoch": 4.0,
405
+ "learning_rate": 3.7037037037037037e-05,
406
+ "loss": 0.2978,
407
+ "step": 620
408
+ },
409
+ {
410
+ "epoch": 4.0,
411
+ "eval_accuracy": 0.9452983725135624,
412
+ "eval_loss": 0.16960099339485168,
413
+ "eval_runtime": 228.3891,
414
+ "eval_samples_per_second": 9.685,
415
+ "eval_steps_per_second": 0.306,
416
+ "step": 620
417
+ },
418
+ {
419
+ "epoch": 4.06,
420
+ "learning_rate": 3.673835125448029e-05,
421
+ "loss": 0.3221,
422
+ "step": 630
423
+ },
424
+ {
425
+ "epoch": 4.13,
426
+ "learning_rate": 3.643966547192354e-05,
427
+ "loss": 0.3077,
428
+ "step": 640
429
+ },
430
+ {
431
+ "epoch": 4.19,
432
+ "learning_rate": 3.614097968936679e-05,
433
+ "loss": 0.2895,
434
+ "step": 650
435
+ },
436
+ {
437
+ "epoch": 4.26,
438
+ "learning_rate": 3.5842293906810036e-05,
439
+ "loss": 0.2822,
440
+ "step": 660
441
+ },
442
+ {
443
+ "epoch": 4.32,
444
+ "learning_rate": 3.554360812425329e-05,
445
+ "loss": 0.2864,
446
+ "step": 670
447
+ },
448
+ {
449
+ "epoch": 4.39,
450
+ "learning_rate": 3.524492234169653e-05,
451
+ "loss": 0.2868,
452
+ "step": 680
453
+ },
454
+ {
455
+ "epoch": 4.45,
456
+ "learning_rate": 3.494623655913979e-05,
457
+ "loss": 0.2721,
458
+ "step": 690
459
+ },
460
+ {
461
+ "epoch": 4.51,
462
+ "learning_rate": 3.4647550776583035e-05,
463
+ "loss": 0.2455,
464
+ "step": 700
465
+ },
466
+ {
467
+ "epoch": 4.58,
468
+ "learning_rate": 3.4348864994026287e-05,
469
+ "loss": 0.2657,
470
+ "step": 710
471
+ },
472
+ {
473
+ "epoch": 4.64,
474
+ "learning_rate": 3.405017921146954e-05,
475
+ "loss": 0.2797,
476
+ "step": 720
477
+ },
478
+ {
479
+ "epoch": 4.71,
480
+ "learning_rate": 3.375149342891278e-05,
481
+ "loss": 0.3008,
482
+ "step": 730
483
+ },
484
+ {
485
+ "epoch": 4.77,
486
+ "learning_rate": 3.3452807646356034e-05,
487
+ "loss": 0.2696,
488
+ "step": 740
489
+ },
490
+ {
491
+ "epoch": 4.84,
492
+ "learning_rate": 3.3154121863799286e-05,
493
+ "loss": 0.2978,
494
+ "step": 750
495
+ },
496
+ {
497
+ "epoch": 4.9,
498
+ "learning_rate": 3.285543608124254e-05,
499
+ "loss": 0.2775,
500
+ "step": 760
501
+ },
502
+ {
503
+ "epoch": 4.96,
504
+ "learning_rate": 3.255675029868578e-05,
505
+ "loss": 0.3208,
506
+ "step": 770
507
+ },
508
+ {
509
+ "epoch": 5.0,
510
+ "eval_accuracy": 0.9525316455696202,
511
+ "eval_loss": 0.13934803009033203,
512
+ "eval_runtime": 228.8309,
513
+ "eval_samples_per_second": 9.667,
514
+ "eval_steps_per_second": 0.306,
515
+ "step": 775
516
+ },
517
+ {
518
+ "epoch": 5.03,
519
+ "learning_rate": 3.2258064516129034e-05,
520
+ "loss": 0.3063,
521
+ "step": 780
522
+ },
523
+ {
524
+ "epoch": 5.1,
525
+ "learning_rate": 3.195937873357228e-05,
526
+ "loss": 0.2632,
527
+ "step": 790
528
+ },
529
+ {
530
+ "epoch": 5.16,
531
+ "learning_rate": 3.1660692951015537e-05,
532
+ "loss": 0.2518,
533
+ "step": 800
534
+ },
535
+ {
536
+ "epoch": 5.23,
537
+ "learning_rate": 3.136200716845878e-05,
538
+ "loss": 0.253,
539
+ "step": 810
540
+ },
541
+ {
542
+ "epoch": 5.29,
543
+ "learning_rate": 3.106332138590203e-05,
544
+ "loss": 0.2748,
545
+ "step": 820
546
+ },
547
+ {
548
+ "epoch": 5.35,
549
+ "learning_rate": 3.0764635603345284e-05,
550
+ "loss": 0.2349,
551
+ "step": 830
552
+ },
553
+ {
554
+ "epoch": 5.42,
555
+ "learning_rate": 3.046594982078853e-05,
556
+ "loss": 0.2857,
557
+ "step": 840
558
+ },
559
+ {
560
+ "epoch": 5.48,
561
+ "learning_rate": 3.016726403823178e-05,
562
+ "loss": 0.2364,
563
+ "step": 850
564
+ },
565
+ {
566
+ "epoch": 5.55,
567
+ "learning_rate": 2.9868578255675032e-05,
568
+ "loss": 0.2772,
569
+ "step": 860
570
+ },
571
+ {
572
+ "epoch": 5.61,
573
+ "learning_rate": 2.9569892473118284e-05,
574
+ "loss": 0.2522,
575
+ "step": 870
576
+ },
577
+ {
578
+ "epoch": 5.68,
579
+ "learning_rate": 2.9271206690561532e-05,
580
+ "loss": 0.2404,
581
+ "step": 880
582
+ },
583
+ {
584
+ "epoch": 5.74,
585
+ "learning_rate": 2.897252090800478e-05,
586
+ "loss": 0.2913,
587
+ "step": 890
588
+ },
589
+ {
590
+ "epoch": 5.8,
591
+ "learning_rate": 2.8673835125448028e-05,
592
+ "loss": 0.2843,
593
+ "step": 900
594
+ },
595
+ {
596
+ "epoch": 5.87,
597
+ "learning_rate": 2.8375149342891276e-05,
598
+ "loss": 0.2918,
599
+ "step": 910
600
+ },
601
+ {
602
+ "epoch": 5.93,
603
+ "learning_rate": 2.807646356033453e-05,
604
+ "loss": 0.2479,
605
+ "step": 920
606
+ },
607
+ {
608
+ "epoch": 6.0,
609
+ "learning_rate": 2.777777777777778e-05,
610
+ "loss": 0.2599,
611
+ "step": 930
612
+ },
613
+ {
614
+ "epoch": 6.0,
615
+ "eval_accuracy": 0.9561482820976492,
616
+ "eval_loss": 0.1326771080493927,
617
+ "eval_runtime": 235.7718,
618
+ "eval_samples_per_second": 9.382,
619
+ "eval_steps_per_second": 0.297,
620
+ "step": 930
621
+ },
622
+ {
623
+ "epoch": 6.06,
624
+ "learning_rate": 2.747909199522103e-05,
625
+ "loss": 0.2479,
626
+ "step": 940
627
+ },
628
+ {
629
+ "epoch": 6.13,
630
+ "learning_rate": 2.718040621266428e-05,
631
+ "loss": 0.2865,
632
+ "step": 950
633
+ },
634
+ {
635
+ "epoch": 6.19,
636
+ "learning_rate": 2.6881720430107527e-05,
637
+ "loss": 0.2508,
638
+ "step": 960
639
+ },
640
+ {
641
+ "epoch": 6.26,
642
+ "learning_rate": 2.6583034647550775e-05,
643
+ "loss": 0.279,
644
+ "step": 970
645
+ },
646
+ {
647
+ "epoch": 6.32,
648
+ "learning_rate": 2.628434886499403e-05,
649
+ "loss": 0.2534,
650
+ "step": 980
651
+ },
652
+ {
653
+ "epoch": 6.39,
654
+ "learning_rate": 2.5985663082437278e-05,
655
+ "loss": 0.2779,
656
+ "step": 990
657
+ },
658
+ {
659
+ "epoch": 6.45,
660
+ "learning_rate": 2.5686977299880526e-05,
661
+ "loss": 0.2627,
662
+ "step": 1000
663
+ },
664
+ {
665
+ "epoch": 6.51,
666
+ "learning_rate": 2.5388291517323774e-05,
667
+ "loss": 0.2588,
668
+ "step": 1010
669
+ },
670
+ {
671
+ "epoch": 6.58,
672
+ "learning_rate": 2.5089605734767026e-05,
673
+ "loss": 0.2611,
674
+ "step": 1020
675
+ },
676
+ {
677
+ "epoch": 6.64,
678
+ "learning_rate": 2.4790919952210277e-05,
679
+ "loss": 0.2677,
680
+ "step": 1030
681
+ },
682
+ {
683
+ "epoch": 6.71,
684
+ "learning_rate": 2.4492234169653525e-05,
685
+ "loss": 0.2389,
686
+ "step": 1040
687
+ },
688
+ {
689
+ "epoch": 6.77,
690
+ "learning_rate": 2.4193548387096777e-05,
691
+ "loss": 0.252,
692
+ "step": 1050
693
+ },
694
+ {
695
+ "epoch": 6.84,
696
+ "learning_rate": 2.3894862604540025e-05,
697
+ "loss": 0.2363,
698
+ "step": 1060
699
+ },
700
+ {
701
+ "epoch": 6.9,
702
+ "learning_rate": 2.3596176821983273e-05,
703
+ "loss": 0.2602,
704
+ "step": 1070
705
+ },
706
+ {
707
+ "epoch": 6.96,
708
+ "learning_rate": 2.3297491039426525e-05,
709
+ "loss": 0.2407,
710
+ "step": 1080
711
+ },
712
+ {
713
+ "epoch": 7.0,
714
+ "eval_accuracy": 0.9593128390596745,
715
+ "eval_loss": 0.12799127399921417,
716
+ "eval_runtime": 237.4274,
717
+ "eval_samples_per_second": 9.317,
718
+ "eval_steps_per_second": 0.295,
719
+ "step": 1085
720
+ },
721
+ {
722
+ "epoch": 7.03,
723
+ "learning_rate": 2.2998805256869773e-05,
724
+ "loss": 0.2479,
725
+ "step": 1090
726
+ },
727
+ {
728
+ "epoch": 7.1,
729
+ "learning_rate": 2.270011947431302e-05,
730
+ "loss": 0.2604,
731
+ "step": 1100
732
+ },
733
+ {
734
+ "epoch": 7.16,
735
+ "learning_rate": 2.2401433691756272e-05,
736
+ "loss": 0.2578,
737
+ "step": 1110
738
+ },
739
+ {
740
+ "epoch": 7.23,
741
+ "learning_rate": 2.2102747909199524e-05,
742
+ "loss": 0.2373,
743
+ "step": 1120
744
+ },
745
+ {
746
+ "epoch": 7.29,
747
+ "learning_rate": 2.1804062126642775e-05,
748
+ "loss": 0.2429,
749
+ "step": 1130
750
+ },
751
+ {
752
+ "epoch": 7.35,
753
+ "learning_rate": 2.1505376344086024e-05,
754
+ "loss": 0.2534,
755
+ "step": 1140
756
+ },
757
+ {
758
+ "epoch": 7.42,
759
+ "learning_rate": 2.120669056152927e-05,
760
+ "loss": 0.2726,
761
+ "step": 1150
762
+ },
763
+ {
764
+ "epoch": 7.48,
765
+ "learning_rate": 2.0908004778972523e-05,
766
+ "loss": 0.2441,
767
+ "step": 1160
768
+ },
769
+ {
770
+ "epoch": 7.55,
771
+ "learning_rate": 2.060931899641577e-05,
772
+ "loss": 0.2613,
773
+ "step": 1170
774
+ },
775
+ {
776
+ "epoch": 7.61,
777
+ "learning_rate": 2.031063321385902e-05,
778
+ "loss": 0.254,
779
+ "step": 1180
780
+ },
781
+ {
782
+ "epoch": 7.68,
783
+ "learning_rate": 2.001194743130227e-05,
784
+ "loss": 0.2415,
785
+ "step": 1190
786
+ },
787
+ {
788
+ "epoch": 7.74,
789
+ "learning_rate": 1.9713261648745522e-05,
790
+ "loss": 0.2539,
791
+ "step": 1200
792
+ },
793
+ {
794
+ "epoch": 7.8,
795
+ "learning_rate": 1.941457586618877e-05,
796
+ "loss": 0.2473,
797
+ "step": 1210
798
+ },
799
+ {
800
+ "epoch": 7.87,
801
+ "learning_rate": 1.9115890083632022e-05,
802
+ "loss": 0.2419,
803
+ "step": 1220
804
+ },
805
+ {
806
+ "epoch": 7.93,
807
+ "learning_rate": 1.881720430107527e-05,
808
+ "loss": 0.2377,
809
+ "step": 1230
810
+ },
811
+ {
812
+ "epoch": 8.0,
813
+ "learning_rate": 1.8518518518518518e-05,
814
+ "loss": 0.2364,
815
+ "step": 1240
816
+ },
817
+ {
818
+ "epoch": 8.0,
819
+ "eval_accuracy": 0.9593128390596745,
820
+ "eval_loss": 0.11666559427976608,
821
+ "eval_runtime": 231.3229,
822
+ "eval_samples_per_second": 9.562,
823
+ "eval_steps_per_second": 0.303,
824
+ "step": 1240
825
+ },
826
+ {
827
+ "epoch": 8.06,
828
+ "learning_rate": 1.821983273596177e-05,
829
+ "loss": 0.2481,
830
+ "step": 1250
831
+ },
832
+ {
833
+ "epoch": 8.13,
834
+ "learning_rate": 1.7921146953405018e-05,
835
+ "loss": 0.2723,
836
+ "step": 1260
837
+ },
838
+ {
839
+ "epoch": 8.19,
840
+ "learning_rate": 1.7622461170848266e-05,
841
+ "loss": 0.2392,
842
+ "step": 1270
843
+ },
844
+ {
845
+ "epoch": 8.26,
846
+ "learning_rate": 1.7323775388291518e-05,
847
+ "loss": 0.2257,
848
+ "step": 1280
849
+ },
850
+ {
851
+ "epoch": 8.32,
852
+ "learning_rate": 1.702508960573477e-05,
853
+ "loss": 0.23,
854
+ "step": 1290
855
+ },
856
+ {
857
+ "epoch": 8.39,
858
+ "learning_rate": 1.6726403823178017e-05,
859
+ "loss": 0.2237,
860
+ "step": 1300
861
+ },
862
+ {
863
+ "epoch": 8.45,
864
+ "learning_rate": 1.642771804062127e-05,
865
+ "loss": 0.2384,
866
+ "step": 1310
867
+ },
868
+ {
869
+ "epoch": 8.51,
870
+ "learning_rate": 1.6129032258064517e-05,
871
+ "loss": 0.2336,
872
+ "step": 1320
873
+ },
874
+ {
875
+ "epoch": 8.58,
876
+ "learning_rate": 1.5830346475507768e-05,
877
+ "loss": 0.2745,
878
+ "step": 1330
879
+ },
880
+ {
881
+ "epoch": 8.64,
882
+ "learning_rate": 1.5531660692951016e-05,
883
+ "loss": 0.2079,
884
+ "step": 1340
885
+ },
886
+ {
887
+ "epoch": 8.71,
888
+ "learning_rate": 1.5232974910394265e-05,
889
+ "loss": 0.2189,
890
+ "step": 1350
891
+ },
892
+ {
893
+ "epoch": 8.77,
894
+ "learning_rate": 1.4934289127837516e-05,
895
+ "loss": 0.2357,
896
+ "step": 1360
897
+ },
898
+ {
899
+ "epoch": 8.84,
900
+ "learning_rate": 1.4635603345280766e-05,
901
+ "loss": 0.2575,
902
+ "step": 1370
903
+ },
904
+ {
905
+ "epoch": 8.9,
906
+ "learning_rate": 1.4336917562724014e-05,
907
+ "loss": 0.2583,
908
+ "step": 1380
909
+ },
910
+ {
911
+ "epoch": 8.96,
912
+ "learning_rate": 1.4038231780167265e-05,
913
+ "loss": 0.2147,
914
+ "step": 1390
915
+ },
916
+ {
917
+ "epoch": 9.0,
918
+ "eval_accuracy": 0.9597649186256781,
919
+ "eval_loss": 0.11601197719573975,
920
+ "eval_runtime": 233.1414,
921
+ "eval_samples_per_second": 9.488,
922
+ "eval_steps_per_second": 0.3,
923
+ "step": 1395
924
+ },
925
+ {
926
+ "epoch": 9.03,
927
+ "learning_rate": 1.3739545997610515e-05,
928
+ "loss": 0.2431,
929
+ "step": 1400
930
+ },
931
+ {
932
+ "epoch": 9.1,
933
+ "learning_rate": 1.3440860215053763e-05,
934
+ "loss": 0.2511,
935
+ "step": 1410
936
+ },
937
+ {
938
+ "epoch": 9.16,
939
+ "learning_rate": 1.3142174432497015e-05,
940
+ "loss": 0.207,
941
+ "step": 1420
942
+ },
943
+ {
944
+ "epoch": 9.23,
945
+ "learning_rate": 1.2843488649940263e-05,
946
+ "loss": 0.2803,
947
+ "step": 1430
948
+ },
949
+ {
950
+ "epoch": 9.29,
951
+ "learning_rate": 1.2544802867383513e-05,
952
+ "loss": 0.2462,
953
+ "step": 1440
954
+ },
955
+ {
956
+ "epoch": 9.35,
957
+ "learning_rate": 1.2246117084826763e-05,
958
+ "loss": 0.2465,
959
+ "step": 1450
960
+ },
961
+ {
962
+ "epoch": 9.42,
963
+ "learning_rate": 1.1947431302270013e-05,
964
+ "loss": 0.2425,
965
+ "step": 1460
966
+ },
967
+ {
968
+ "epoch": 9.48,
969
+ "learning_rate": 1.1648745519713262e-05,
970
+ "loss": 0.2511,
971
+ "step": 1470
972
+ },
973
+ {
974
+ "epoch": 9.55,
975
+ "learning_rate": 1.135005973715651e-05,
976
+ "loss": 0.2333,
977
+ "step": 1480
978
+ },
979
+ {
980
+ "epoch": 9.61,
981
+ "learning_rate": 1.1051373954599762e-05,
982
+ "loss": 0.2188,
983
+ "step": 1490
984
+ },
985
+ {
986
+ "epoch": 9.68,
987
+ "learning_rate": 1.0752688172043012e-05,
988
+ "loss": 0.2549,
989
+ "step": 1500
990
+ },
991
+ {
992
+ "epoch": 9.74,
993
+ "learning_rate": 1.0454002389486262e-05,
994
+ "loss": 0.2827,
995
+ "step": 1510
996
+ },
997
+ {
998
+ "epoch": 9.8,
999
+ "learning_rate": 1.015531660692951e-05,
1000
+ "loss": 0.2465,
1001
+ "step": 1520
1002
+ },
1003
+ {
1004
+ "epoch": 9.87,
1005
+ "learning_rate": 9.856630824372761e-06,
1006
+ "loss": 0.2231,
1007
+ "step": 1530
1008
+ },
1009
+ {
1010
+ "epoch": 9.93,
1011
+ "learning_rate": 9.557945041816011e-06,
1012
+ "loss": 0.2576,
1013
+ "step": 1540
1014
+ },
1015
+ {
1016
+ "epoch": 10.0,
1017
+ "learning_rate": 9.259259259259259e-06,
1018
+ "loss": 0.2634,
1019
+ "step": 1550
1020
+ },
1021
+ {
1022
+ "epoch": 10.0,
1023
+ "eval_accuracy": 0.9624773960216998,
1024
+ "eval_loss": 0.10948529094457626,
1025
+ "eval_runtime": 227.8347,
1026
+ "eval_samples_per_second": 9.709,
1027
+ "eval_steps_per_second": 0.307,
1028
+ "step": 1550
1029
+ },
1030
+ {
1031
+ "epoch": 10.06,
1032
+ "learning_rate": 8.960573476702509e-06,
1033
+ "loss": 0.2642,
1034
+ "step": 1560
1035
+ },
1036
+ {
1037
+ "epoch": 10.13,
1038
+ "learning_rate": 8.661887694145759e-06,
1039
+ "loss": 0.2714,
1040
+ "step": 1570
1041
+ },
1042
+ {
1043
+ "epoch": 10.19,
1044
+ "learning_rate": 8.363201911589009e-06,
1045
+ "loss": 0.2354,
1046
+ "step": 1580
1047
+ },
1048
+ {
1049
+ "epoch": 10.26,
1050
+ "learning_rate": 8.064516129032258e-06,
1051
+ "loss": 0.2322,
1052
+ "step": 1590
1053
+ },
1054
+ {
1055
+ "epoch": 10.32,
1056
+ "learning_rate": 7.765830346475508e-06,
1057
+ "loss": 0.2227,
1058
+ "step": 1600
1059
+ },
1060
+ {
1061
+ "epoch": 10.39,
1062
+ "learning_rate": 7.467144563918758e-06,
1063
+ "loss": 0.2399,
1064
+ "step": 1610
1065
+ },
1066
+ {
1067
+ "epoch": 10.45,
1068
+ "learning_rate": 7.168458781362007e-06,
1069
+ "loss": 0.233,
1070
+ "step": 1620
1071
+ },
1072
+ {
1073
+ "epoch": 10.51,
1074
+ "learning_rate": 6.869772998805258e-06,
1075
+ "loss": 0.2572,
1076
+ "step": 1630
1077
+ },
1078
+ {
1079
+ "epoch": 10.58,
1080
+ "learning_rate": 6.5710872162485075e-06,
1081
+ "loss": 0.2219,
1082
+ "step": 1640
1083
+ },
1084
+ {
1085
+ "epoch": 10.64,
1086
+ "learning_rate": 6.2724014336917564e-06,
1087
+ "loss": 0.2517,
1088
+ "step": 1650
1089
+ },
1090
+ {
1091
+ "epoch": 10.71,
1092
+ "learning_rate": 5.973715651135006e-06,
1093
+ "loss": 0.2471,
1094
+ "step": 1660
1095
+ },
1096
+ {
1097
+ "epoch": 10.77,
1098
+ "learning_rate": 5.675029868578255e-06,
1099
+ "loss": 0.2378,
1100
+ "step": 1670
1101
+ },
1102
+ {
1103
+ "epoch": 10.84,
1104
+ "learning_rate": 5.376344086021506e-06,
1105
+ "loss": 0.2384,
1106
+ "step": 1680
1107
+ },
1108
+ {
1109
+ "epoch": 10.9,
1110
+ "learning_rate": 5.077658303464755e-06,
1111
+ "loss": 0.2426,
1112
+ "step": 1690
1113
+ },
1114
+ {
1115
+ "epoch": 10.96,
1116
+ "learning_rate": 4.7789725209080055e-06,
1117
+ "loss": 0.2159,
1118
+ "step": 1700
1119
+ },
1120
+ {
1121
+ "epoch": 11.0,
1122
+ "eval_accuracy": 0.9638336347197106,
1123
+ "eval_loss": 0.10932076722383499,
1124
+ "eval_runtime": 252.327,
1125
+ "eval_samples_per_second": 8.766,
1126
+ "eval_steps_per_second": 0.277,
1127
+ "step": 1705
1128
+ },
1129
+ {
1130
+ "epoch": 11.03,
1131
+ "learning_rate": 4.4802867383512545e-06,
1132
+ "loss": 0.2404,
1133
+ "step": 1710
1134
+ },
1135
+ {
1136
+ "epoch": 11.1,
1137
+ "learning_rate": 4.181600955794504e-06,
1138
+ "loss": 0.2461,
1139
+ "step": 1720
1140
+ },
1141
+ {
1142
+ "epoch": 11.16,
1143
+ "learning_rate": 3.882915173237754e-06,
1144
+ "loss": 0.239,
1145
+ "step": 1730
1146
+ },
1147
+ {
1148
+ "epoch": 11.23,
1149
+ "learning_rate": 3.5842293906810035e-06,
1150
+ "loss": 0.2171,
1151
+ "step": 1740
1152
+ },
1153
+ {
1154
+ "epoch": 11.29,
1155
+ "learning_rate": 3.2855436081242537e-06,
1156
+ "loss": 0.2376,
1157
+ "step": 1750
1158
+ },
1159
+ {
1160
+ "epoch": 11.35,
1161
+ "learning_rate": 2.986857825567503e-06,
1162
+ "loss": 0.2301,
1163
+ "step": 1760
1164
+ },
1165
+ {
1166
+ "epoch": 11.42,
1167
+ "learning_rate": 2.688172043010753e-06,
1168
+ "loss": 0.2307,
1169
+ "step": 1770
1170
+ },
1171
+ {
1172
+ "epoch": 11.48,
1173
+ "learning_rate": 2.3894862604540028e-06,
1174
+ "loss": 0.2216,
1175
+ "step": 1780
1176
+ },
1177
+ {
1178
+ "epoch": 11.55,
1179
+ "learning_rate": 2.090800477897252e-06,
1180
+ "loss": 0.2246,
1181
+ "step": 1790
1182
+ },
1183
+ {
1184
+ "epoch": 11.61,
1185
+ "learning_rate": 1.7921146953405017e-06,
1186
+ "loss": 0.2639,
1187
+ "step": 1800
1188
+ },
1189
+ {
1190
+ "epoch": 11.68,
1191
+ "learning_rate": 1.4934289127837516e-06,
1192
+ "loss": 0.2243,
1193
+ "step": 1810
1194
+ },
1195
+ {
1196
+ "epoch": 11.74,
1197
+ "learning_rate": 1.1947431302270014e-06,
1198
+ "loss": 0.2217,
1199
+ "step": 1820
1200
+ },
1201
+ {
1202
+ "epoch": 11.8,
1203
+ "learning_rate": 8.960573476702509e-07,
1204
+ "loss": 0.2324,
1205
+ "step": 1830
1206
+ },
1207
+ {
1208
+ "epoch": 11.87,
1209
+ "learning_rate": 5.973715651135007e-07,
1210
+ "loss": 0.2496,
1211
+ "step": 1840
1212
+ },
1213
+ {
1214
+ "epoch": 11.93,
1215
+ "learning_rate": 2.9868578255675034e-07,
1216
+ "loss": 0.2464,
1217
+ "step": 1850
1218
+ },
1219
+ {
1220
+ "epoch": 12.0,
1221
+ "learning_rate": 0.0,
1222
+ "loss": 0.2355,
1223
+ "step": 1860
1224
+ },
1225
+ {
1226
+ "epoch": 12.0,
1227
+ "eval_accuracy": 0.9638336347197106,
1228
+ "eval_loss": 0.11173169314861298,
1229
+ "eval_runtime": 228.1044,
1230
+ "eval_samples_per_second": 9.697,
1231
+ "eval_steps_per_second": 0.307,
1232
+ "step": 1860
1233
+ }
1234
+ ],
1235
+ "max_steps": 1860,
1236
+ "num_train_epochs": 12,
1237
+ "total_flos": 5.070182024513249e+18,
1238
+ "trial_name": null,
1239
+ "trial_params": null
1240
+ }