rshrott commited on
Commit
01ad907
1 Parent(s): 5481906

🍻 cheers

Browse files
README.md CHANGED
@@ -2,6 +2,7 @@
2
  license: apache-2.0
3
  base_model: google/vit-base-patch16-224-in21k
4
  tags:
 
5
  - generated_from_trainer
6
  model-index:
7
  - name: ryan_model314
@@ -13,11 +14,11 @@ should probably proofread and complete it, then remove this comment. -->
13
 
14
  # ryan_model314
15
 
16
- This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the None dataset.
17
  It achieves the following results on the evaluation set:
18
- - Loss: 0.3635
19
- - Na Accuracy: 0.939
20
- - Ordinal Accuracy: 0.6609
21
 
22
  ## Model description
23
 
 
2
  license: apache-2.0
3
  base_model: google/vit-base-patch16-224-in21k
4
  tags:
5
+ - image-classification
6
  - generated_from_trainer
7
  model-index:
8
  - name: ryan_model314
 
14
 
15
  # ryan_model314
16
 
17
+ This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the beans dataset.
18
  It achieves the following results on the evaluation set:
19
+ - Loss: 0.2532
20
+ - Na Accuracy: 0.947
21
+ - Ordinal Accuracy: 0.5952
22
 
23
  ## Model description
24
 
all_results.json CHANGED
@@ -1,14 +1,14 @@
1
  {
2
  "epoch": 4.0,
3
  "eval_accuracy": 0.628,
4
- "eval_loss": 0.24043463170528412,
5
- "eval_na_accuracy": 0.96,
6
- "eval_ordinal_accuracy": 0.5920745920745921,
7
- "eval_runtime": 19.2933,
8
- "eval_samples_per_second": 25.916,
9
- "eval_steps_per_second": 3.265,
10
- "train_loss": 0.15608444792060808,
11
- "train_runtime": 1745.0942,
12
- "train_samples_per_second": 11.461,
13
- "train_steps_per_second": 0.717
14
  }
 
1
  {
2
  "epoch": 4.0,
3
  "eval_accuracy": 0.628,
4
+ "eval_loss": 0.2531912922859192,
5
+ "eval_na_accuracy": 0.947,
6
+ "eval_ordinal_accuracy": 0.5951557093425606,
7
+ "eval_runtime": 41.4892,
8
+ "eval_samples_per_second": 24.103,
9
+ "eval_steps_per_second": 3.013,
10
+ "train_loss": 0.15650403581261635,
11
+ "train_runtime": 3981.9696,
12
+ "train_samples_per_second": 10.045,
13
+ "train_steps_per_second": 0.628
14
  }
eval_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 4.0,
3
- "eval_loss": 0.24043463170528412,
4
- "eval_na_accuracy": 0.96,
5
- "eval_ordinal_accuracy": 0.5920745920745921,
6
- "eval_runtime": 19.2933,
7
- "eval_samples_per_second": 25.916,
8
- "eval_steps_per_second": 3.265
9
  }
 
1
  {
2
  "epoch": 4.0,
3
+ "eval_loss": 0.2531912922859192,
4
+ "eval_na_accuracy": 0.947,
5
+ "eval_ordinal_accuracy": 0.5951557093425606,
6
+ "eval_runtime": 41.4892,
7
+ "eval_samples_per_second": 24.103,
8
+ "eval_steps_per_second": 3.013
9
  }
runs/Mar26_20-28-31_ryanserver/events.out.tfevents.1711503391.ryanserver.7179.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79bb34c6fad13e7161dccdd423749c0e0c7f10a793282d952ea4b3f94316cd47
3
+ size 474
train_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "epoch": 4.0,
3
- "train_loss": 0.15608444792060808,
4
- "train_runtime": 1745.0942,
5
- "train_samples_per_second": 11.461,
6
- "train_steps_per_second": 0.717
7
  }
 
1
  {
2
  "epoch": 4.0,
3
+ "train_loss": 0.15650403581261635,
4
+ "train_runtime": 3981.9696,
5
+ "train_samples_per_second": 10.045,
6
+ "train_steps_per_second": 0.628
7
  }
trainer_state.json CHANGED
@@ -1,1024 +1,2029 @@
1
  {
2
- "best_metric": 0.24043463170528412,
3
- "best_model_checkpoint": "./ryan_model314/checkpoint-500",
4
  "epoch": 4.0,
5
  "eval_steps": 100,
6
- "global_step": 1252,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.03,
13
- "grad_norm": 0.962435245513916,
14
- "learning_rate": 0.00019840255591054313,
15
- "loss": 0.5392,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.06,
20
- "grad_norm": 1.411149024963379,
21
- "learning_rate": 0.00019680511182108628,
22
- "loss": 0.3886,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.1,
27
- "grad_norm": 0.624254584312439,
28
- "learning_rate": 0.0001952076677316294,
29
- "loss": 0.3602,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.13,
34
- "grad_norm": 1.0637298822402954,
35
- "learning_rate": 0.00019361022364217253,
36
- "loss": 0.3841,
37
  "step": 40
38
  },
39
  {
40
- "epoch": 0.16,
41
- "grad_norm": 1.1236467361450195,
42
- "learning_rate": 0.00019201277955271565,
43
- "loss": 0.3312,
44
  "step": 50
45
  },
46
  {
47
- "epoch": 0.19,
48
- "grad_norm": 0.6737650632858276,
49
- "learning_rate": 0.0001904153354632588,
50
- "loss": 0.3419,
51
  "step": 60
52
  },
53
  {
54
- "epoch": 0.22,
55
- "grad_norm": 1.0528877973556519,
56
- "learning_rate": 0.00018881789137380192,
57
- "loss": 0.348,
58
  "step": 70
59
  },
60
  {
61
- "epoch": 0.26,
62
- "grad_norm": 1.2546306848526,
63
- "learning_rate": 0.00018722044728434505,
64
- "loss": 0.3884,
65
  "step": 80
66
  },
67
  {
68
- "epoch": 0.29,
69
- "grad_norm": 1.552256464958191,
70
- "learning_rate": 0.0001856230031948882,
71
- "loss": 0.4075,
72
  "step": 90
73
  },
74
  {
75
- "epoch": 0.32,
76
- "grad_norm": 1.334892988204956,
77
- "learning_rate": 0.00018402555910543132,
78
- "loss": 0.3682,
79
  "step": 100
80
  },
81
  {
82
- "epoch": 0.32,
83
- "eval_loss": 0.3208509385585785,
84
- "eval_na_accuracy": 0.942,
85
- "eval_ordinal_accuracy": 0.5268065268065268,
86
- "eval_runtime": 54.8874,
87
- "eval_samples_per_second": 9.11,
88
- "eval_steps_per_second": 1.148,
89
  "step": 100
90
  },
91
  {
92
- "epoch": 0.35,
93
- "grad_norm": 1.1707366704940796,
94
- "learning_rate": 0.00018242811501597444,
95
- "loss": 0.274,
96
  "step": 110
97
  },
98
  {
99
- "epoch": 0.38,
100
- "grad_norm": 0.9099497199058533,
101
- "learning_rate": 0.00018083067092651756,
102
- "loss": 0.3335,
103
  "step": 120
104
  },
105
  {
106
- "epoch": 0.42,
107
- "grad_norm": 0.9688892960548401,
108
- "learning_rate": 0.00017923322683706071,
109
- "loss": 0.2867,
110
  "step": 130
111
  },
112
  {
113
- "epoch": 0.45,
114
- "grad_norm": 0.5258199572563171,
115
- "learning_rate": 0.00017763578274760384,
116
- "loss": 0.2557,
117
  "step": 140
118
  },
119
  {
120
- "epoch": 0.48,
121
- "grad_norm": 0.8469595313072205,
122
- "learning_rate": 0.000176038338658147,
123
- "loss": 0.3041,
124
  "step": 150
125
  },
126
  {
127
- "epoch": 0.51,
128
- "grad_norm": 1.846753716468811,
129
- "learning_rate": 0.0001744408945686901,
130
- "loss": 0.2766,
131
  "step": 160
132
  },
133
  {
134
- "epoch": 0.54,
135
- "grad_norm": 0.4423494338989258,
136
- "learning_rate": 0.00017284345047923323,
137
- "loss": 0.2555,
138
  "step": 170
139
  },
140
  {
141
- "epoch": 0.58,
142
- "grad_norm": 2.977750062942505,
143
- "learning_rate": 0.00017124600638977638,
144
- "loss": 0.2742,
145
  "step": 180
146
  },
147
  {
148
- "epoch": 0.61,
149
- "grad_norm": 0.8587075471878052,
150
- "learning_rate": 0.00016964856230031948,
151
- "loss": 0.3036,
152
  "step": 190
153
  },
154
  {
155
- "epoch": 0.64,
156
- "grad_norm": 0.7394629120826721,
157
- "learning_rate": 0.00016805111821086263,
158
- "loss": 0.2877,
159
  "step": 200
160
  },
161
  {
162
- "epoch": 0.64,
163
- "eval_loss": 0.2782871127128601,
164
- "eval_na_accuracy": 0.944,
165
- "eval_ordinal_accuracy": 0.5920745920745921,
166
- "eval_runtime": 19.4606,
167
- "eval_samples_per_second": 25.693,
168
- "eval_steps_per_second": 3.237,
169
  "step": 200
170
  },
171
  {
172
- "epoch": 0.67,
173
- "grad_norm": 1.093324899673462,
174
- "learning_rate": 0.00016645367412140575,
175
- "loss": 0.2535,
176
  "step": 210
177
  },
178
  {
179
- "epoch": 0.7,
180
- "grad_norm": 1.0213277339935303,
181
- "learning_rate": 0.0001648562300319489,
182
- "loss": 0.2513,
183
  "step": 220
184
  },
185
  {
186
- "epoch": 0.73,
187
- "grad_norm": 1.1321722269058228,
188
- "learning_rate": 0.00016325878594249202,
189
- "loss": 0.3015,
190
  "step": 230
191
  },
192
  {
193
- "epoch": 0.77,
194
- "grad_norm": 3.1191928386688232,
195
- "learning_rate": 0.00016166134185303515,
196
- "loss": 0.2307,
197
  "step": 240
198
  },
199
  {
200
- "epoch": 0.8,
201
- "grad_norm": 1.107519507408142,
202
- "learning_rate": 0.0001600638977635783,
203
- "loss": 0.3198,
204
  "step": 250
205
  },
206
  {
207
- "epoch": 0.83,
208
- "grad_norm": 0.818793535232544,
209
- "learning_rate": 0.00015846645367412142,
210
- "loss": 0.2959,
211
  "step": 260
212
  },
213
  {
214
- "epoch": 0.86,
215
- "grad_norm": 0.4491446912288666,
216
- "learning_rate": 0.00015686900958466454,
217
- "loss": 0.212,
218
  "step": 270
219
  },
220
  {
221
- "epoch": 0.89,
222
- "grad_norm": 1.838297963142395,
223
- "learning_rate": 0.00015527156549520767,
224
- "loss": 0.2836,
225
  "step": 280
226
  },
227
  {
228
- "epoch": 0.93,
229
- "grad_norm": 1.4379059076309204,
230
- "learning_rate": 0.00015367412140575082,
231
- "loss": 0.309,
232
  "step": 290
233
  },
234
  {
235
- "epoch": 0.96,
236
- "grad_norm": 2.274575710296631,
237
- "learning_rate": 0.00015207667731629394,
238
- "loss": 0.3079,
239
  "step": 300
240
  },
241
  {
242
- "epoch": 0.96,
243
- "eval_loss": 0.2544197142124176,
244
- "eval_na_accuracy": 0.958,
245
- "eval_ordinal_accuracy": 0.5337995337995338,
246
- "eval_runtime": 19.3178,
247
- "eval_samples_per_second": 25.883,
248
- "eval_steps_per_second": 3.261,
249
  "step": 300
250
  },
251
  {
252
- "epoch": 0.99,
253
- "grad_norm": 0.7160611748695374,
254
- "learning_rate": 0.00015047923322683706,
255
- "loss": 0.223,
256
  "step": 310
257
  },
258
  {
259
- "epoch": 1.02,
260
- "grad_norm": 0.5523737668991089,
261
- "learning_rate": 0.0001488817891373802,
262
- "loss": 0.2784,
263
  "step": 320
264
  },
265
  {
266
- "epoch": 1.05,
267
- "grad_norm": 0.5268480777740479,
268
- "learning_rate": 0.00014728434504792333,
269
- "loss": 0.1671,
270
  "step": 330
271
  },
272
  {
273
- "epoch": 1.09,
274
- "grad_norm": 1.7597477436065674,
275
- "learning_rate": 0.00014568690095846646,
276
- "loss": 0.1519,
277
  "step": 340
278
  },
279
  {
280
- "epoch": 1.12,
281
- "grad_norm": 0.7296491861343384,
282
- "learning_rate": 0.00014408945686900958,
283
- "loss": 0.223,
284
  "step": 350
285
  },
286
  {
287
- "epoch": 1.15,
288
- "grad_norm": 0.3579563796520233,
289
- "learning_rate": 0.00014249201277955273,
290
- "loss": 0.1383,
291
  "step": 360
292
  },
293
  {
294
- "epoch": 1.18,
295
- "grad_norm": 1.5826038122177124,
296
- "learning_rate": 0.00014089456869009585,
297
- "loss": 0.2555,
298
  "step": 370
299
  },
300
  {
301
- "epoch": 1.21,
302
- "grad_norm": 1.290208339691162,
303
- "learning_rate": 0.000139297124600639,
304
- "loss": 0.1482,
305
  "step": 380
306
  },
307
  {
308
- "epoch": 1.25,
309
- "grad_norm": 0.9879806637763977,
310
- "learning_rate": 0.00013769968051118212,
311
- "loss": 0.1714,
312
  "step": 390
313
  },
314
  {
315
- "epoch": 1.28,
316
- "grad_norm": 0.6665166616439819,
317
- "learning_rate": 0.00013610223642172525,
318
- "loss": 0.1901,
319
  "step": 400
320
  },
321
  {
322
- "epoch": 1.28,
323
- "eval_loss": 0.26315367221832275,
324
- "eval_na_accuracy": 0.948,
325
- "eval_ordinal_accuracy": 0.6060606060606061,
326
- "eval_runtime": 19.5095,
327
- "eval_samples_per_second": 25.628,
328
- "eval_steps_per_second": 3.229,
329
  "step": 400
330
  },
331
  {
332
- "epoch": 1.31,
333
- "grad_norm": 0.5691338181495667,
334
- "learning_rate": 0.00013450479233226837,
335
- "loss": 0.21,
336
  "step": 410
337
  },
338
  {
339
- "epoch": 1.34,
340
- "grad_norm": 0.76593017578125,
341
- "learning_rate": 0.0001329073482428115,
342
- "loss": 0.1704,
343
  "step": 420
344
  },
345
  {
346
- "epoch": 1.37,
347
- "grad_norm": 0.6972767114639282,
348
- "learning_rate": 0.00013130990415335464,
349
- "loss": 0.1911,
350
  "step": 430
351
  },
352
  {
353
- "epoch": 1.41,
354
- "grad_norm": 0.6530088186264038,
355
- "learning_rate": 0.00012971246006389777,
356
- "loss": 0.128,
357
  "step": 440
358
  },
359
  {
360
- "epoch": 1.44,
361
- "grad_norm": 0.8316710591316223,
362
- "learning_rate": 0.00012811501597444092,
363
- "loss": 0.2046,
364
  "step": 450
365
  },
366
  {
367
- "epoch": 1.47,
368
- "grad_norm": 0.5624408721923828,
369
- "learning_rate": 0.00012651757188498404,
370
- "loss": 0.1888,
371
  "step": 460
372
  },
373
  {
374
- "epoch": 1.5,
375
- "grad_norm": 0.5218725800514221,
376
- "learning_rate": 0.00012492012779552716,
377
- "loss": 0.2096,
378
  "step": 470
379
  },
380
  {
381
- "epoch": 1.53,
382
- "grad_norm": 0.7621105909347534,
383
- "learning_rate": 0.00012332268370607028,
384
- "loss": 0.1682,
385
  "step": 480
386
  },
387
  {
388
- "epoch": 1.57,
389
- "grad_norm": 0.9870132803916931,
390
- "learning_rate": 0.00012172523961661342,
391
- "loss": 0.1606,
392
  "step": 490
393
  },
394
  {
395
- "epoch": 1.6,
396
- "grad_norm": 0.5003547072410583,
397
- "learning_rate": 0.00012012779552715656,
398
- "loss": 0.277,
399
  "step": 500
400
  },
401
  {
402
- "epoch": 1.6,
403
- "eval_loss": 0.24043463170528412,
404
- "eval_na_accuracy": 0.96,
405
- "eval_ordinal_accuracy": 0.5920745920745921,
406
- "eval_runtime": 19.0104,
407
- "eval_samples_per_second": 26.301,
408
- "eval_steps_per_second": 3.314,
409
  "step": 500
410
  },
411
  {
412
- "epoch": 1.63,
413
- "grad_norm": 2.554410457611084,
414
- "learning_rate": 0.00011853035143769968,
415
- "loss": 0.2163,
416
  "step": 510
417
  },
418
  {
419
- "epoch": 1.66,
420
- "grad_norm": 0.577261745929718,
421
- "learning_rate": 0.00011693290734824283,
422
- "loss": 0.1652,
423
  "step": 520
424
  },
425
  {
426
- "epoch": 1.69,
427
- "grad_norm": 0.722175121307373,
428
- "learning_rate": 0.00011533546325878595,
429
- "loss": 0.1061,
430
  "step": 530
431
  },
432
  {
433
- "epoch": 1.73,
434
- "grad_norm": 2.013876438140869,
435
- "learning_rate": 0.00011373801916932908,
436
- "loss": 0.2078,
437
  "step": 540
438
  },
439
  {
440
- "epoch": 1.76,
441
- "grad_norm": 2.5454280376434326,
442
- "learning_rate": 0.00011214057507987221,
443
- "loss": 0.238,
444
  "step": 550
445
  },
446
  {
447
- "epoch": 1.79,
448
- "grad_norm": 0.7590613961219788,
449
- "learning_rate": 0.00011054313099041533,
450
- "loss": 0.1868,
451
  "step": 560
452
  },
453
  {
454
- "epoch": 1.82,
455
- "grad_norm": 1.2678501605987549,
456
- "learning_rate": 0.00010894568690095847,
457
- "loss": 0.2365,
458
  "step": 570
459
  },
460
  {
461
- "epoch": 1.85,
462
- "grad_norm": 0.9851756691932678,
463
- "learning_rate": 0.0001073482428115016,
464
- "loss": 0.2362,
465
  "step": 580
466
  },
467
  {
468
- "epoch": 1.88,
469
- "grad_norm": 1.6259474754333496,
470
- "learning_rate": 0.00010575079872204474,
471
- "loss": 0.2368,
472
  "step": 590
473
  },
474
  {
475
- "epoch": 1.92,
476
- "grad_norm": 0.7762842178344727,
477
- "learning_rate": 0.00010415335463258787,
478
- "loss": 0.2081,
479
  "step": 600
480
  },
481
  {
482
- "epoch": 1.92,
483
- "eval_loss": 0.2585134506225586,
484
- "eval_na_accuracy": 0.95,
485
- "eval_ordinal_accuracy": 0.627039627039627,
486
- "eval_runtime": 19.183,
487
- "eval_samples_per_second": 26.065,
488
- "eval_steps_per_second": 3.284,
489
  "step": 600
490
  },
491
  {
492
- "epoch": 1.95,
493
- "grad_norm": 0.8312740325927734,
494
- "learning_rate": 0.000102555910543131,
495
- "loss": 0.1747,
496
  "step": 610
497
  },
498
  {
499
- "epoch": 1.98,
500
- "grad_norm": 1.1986323595046997,
501
- "learning_rate": 0.00010095846645367413,
502
- "loss": 0.146,
503
  "step": 620
504
  },
505
  {
506
- "epoch": 2.01,
507
- "grad_norm": 0.6852553486824036,
508
- "learning_rate": 9.936102236421726e-05,
509
- "loss": 0.1098,
510
  "step": 630
511
  },
512
  {
513
- "epoch": 2.04,
514
- "grad_norm": 0.49778783321380615,
515
- "learning_rate": 9.77635782747604e-05,
516
- "loss": 0.0807,
517
  "step": 640
518
  },
519
  {
520
- "epoch": 2.08,
521
- "grad_norm": 0.43836385011672974,
522
- "learning_rate": 9.616613418530351e-05,
523
- "loss": 0.0785,
524
  "step": 650
525
  },
526
  {
527
- "epoch": 2.11,
528
- "grad_norm": 0.4496062695980072,
529
- "learning_rate": 9.456869009584664e-05,
530
- "loss": 0.1007,
531
  "step": 660
532
  },
533
  {
534
- "epoch": 2.14,
535
- "grad_norm": 0.6552535891532898,
536
- "learning_rate": 9.297124600638978e-05,
537
- "loss": 0.0837,
538
  "step": 670
539
  },
540
  {
541
- "epoch": 2.17,
542
- "grad_norm": 0.8565073013305664,
543
- "learning_rate": 9.137380191693292e-05,
544
- "loss": 0.1288,
545
  "step": 680
546
  },
547
  {
548
- "epoch": 2.2,
549
- "grad_norm": 0.3386971056461334,
550
- "learning_rate": 8.977635782747604e-05,
551
- "loss": 0.0671,
552
  "step": 690
553
  },
554
  {
555
- "epoch": 2.24,
556
- "grad_norm": 0.6647264957427979,
557
- "learning_rate": 8.817891373801918e-05,
558
- "loss": 0.0809,
559
  "step": 700
560
  },
561
  {
562
- "epoch": 2.24,
563
- "eval_loss": 0.26309889554977417,
564
- "eval_na_accuracy": 0.954,
565
- "eval_ordinal_accuracy": 0.6736596736596736,
566
- "eval_runtime": 18.6607,
567
- "eval_samples_per_second": 26.794,
568
- "eval_steps_per_second": 3.376,
569
  "step": 700
570
  },
571
  {
572
- "epoch": 2.27,
573
- "grad_norm": 0.6069409251213074,
574
- "learning_rate": 8.658146964856231e-05,
575
- "loss": 0.0765,
576
  "step": 710
577
  },
578
  {
579
- "epoch": 2.3,
580
- "grad_norm": 1.3801295757293701,
581
- "learning_rate": 8.498402555910544e-05,
582
- "loss": 0.1077,
583
  "step": 720
584
  },
585
  {
586
- "epoch": 2.33,
587
- "grad_norm": 0.7153878808021545,
588
- "learning_rate": 8.338658146964856e-05,
589
- "loss": 0.075,
590
  "step": 730
591
  },
592
  {
593
- "epoch": 2.36,
594
- "grad_norm": 0.727155327796936,
595
- "learning_rate": 8.17891373801917e-05,
596
- "loss": 0.0639,
597
  "step": 740
598
  },
599
  {
600
- "epoch": 2.4,
601
- "grad_norm": 0.7251117825508118,
602
- "learning_rate": 8.019169329073483e-05,
603
- "loss": 0.1146,
604
  "step": 750
605
  },
606
  {
607
- "epoch": 2.43,
608
- "grad_norm": 0.7786515951156616,
609
- "learning_rate": 7.859424920127795e-05,
610
- "loss": 0.1122,
611
  "step": 760
612
  },
613
  {
614
- "epoch": 2.46,
615
- "grad_norm": 0.6191527843475342,
616
- "learning_rate": 7.699680511182109e-05,
617
- "loss": 0.0867,
618
  "step": 770
619
  },
620
  {
621
- "epoch": 2.49,
622
- "grad_norm": 0.4263085722923279,
623
- "learning_rate": 7.539936102236423e-05,
624
- "loss": 0.1325,
625
  "step": 780
626
  },
627
  {
628
- "epoch": 2.52,
629
- "grad_norm": 1.6009018421173096,
630
- "learning_rate": 7.380191693290735e-05,
631
- "loss": 0.0693,
632
  "step": 790
633
  },
634
  {
635
- "epoch": 2.56,
636
- "grad_norm": 0.37769052386283875,
637
- "learning_rate": 7.220447284345049e-05,
638
- "loss": 0.1,
639
  "step": 800
640
  },
641
  {
642
- "epoch": 2.56,
643
- "eval_loss": 0.2692907154560089,
644
- "eval_na_accuracy": 0.958,
645
- "eval_ordinal_accuracy": 0.6363636363636364,
646
- "eval_runtime": 19.3788,
647
- "eval_samples_per_second": 25.801,
648
- "eval_steps_per_second": 3.251,
649
  "step": 800
650
  },
651
  {
652
- "epoch": 2.59,
653
- "grad_norm": 0.45006510615348816,
654
- "learning_rate": 7.060702875399361e-05,
655
- "loss": 0.0645,
656
  "step": 810
657
  },
658
  {
659
- "epoch": 2.62,
660
- "grad_norm": 0.813955545425415,
661
- "learning_rate": 6.900958466453674e-05,
662
- "loss": 0.0583,
663
  "step": 820
664
  },
665
  {
666
- "epoch": 2.65,
667
- "grad_norm": 0.7967355847358704,
668
- "learning_rate": 6.741214057507987e-05,
669
- "loss": 0.1065,
670
  "step": 830
671
  },
672
  {
673
- "epoch": 2.68,
674
- "grad_norm": 0.8020057678222656,
675
- "learning_rate": 6.5814696485623e-05,
676
- "loss": 0.0821,
677
  "step": 840
678
  },
679
  {
680
- "epoch": 2.72,
681
- "grad_norm": 0.7547793388366699,
682
- "learning_rate": 6.421725239616614e-05,
683
- "loss": 0.0983,
684
  "step": 850
685
  },
686
  {
687
- "epoch": 2.75,
688
- "grad_norm": 0.42487284541130066,
689
- "learning_rate": 6.261980830670928e-05,
690
- "loss": 0.1221,
691
  "step": 860
692
  },
693
  {
694
- "epoch": 2.78,
695
- "grad_norm": 0.3974541425704956,
696
- "learning_rate": 6.1022364217252406e-05,
697
- "loss": 0.108,
698
  "step": 870
699
  },
700
  {
701
- "epoch": 2.81,
702
- "grad_norm": 0.42565950751304626,
703
- "learning_rate": 5.942492012779552e-05,
704
- "loss": 0.106,
705
  "step": 880
706
  },
707
  {
708
- "epoch": 2.84,
709
- "grad_norm": 0.5634491443634033,
710
- "learning_rate": 5.782747603833866e-05,
711
- "loss": 0.0765,
712
  "step": 890
713
  },
714
  {
715
- "epoch": 2.88,
716
- "grad_norm": 0.7806987166404724,
717
- "learning_rate": 5.623003194888179e-05,
718
- "loss": 0.0924,
719
  "step": 900
720
  },
721
  {
722
- "epoch": 2.88,
723
- "eval_loss": 0.2566128671169281,
724
- "eval_na_accuracy": 0.962,
725
- "eval_ordinal_accuracy": 0.6503496503496503,
726
- "eval_runtime": 19.1674,
727
- "eval_samples_per_second": 26.086,
728
- "eval_steps_per_second": 3.287,
729
  "step": 900
730
  },
731
  {
732
- "epoch": 2.91,
733
- "grad_norm": 1.5311517715454102,
734
- "learning_rate": 5.4632587859424925e-05,
735
- "loss": 0.0779,
736
  "step": 910
737
  },
738
  {
739
- "epoch": 2.94,
740
- "grad_norm": 0.8630849123001099,
741
- "learning_rate": 5.3035143769968054e-05,
742
- "loss": 0.0697,
743
  "step": 920
744
  },
745
  {
746
- "epoch": 2.97,
747
- "grad_norm": 0.39383065700531006,
748
- "learning_rate": 5.1437699680511184e-05,
749
- "loss": 0.065,
750
  "step": 930
751
  },
752
  {
753
- "epoch": 3.0,
754
- "grad_norm": 0.45358964800834656,
755
- "learning_rate": 4.984025559105431e-05,
756
- "loss": 0.0489,
757
  "step": 940
758
  },
759
  {
760
- "epoch": 3.04,
761
- "grad_norm": 0.39319777488708496,
762
- "learning_rate": 4.824281150159744e-05,
763
- "loss": 0.0311,
764
  "step": 950
765
  },
766
  {
767
- "epoch": 3.07,
768
- "grad_norm": 0.6434493064880371,
769
- "learning_rate": 4.664536741214058e-05,
770
- "loss": 0.0424,
771
  "step": 960
772
  },
773
  {
774
- "epoch": 3.1,
775
- "grad_norm": 0.4773056209087372,
776
- "learning_rate": 4.504792332268371e-05,
777
- "loss": 0.0326,
778
  "step": 970
779
  },
780
  {
781
- "epoch": 3.13,
782
- "grad_norm": 0.48347094655036926,
783
- "learning_rate": 4.345047923322684e-05,
784
- "loss": 0.0312,
785
  "step": 980
786
  },
787
  {
788
- "epoch": 3.16,
789
- "grad_norm": 0.410236120223999,
790
- "learning_rate": 4.185303514376997e-05,
791
- "loss": 0.0283,
792
  "step": 990
793
  },
794
  {
795
- "epoch": 3.19,
796
- "grad_norm": 0.3112216889858246,
797
- "learning_rate": 4.0255591054313104e-05,
798
- "loss": 0.0304,
799
  "step": 1000
800
  },
801
  {
802
- "epoch": 3.19,
803
- "eval_loss": 0.2782021760940552,
804
- "eval_na_accuracy": 0.954,
805
- "eval_ordinal_accuracy": 0.6526806526806527,
806
- "eval_runtime": 18.9723,
807
- "eval_samples_per_second": 26.354,
808
- "eval_steps_per_second": 3.321,
809
  "step": 1000
810
  },
811
  {
812
- "epoch": 3.23,
813
- "grad_norm": 0.3799457550048828,
814
- "learning_rate": 3.8658146964856234e-05,
815
- "loss": 0.0622,
816
  "step": 1010
817
  },
818
  {
819
- "epoch": 3.26,
820
- "grad_norm": 0.3940556049346924,
821
- "learning_rate": 3.7060702875399364e-05,
822
- "loss": 0.0275,
823
  "step": 1020
824
  },
825
  {
826
- "epoch": 3.29,
827
- "grad_norm": 0.5390946269035339,
828
- "learning_rate": 3.546325878594249e-05,
829
- "loss": 0.0531,
830
  "step": 1030
831
  },
832
  {
833
- "epoch": 3.32,
834
- "grad_norm": 0.5048585534095764,
835
- "learning_rate": 3.386581469648562e-05,
836
- "loss": 0.03,
837
  "step": 1040
838
  },
839
  {
840
- "epoch": 3.35,
841
- "grad_norm": 0.4280671775341034,
842
- "learning_rate": 3.242811501597444e-05,
843
- "loss": 0.0474,
844
  "step": 1050
845
  },
846
  {
847
- "epoch": 3.39,
848
- "grad_norm": 0.34902918338775635,
849
- "learning_rate": 3.083067092651757e-05,
850
- "loss": 0.0318,
851
  "step": 1060
852
  },
853
  {
854
- "epoch": 3.42,
855
- "grad_norm": 0.4760427176952362,
856
- "learning_rate": 2.9233226837060707e-05,
857
- "loss": 0.0285,
858
  "step": 1070
859
  },
860
  {
861
- "epoch": 3.45,
862
- "grad_norm": 0.15729285776615143,
863
- "learning_rate": 2.7635782747603834e-05,
864
- "loss": 0.0496,
865
  "step": 1080
866
  },
867
  {
868
- "epoch": 3.48,
869
- "grad_norm": 0.38401302695274353,
870
- "learning_rate": 2.6038338658146967e-05,
871
- "loss": 0.0252,
872
  "step": 1090
873
  },
874
  {
875
- "epoch": 3.51,
876
- "grad_norm": 0.3598209023475647,
877
- "learning_rate": 2.44408945686901e-05,
878
- "loss": 0.0256,
879
  "step": 1100
880
  },
881
  {
882
- "epoch": 3.51,
883
- "eval_loss": 0.26621493697166443,
884
- "eval_na_accuracy": 0.96,
885
- "eval_ordinal_accuracy": 0.6643356643356644,
886
- "eval_runtime": 19.2883,
887
- "eval_samples_per_second": 25.922,
888
- "eval_steps_per_second": 3.266,
889
  "step": 1100
890
  },
891
  {
892
- "epoch": 3.55,
893
- "grad_norm": 0.4313770830631256,
894
- "learning_rate": 2.284345047923323e-05,
895
- "loss": 0.0269,
896
  "step": 1110
897
  },
898
  {
899
- "epoch": 3.58,
900
- "grad_norm": 0.4230475425720215,
901
- "learning_rate": 2.124600638977636e-05,
902
- "loss": 0.0304,
903
  "step": 1120
904
  },
905
  {
906
- "epoch": 3.61,
907
- "grad_norm": 0.2662275433540344,
908
- "learning_rate": 1.964856230031949e-05,
909
- "loss": 0.0176,
910
  "step": 1130
911
  },
912
  {
913
- "epoch": 3.64,
914
- "grad_norm": 0.791589617729187,
915
- "learning_rate": 1.805111821086262e-05,
916
- "loss": 0.0307,
917
  "step": 1140
918
  },
919
  {
920
- "epoch": 3.67,
921
- "grad_norm": 0.49746203422546387,
922
- "learning_rate": 1.645367412140575e-05,
923
- "loss": 0.0217,
924
  "step": 1150
925
  },
926
  {
927
- "epoch": 3.71,
928
- "grad_norm": 0.18784604966640472,
929
- "learning_rate": 1.485623003194888e-05,
930
- "loss": 0.0206,
931
  "step": 1160
932
  },
933
  {
934
- "epoch": 3.74,
935
- "grad_norm": 0.47459328174591064,
936
- "learning_rate": 1.3258785942492014e-05,
937
- "loss": 0.0265,
938
  "step": 1170
939
  },
940
  {
941
- "epoch": 3.77,
942
- "grad_norm": 0.2313966453075409,
943
- "learning_rate": 1.1661341853035145e-05,
944
- "loss": 0.0313,
945
  "step": 1180
946
  },
947
  {
948
- "epoch": 3.8,
949
- "grad_norm": 0.4523174464702606,
950
- "learning_rate": 1.0063897763578276e-05,
951
- "loss": 0.0622,
952
  "step": 1190
953
  },
954
  {
955
- "epoch": 3.83,
956
- "grad_norm": 0.17250552773475647,
957
- "learning_rate": 8.466453674121406e-06,
958
- "loss": 0.0424,
959
  "step": 1200
960
  },
961
  {
962
- "epoch": 3.83,
963
- "eval_loss": 0.2670270502567291,
964
- "eval_na_accuracy": 0.962,
965
- "eval_ordinal_accuracy": 0.675990675990676,
966
- "eval_runtime": 18.6815,
967
- "eval_samples_per_second": 26.764,
968
- "eval_steps_per_second": 3.372,
969
  "step": 1200
970
  },
971
  {
972
- "epoch": 3.87,
973
- "grad_norm": 0.3159545660018921,
974
- "learning_rate": 6.869009584664538e-06,
975
- "loss": 0.0209,
976
  "step": 1210
977
  },
978
  {
979
- "epoch": 3.9,
980
- "grad_norm": 0.5657308101654053,
981
- "learning_rate": 5.2715654952076674e-06,
982
- "loss": 0.019,
983
  "step": 1220
984
  },
985
  {
986
- "epoch": 3.93,
987
- "grad_norm": 0.4505397379398346,
988
- "learning_rate": 3.6741214057507987e-06,
989
- "loss": 0.0249,
990
  "step": 1230
991
  },
992
  {
993
- "epoch": 3.96,
994
- "grad_norm": 0.21818359196186066,
995
- "learning_rate": 2.0766773162939296e-06,
996
- "loss": 0.0226,
997
  "step": 1240
998
  },
999
  {
1000
- "epoch": 3.99,
1001
- "grad_norm": 0.28112998604774475,
1002
- "learning_rate": 4.792332268370607e-07,
1003
- "loss": 0.027,
1004
  "step": 1250
1005
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1006
  {
1007
  "epoch": 4.0,
1008
- "step": 1252,
1009
- "total_flos": 1.5498953551872e+18,
1010
- "train_loss": 0.15608444792060808,
1011
- "train_runtime": 1745.0942,
1012
- "train_samples_per_second": 11.461,
1013
- "train_steps_per_second": 0.717
1014
  }
1015
  ],
1016
  "logging_steps": 10,
1017
- "max_steps": 1252,
1018
  "num_input_tokens_seen": 0,
1019
  "num_train_epochs": 4,
1020
  "save_steps": 100,
1021
- "total_flos": 1.5498953551872e+18,
1022
  "train_batch_size": 16,
1023
  "trial_name": null,
1024
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.2531912922859192,
3
+ "best_model_checkpoint": "./ryan_model314/checkpoint-600",
4
  "epoch": 4.0,
5
  "eval_steps": 100,
6
+ "global_step": 2500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.02,
13
+ "grad_norm": 1.1103402376174927,
14
+ "learning_rate": 0.00019920000000000002,
15
+ "loss": 0.5731,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.03,
20
+ "grad_norm": 0.8193413019180298,
21
+ "learning_rate": 0.0001984,
22
+ "loss": 0.4217,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.05,
27
+ "grad_norm": 1.3536686897277832,
28
+ "learning_rate": 0.0001976,
29
+ "loss": 0.3709,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.06,
34
+ "grad_norm": 0.9998810887336731,
35
+ "learning_rate": 0.0001968,
36
+ "loss": 0.3398,
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 0.08,
41
+ "grad_norm": 1.5689244270324707,
42
+ "learning_rate": 0.000196,
43
+ "loss": 0.3346,
44
  "step": 50
45
  },
46
  {
47
+ "epoch": 0.1,
48
+ "grad_norm": 1.1778826713562012,
49
+ "learning_rate": 0.0001952,
50
+ "loss": 0.3406,
51
  "step": 60
52
  },
53
  {
54
+ "epoch": 0.11,
55
+ "grad_norm": 1.3193926811218262,
56
+ "learning_rate": 0.0001944,
57
+ "loss": 0.2755,
58
  "step": 70
59
  },
60
  {
61
+ "epoch": 0.13,
62
+ "grad_norm": 1.1302804946899414,
63
+ "learning_rate": 0.00019360000000000002,
64
+ "loss": 0.3944,
65
  "step": 80
66
  },
67
  {
68
+ "epoch": 0.14,
69
+ "grad_norm": 0.8255844712257385,
70
+ "learning_rate": 0.0001928,
71
+ "loss": 0.3473,
72
  "step": 90
73
  },
74
  {
75
+ "epoch": 0.16,
76
+ "grad_norm": 1.0871790647506714,
77
+ "learning_rate": 0.000192,
78
+ "loss": 0.3042,
79
  "step": 100
80
  },
81
  {
82
+ "epoch": 0.16,
83
+ "eval_loss": 0.3673088252544403,
84
+ "eval_na_accuracy": 0.928,
85
+ "eval_ordinal_accuracy": 0.4671280276816609,
86
+ "eval_runtime": 110.8646,
87
+ "eval_samples_per_second": 9.02,
88
+ "eval_steps_per_second": 1.128,
89
  "step": 100
90
  },
91
  {
92
+ "epoch": 0.18,
93
+ "grad_norm": 1.5816177129745483,
94
+ "learning_rate": 0.0001912,
95
+ "loss": 0.4058,
96
  "step": 110
97
  },
98
  {
99
+ "epoch": 0.19,
100
+ "grad_norm": 0.8431822061538696,
101
+ "learning_rate": 0.0001904,
102
+ "loss": 0.2781,
103
  "step": 120
104
  },
105
  {
106
+ "epoch": 0.21,
107
+ "grad_norm": 1.0826754570007324,
108
+ "learning_rate": 0.0001896,
109
+ "loss": 0.2587,
110
  "step": 130
111
  },
112
  {
113
+ "epoch": 0.22,
114
+ "grad_norm": 3.7366294860839844,
115
+ "learning_rate": 0.0001888,
116
+ "loss": 0.3432,
117
  "step": 140
118
  },
119
  {
120
+ "epoch": 0.24,
121
+ "grad_norm": 0.5233088731765747,
122
+ "learning_rate": 0.000188,
123
+ "loss": 0.3484,
124
  "step": 150
125
  },
126
  {
127
+ "epoch": 0.26,
128
+ "grad_norm": 1.8766111135482788,
129
+ "learning_rate": 0.00018720000000000002,
130
+ "loss": 0.3597,
131
  "step": 160
132
  },
133
  {
134
+ "epoch": 0.27,
135
+ "grad_norm": 1.0037935972213745,
136
+ "learning_rate": 0.00018640000000000003,
137
+ "loss": 0.288,
138
  "step": 170
139
  },
140
  {
141
+ "epoch": 0.29,
142
+ "grad_norm": 1.3281046152114868,
143
+ "learning_rate": 0.0001856,
144
+ "loss": 0.3207,
145
  "step": 180
146
  },
147
  {
148
+ "epoch": 0.3,
149
+ "grad_norm": 1.4793013334274292,
150
+ "learning_rate": 0.00018480000000000002,
151
+ "loss": 0.3372,
152
  "step": 190
153
  },
154
  {
155
+ "epoch": 0.32,
156
+ "grad_norm": 0.8796727657318115,
157
+ "learning_rate": 0.00018400000000000003,
158
+ "loss": 0.2904,
159
  "step": 200
160
  },
161
  {
162
+ "epoch": 0.32,
163
+ "eval_loss": 0.29769936203956604,
164
+ "eval_na_accuracy": 0.933,
165
+ "eval_ordinal_accuracy": 0.5790080738177624,
166
+ "eval_runtime": 39.6284,
167
+ "eval_samples_per_second": 25.234,
168
+ "eval_steps_per_second": 3.154,
169
  "step": 200
170
  },
171
  {
172
+ "epoch": 0.34,
173
+ "grad_norm": 1.3502057790756226,
174
+ "learning_rate": 0.0001832,
175
+ "loss": 0.3519,
176
  "step": 210
177
  },
178
  {
179
+ "epoch": 0.35,
180
+ "grad_norm": 1.5546174049377441,
181
+ "learning_rate": 0.00018240000000000002,
182
+ "loss": 0.3243,
183
  "step": 220
184
  },
185
  {
186
+ "epoch": 0.37,
187
+ "grad_norm": 0.7677227854728699,
188
+ "learning_rate": 0.00018160000000000002,
189
+ "loss": 0.2914,
190
  "step": 230
191
  },
192
  {
193
+ "epoch": 0.38,
194
+ "grad_norm": 1.1754639148712158,
195
+ "learning_rate": 0.0001808,
196
+ "loss": 0.3539,
197
  "step": 240
198
  },
199
  {
200
+ "epoch": 0.4,
201
+ "grad_norm": 0.8472470641136169,
202
+ "learning_rate": 0.00018,
203
+ "loss": 0.2395,
204
  "step": 250
205
  },
206
  {
207
+ "epoch": 0.42,
208
+ "grad_norm": 1.1917964220046997,
209
+ "learning_rate": 0.00017920000000000002,
210
+ "loss": 0.2295,
211
  "step": 260
212
  },
213
  {
214
+ "epoch": 0.43,
215
+ "grad_norm": 0.7398644685745239,
216
+ "learning_rate": 0.0001784,
217
+ "loss": 0.2398,
218
  "step": 270
219
  },
220
  {
221
+ "epoch": 0.45,
222
+ "grad_norm": 0.5953208804130554,
223
+ "learning_rate": 0.0001776,
224
+ "loss": 0.2786,
225
  "step": 280
226
  },
227
  {
228
+ "epoch": 0.46,
229
+ "grad_norm": 2.0648913383483887,
230
+ "learning_rate": 0.00017680000000000001,
231
+ "loss": 0.3661,
232
  "step": 290
233
  },
234
  {
235
+ "epoch": 0.48,
236
+ "grad_norm": 1.4048805236816406,
237
+ "learning_rate": 0.00017600000000000002,
238
+ "loss": 0.2648,
239
  "step": 300
240
  },
241
  {
242
+ "epoch": 0.48,
243
+ "eval_loss": 0.2830840051174164,
244
+ "eval_na_accuracy": 0.944,
245
+ "eval_ordinal_accuracy": 0.5940023068050749,
246
+ "eval_runtime": 39.7255,
247
+ "eval_samples_per_second": 25.173,
248
+ "eval_steps_per_second": 3.147,
249
  "step": 300
250
  },
251
  {
252
+ "epoch": 0.5,
253
+ "grad_norm": 0.8102580904960632,
254
+ "learning_rate": 0.0001752,
255
+ "loss": 0.2359,
256
  "step": 310
257
  },
258
  {
259
+ "epoch": 0.51,
260
+ "grad_norm": 2.0220913887023926,
261
+ "learning_rate": 0.0001744,
262
+ "loss": 0.2557,
263
  "step": 320
264
  },
265
  {
266
+ "epoch": 0.53,
267
+ "grad_norm": 1.2111886739730835,
268
+ "learning_rate": 0.00017360000000000002,
269
+ "loss": 0.3025,
270
  "step": 330
271
  },
272
  {
273
+ "epoch": 0.54,
274
+ "grad_norm": 1.788378119468689,
275
+ "learning_rate": 0.0001728,
276
+ "loss": 0.3067,
277
  "step": 340
278
  },
279
  {
280
+ "epoch": 0.56,
281
+ "grad_norm": 0.7332974076271057,
282
+ "learning_rate": 0.000172,
283
+ "loss": 0.2612,
284
  "step": 350
285
  },
286
  {
287
+ "epoch": 0.58,
288
+ "grad_norm": 0.5220205783843994,
289
+ "learning_rate": 0.00017120000000000001,
290
+ "loss": 0.2924,
291
  "step": 360
292
  },
293
  {
294
+ "epoch": 0.59,
295
+ "grad_norm": 0.8991191387176514,
296
+ "learning_rate": 0.0001704,
297
+ "loss": 0.2379,
298
  "step": 370
299
  },
300
  {
301
+ "epoch": 0.61,
302
+ "grad_norm": 1.6633837223052979,
303
+ "learning_rate": 0.0001696,
304
+ "loss": 0.2792,
305
  "step": 380
306
  },
307
  {
308
+ "epoch": 0.62,
309
+ "grad_norm": 0.9553330540657043,
310
+ "learning_rate": 0.0001688,
311
+ "loss": 0.2512,
312
  "step": 390
313
  },
314
  {
315
+ "epoch": 0.64,
316
+ "grad_norm": 0.4544942080974579,
317
+ "learning_rate": 0.000168,
318
+ "loss": 0.3036,
319
  "step": 400
320
  },
321
  {
322
+ "epoch": 0.64,
323
+ "eval_loss": 0.27759096026420593,
324
+ "eval_na_accuracy": 0.949,
325
+ "eval_ordinal_accuracy": 0.5870818915801614,
326
+ "eval_runtime": 39.7611,
327
+ "eval_samples_per_second": 25.15,
328
+ "eval_steps_per_second": 3.144,
329
  "step": 400
330
  },
331
  {
332
+ "epoch": 0.66,
333
+ "grad_norm": 1.2751814126968384,
334
+ "learning_rate": 0.0001672,
335
+ "loss": 0.3042,
336
  "step": 410
337
  },
338
  {
339
+ "epoch": 0.67,
340
+ "grad_norm": 1.791074514389038,
341
+ "learning_rate": 0.0001664,
342
+ "loss": 0.3341,
343
  "step": 420
344
  },
345
  {
346
+ "epoch": 0.69,
347
+ "grad_norm": 0.9887642860412598,
348
+ "learning_rate": 0.0001656,
349
+ "loss": 0.2868,
350
  "step": 430
351
  },
352
  {
353
+ "epoch": 0.7,
354
+ "grad_norm": 1.3511923551559448,
355
+ "learning_rate": 0.0001648,
356
+ "loss": 0.3763,
357
  "step": 440
358
  },
359
  {
360
+ "epoch": 0.72,
361
+ "grad_norm": 1.7992609739303589,
362
+ "learning_rate": 0.000164,
363
+ "loss": 0.2264,
364
  "step": 450
365
  },
366
  {
367
+ "epoch": 0.74,
368
+ "grad_norm": 1.0241813659667969,
369
+ "learning_rate": 0.0001632,
370
+ "loss": 0.3018,
371
  "step": 460
372
  },
373
  {
374
+ "epoch": 0.75,
375
+ "grad_norm": 0.628193736076355,
376
+ "learning_rate": 0.00016240000000000002,
377
+ "loss": 0.3323,
378
  "step": 470
379
  },
380
  {
381
+ "epoch": 0.77,
382
+ "grad_norm": 0.8471026420593262,
383
+ "learning_rate": 0.00016160000000000002,
384
+ "loss": 0.2005,
385
  "step": 480
386
  },
387
  {
388
+ "epoch": 0.78,
389
+ "grad_norm": 1.1799852848052979,
390
+ "learning_rate": 0.0001608,
391
+ "loss": 0.2984,
392
  "step": 490
393
  },
394
  {
395
+ "epoch": 0.8,
396
+ "grad_norm": 1.595058560371399,
397
+ "learning_rate": 0.00016,
398
+ "loss": 0.2656,
399
  "step": 500
400
  },
401
  {
402
+ "epoch": 0.8,
403
+ "eval_loss": 0.2846027612686157,
404
+ "eval_na_accuracy": 0.931,
405
+ "eval_ordinal_accuracy": 0.6101499423298731,
406
+ "eval_runtime": 39.9154,
407
+ "eval_samples_per_second": 25.053,
408
+ "eval_steps_per_second": 3.132,
409
  "step": 500
410
  },
411
  {
412
+ "epoch": 0.82,
413
+ "grad_norm": 0.9246352910995483,
414
+ "learning_rate": 0.00015920000000000002,
415
+ "loss": 0.3591,
416
  "step": 510
417
  },
418
  {
419
+ "epoch": 0.83,
420
+ "grad_norm": 0.9456105828285217,
421
+ "learning_rate": 0.00015840000000000003,
422
+ "loss": 0.3569,
423
  "step": 520
424
  },
425
  {
426
+ "epoch": 0.85,
427
+ "grad_norm": 1.111274003982544,
428
+ "learning_rate": 0.0001576,
429
+ "loss": 0.3243,
430
  "step": 530
431
  },
432
  {
433
+ "epoch": 0.86,
434
+ "grad_norm": NaN,
435
+ "learning_rate": 0.00015688,
436
+ "loss": 0.2911,
437
  "step": 540
438
  },
439
  {
440
+ "epoch": 0.88,
441
+ "grad_norm": 0.8232502341270447,
442
+ "learning_rate": 0.00015616000000000002,
443
+ "loss": 0.3236,
444
  "step": 550
445
  },
446
  {
447
+ "epoch": 0.9,
448
+ "grad_norm": 0.6359846591949463,
449
+ "learning_rate": 0.00015536,
450
+ "loss": 0.3211,
451
  "step": 560
452
  },
453
  {
454
+ "epoch": 0.91,
455
+ "grad_norm": 0.545005738735199,
456
+ "learning_rate": 0.00015456,
457
+ "loss": 0.205,
458
  "step": 570
459
  },
460
  {
461
+ "epoch": 0.93,
462
+ "grad_norm": 0.6029797196388245,
463
+ "learning_rate": 0.00015376000000000002,
464
+ "loss": 0.1928,
465
  "step": 580
466
  },
467
  {
468
+ "epoch": 0.94,
469
+ "grad_norm": 0.7442355155944824,
470
+ "learning_rate": 0.00015296000000000003,
471
+ "loss": 0.3273,
472
  "step": 590
473
  },
474
  {
475
+ "epoch": 0.96,
476
+ "grad_norm": 0.6751519441604614,
477
+ "learning_rate": 0.00015216,
478
+ "loss": 0.2954,
479
  "step": 600
480
  },
481
  {
482
+ "epoch": 0.96,
483
+ "eval_loss": 0.2531912922859192,
484
+ "eval_na_accuracy": 0.947,
485
+ "eval_ordinal_accuracy": 0.5951557093425606,
486
+ "eval_runtime": 39.4037,
487
+ "eval_samples_per_second": 25.378,
488
+ "eval_steps_per_second": 3.172,
489
  "step": 600
490
  },
491
  {
492
+ "epoch": 0.98,
493
+ "grad_norm": 1.6347012519836426,
494
+ "learning_rate": 0.00015136000000000001,
495
+ "loss": 0.2256,
496
  "step": 610
497
  },
498
  {
499
+ "epoch": 0.99,
500
+ "grad_norm": 6.180319309234619,
501
+ "learning_rate": 0.00015056000000000002,
502
+ "loss": 0.2003,
503
  "step": 620
504
  },
505
  {
506
+ "epoch": 1.01,
507
+ "grad_norm": 0.8919633626937866,
508
+ "learning_rate": 0.00014976,
509
+ "loss": 0.198,
510
  "step": 630
511
  },
512
  {
513
+ "epoch": 1.02,
514
+ "grad_norm": 0.9197341203689575,
515
+ "learning_rate": 0.00014896,
516
+ "loss": 0.1685,
517
  "step": 640
518
  },
519
  {
520
+ "epoch": 1.04,
521
+ "grad_norm": 0.37014976143836975,
522
+ "learning_rate": 0.00014816000000000002,
523
+ "loss": 0.1729,
524
  "step": 650
525
  },
526
  {
527
+ "epoch": 1.06,
528
+ "grad_norm": 0.8919755220413208,
529
+ "learning_rate": 0.00014736,
530
+ "loss": 0.1993,
531
  "step": 660
532
  },
533
  {
534
+ "epoch": 1.07,
535
+ "grad_norm": 0.7291600704193115,
536
+ "learning_rate": 0.00014656,
537
+ "loss": 0.1893,
538
  "step": 670
539
  },
540
  {
541
+ "epoch": 1.09,
542
+ "grad_norm": 2.347400665283203,
543
+ "learning_rate": 0.00014576000000000001,
544
+ "loss": 0.1799,
545
  "step": 680
546
  },
547
  {
548
+ "epoch": 1.1,
549
+ "grad_norm": 0.3188568949699402,
550
+ "learning_rate": 0.00014496,
551
+ "loss": 0.167,
552
  "step": 690
553
  },
554
  {
555
+ "epoch": 1.12,
556
+ "grad_norm": 1.342278242111206,
557
+ "learning_rate": 0.00014416,
558
+ "loss": 0.1991,
559
  "step": 700
560
  },
561
  {
562
+ "epoch": 1.12,
563
+ "eval_loss": 0.26034072041511536,
564
+ "eval_na_accuracy": 0.942,
565
+ "eval_ordinal_accuracy": 0.6078431372549019,
566
+ "eval_runtime": 39.5088,
567
+ "eval_samples_per_second": 25.311,
568
+ "eval_steps_per_second": 3.164,
569
  "step": 700
570
  },
571
  {
572
+ "epoch": 1.14,
573
+ "grad_norm": 0.48655757308006287,
574
+ "learning_rate": 0.00014336,
575
+ "loss": 0.1885,
576
  "step": 710
577
  },
578
  {
579
+ "epoch": 1.15,
580
+ "grad_norm": 0.556333065032959,
581
+ "learning_rate": 0.00014256000000000002,
582
+ "loss": 0.1449,
583
  "step": 720
584
  },
585
  {
586
+ "epoch": 1.17,
587
+ "grad_norm": 0.4880894422531128,
588
+ "learning_rate": 0.00014176,
589
+ "loss": 0.1164,
590
  "step": 730
591
  },
592
  {
593
+ "epoch": 1.18,
594
+ "grad_norm": 0.599926233291626,
595
+ "learning_rate": 0.00014096,
596
+ "loss": 0.2113,
597
  "step": 740
598
  },
599
  {
600
+ "epoch": 1.2,
601
+ "grad_norm": 0.6070149540901184,
602
+ "learning_rate": 0.00014016,
603
+ "loss": 0.1534,
604
  "step": 750
605
  },
606
  {
607
+ "epoch": 1.22,
608
+ "grad_norm": 0.7789746522903442,
609
+ "learning_rate": 0.00013936,
610
+ "loss": 0.1655,
611
  "step": 760
612
  },
613
  {
614
+ "epoch": 1.23,
615
+ "grad_norm": 0.5523375868797302,
616
+ "learning_rate": 0.00013856,
617
+ "loss": 0.298,
618
  "step": 770
619
  },
620
  {
621
+ "epoch": 1.25,
622
+ "grad_norm": 2.4257819652557373,
623
+ "learning_rate": 0.00013776,
624
+ "loss": 0.2101,
625
  "step": 780
626
  },
627
  {
628
+ "epoch": 1.26,
629
+ "grad_norm": 0.5729731321334839,
630
+ "learning_rate": 0.00013696,
631
+ "loss": 0.133,
632
  "step": 790
633
  },
634
  {
635
+ "epoch": 1.28,
636
+ "grad_norm": 0.4050444960594177,
637
+ "learning_rate": 0.00013616,
638
+ "loss": 0.1678,
639
  "step": 800
640
  },
641
  {
642
+ "epoch": 1.28,
643
+ "eval_loss": 0.2904650568962097,
644
+ "eval_na_accuracy": 0.942,
645
+ "eval_ordinal_accuracy": 0.6332179930795848,
646
+ "eval_runtime": 39.7139,
647
+ "eval_samples_per_second": 25.18,
648
+ "eval_steps_per_second": 3.148,
649
  "step": 800
650
  },
651
  {
652
+ "epoch": 1.3,
653
+ "grad_norm": 4.782747745513916,
654
+ "learning_rate": 0.00013536,
655
+ "loss": 0.207,
656
  "step": 810
657
  },
658
  {
659
+ "epoch": 1.31,
660
+ "grad_norm": 2.577669143676758,
661
+ "learning_rate": 0.00013455999999999999,
662
+ "loss": 0.1818,
663
  "step": 820
664
  },
665
  {
666
+ "epoch": 1.33,
667
+ "grad_norm": 2.8163273334503174,
668
+ "learning_rate": 0.00013376,
669
+ "loss": 0.1761,
670
  "step": 830
671
  },
672
  {
673
+ "epoch": 1.34,
674
+ "grad_norm": 2.213799238204956,
675
+ "learning_rate": 0.00013296,
676
+ "loss": 0.2966,
677
  "step": 840
678
  },
679
  {
680
+ "epoch": 1.36,
681
+ "grad_norm": 0.8946444988250732,
682
+ "learning_rate": 0.00013216,
683
+ "loss": 0.1569,
684
  "step": 850
685
  },
686
  {
687
+ "epoch": 1.38,
688
+ "grad_norm": 0.6494708061218262,
689
+ "learning_rate": 0.00013136000000000002,
690
+ "loss": 0.1746,
691
  "step": 860
692
  },
693
  {
694
+ "epoch": 1.39,
695
+ "grad_norm": 1.0058079957962036,
696
+ "learning_rate": 0.00013056000000000002,
697
+ "loss": 0.1204,
698
  "step": 870
699
  },
700
  {
701
+ "epoch": 1.41,
702
+ "grad_norm": 1.1752161979675293,
703
+ "learning_rate": 0.00012976,
704
+ "loss": 0.2082,
705
  "step": 880
706
  },
707
  {
708
+ "epoch": 1.42,
709
+ "grad_norm": 0.5655858516693115,
710
+ "learning_rate": 0.00012896,
711
+ "loss": 0.1971,
712
  "step": 890
713
  },
714
  {
715
+ "epoch": 1.44,
716
+ "grad_norm": 2.5486743450164795,
717
+ "learning_rate": 0.00012816000000000002,
718
+ "loss": 0.2514,
719
  "step": 900
720
  },
721
  {
722
+ "epoch": 1.44,
723
+ "eval_loss": 0.25656750798225403,
724
+ "eval_na_accuracy": 0.94,
725
+ "eval_ordinal_accuracy": 0.6089965397923875,
726
+ "eval_runtime": 39.7194,
727
+ "eval_samples_per_second": 25.177,
728
+ "eval_steps_per_second": 3.147,
729
  "step": 900
730
  },
731
  {
732
+ "epoch": 1.46,
733
+ "grad_norm": 0.878511369228363,
734
+ "learning_rate": 0.00012736,
735
+ "loss": 0.162,
736
  "step": 910
737
  },
738
  {
739
+ "epoch": 1.47,
740
+ "grad_norm": 1.1985282897949219,
741
+ "learning_rate": 0.00012656,
742
+ "loss": 0.2268,
743
  "step": 920
744
  },
745
  {
746
+ "epoch": 1.49,
747
+ "grad_norm": 0.521425187587738,
748
+ "learning_rate": 0.00012576000000000002,
749
+ "loss": 0.1556,
750
  "step": 930
751
  },
752
  {
753
+ "epoch": 1.5,
754
+ "grad_norm": 0.9773241877555847,
755
+ "learning_rate": 0.00012496000000000002,
756
+ "loss": 0.1457,
757
  "step": 940
758
  },
759
  {
760
+ "epoch": 1.52,
761
+ "grad_norm": 1.6476322412490845,
762
+ "learning_rate": 0.00012416,
763
+ "loss": 0.1913,
764
  "step": 950
765
  },
766
  {
767
+ "epoch": 1.54,
768
+ "grad_norm": 1.7127236127853394,
769
+ "learning_rate": 0.00012336,
770
+ "loss": 0.1961,
771
  "step": 960
772
  },
773
  {
774
+ "epoch": 1.55,
775
+ "grad_norm": 4.41243314743042,
776
+ "learning_rate": 0.00012256000000000002,
777
+ "loss": 0.2061,
778
  "step": 970
779
  },
780
  {
781
+ "epoch": 1.57,
782
+ "grad_norm": 1.5907992124557495,
783
+ "learning_rate": 0.00012176000000000001,
784
+ "loss": 0.1299,
785
  "step": 980
786
  },
787
  {
788
+ "epoch": 1.58,
789
+ "grad_norm": 0.5711427927017212,
790
+ "learning_rate": 0.00012096000000000001,
791
+ "loss": 0.1755,
792
  "step": 990
793
  },
794
  {
795
+ "epoch": 1.6,
796
+ "grad_norm": 2.925363302230835,
797
+ "learning_rate": 0.00012016,
798
+ "loss": 0.2328,
799
  "step": 1000
800
  },
801
  {
802
+ "epoch": 1.6,
803
+ "eval_loss": 0.2884255647659302,
804
+ "eval_na_accuracy": 0.94,
805
+ "eval_ordinal_accuracy": 0.5617070357554786,
806
+ "eval_runtime": 39.4898,
807
+ "eval_samples_per_second": 25.323,
808
+ "eval_steps_per_second": 3.165,
809
  "step": 1000
810
  },
811
  {
812
+ "epoch": 1.62,
813
+ "grad_norm": 1.1306260824203491,
814
+ "learning_rate": 0.00011936000000000001,
815
+ "loss": 0.1595,
816
  "step": 1010
817
  },
818
  {
819
+ "epoch": 1.63,
820
+ "grad_norm": 1.8953267335891724,
821
+ "learning_rate": 0.00011856,
822
+ "loss": 0.2489,
823
  "step": 1020
824
  },
825
  {
826
+ "epoch": 1.65,
827
+ "grad_norm": 0.7074128985404968,
828
+ "learning_rate": 0.00011776,
829
+ "loss": 0.2485,
830
  "step": 1030
831
  },
832
  {
833
+ "epoch": 1.66,
834
+ "grad_norm": 0.7052355408668518,
835
+ "learning_rate": 0.00011696,
836
+ "loss": 0.2075,
837
  "step": 1040
838
  },
839
  {
840
+ "epoch": 1.68,
841
+ "grad_norm": 0.7830259203910828,
842
+ "learning_rate": 0.00011616,
843
+ "loss": 0.2346,
844
  "step": 1050
845
  },
846
  {
847
+ "epoch": 1.7,
848
+ "grad_norm": 0.5882430672645569,
849
+ "learning_rate": 0.00011536000000000001,
850
+ "loss": 0.2136,
851
  "step": 1060
852
  },
853
  {
854
+ "epoch": 1.71,
855
+ "grad_norm": 1.0235962867736816,
856
+ "learning_rate": 0.00011456,
857
+ "loss": 0.1753,
858
  "step": 1070
859
  },
860
  {
861
+ "epoch": 1.73,
862
+ "grad_norm": 0.9401603937149048,
863
+ "learning_rate": 0.00011376,
864
+ "loss": 0.173,
865
  "step": 1080
866
  },
867
  {
868
+ "epoch": 1.74,
869
+ "grad_norm": 1.0735399723052979,
870
+ "learning_rate": 0.00011296,
871
+ "loss": 0.1993,
872
  "step": 1090
873
  },
874
  {
875
+ "epoch": 1.76,
876
+ "grad_norm": 0.6592912673950195,
877
+ "learning_rate": 0.00011216,
878
+ "loss": 0.1826,
879
  "step": 1100
880
  },
881
  {
882
+ "epoch": 1.76,
883
+ "eval_loss": 0.2869604229927063,
884
+ "eval_na_accuracy": 0.943,
885
+ "eval_ordinal_accuracy": 0.6043829296424452,
886
+ "eval_runtime": 39.3069,
887
+ "eval_samples_per_second": 25.441,
888
+ "eval_steps_per_second": 3.18,
889
  "step": 1100
890
  },
891
  {
892
+ "epoch": 1.78,
893
+ "grad_norm": 2.1649601459503174,
894
+ "learning_rate": 0.00011135999999999999,
895
+ "loss": 0.221,
896
  "step": 1110
897
  },
898
  {
899
+ "epoch": 1.79,
900
+ "grad_norm": 1.7881801128387451,
901
+ "learning_rate": 0.00011056,
902
+ "loss": 0.1765,
903
  "step": 1120
904
  },
905
  {
906
+ "epoch": 1.81,
907
+ "grad_norm": 1.3527191877365112,
908
+ "learning_rate": 0.00010975999999999999,
909
+ "loss": 0.1325,
910
  "step": 1130
911
  },
912
  {
913
+ "epoch": 1.82,
914
+ "grad_norm": 0.7212499976158142,
915
+ "learning_rate": 0.00010896,
916
+ "loss": 0.2445,
917
  "step": 1140
918
  },
919
  {
920
+ "epoch": 1.84,
921
+ "grad_norm": 0.9492518901824951,
922
+ "learning_rate": 0.00010816,
923
+ "loss": 0.2704,
924
  "step": 1150
925
  },
926
  {
927
+ "epoch": 1.86,
928
+ "grad_norm": 0.4344118535518646,
929
+ "learning_rate": 0.00010736000000000002,
930
+ "loss": 0.1624,
931
  "step": 1160
932
  },
933
  {
934
+ "epoch": 1.87,
935
+ "grad_norm": 0.4115823209285736,
936
+ "learning_rate": 0.00010656000000000001,
937
+ "loss": 0.2069,
938
  "step": 1170
939
  },
940
  {
941
+ "epoch": 1.89,
942
+ "grad_norm": 0.6738015413284302,
943
+ "learning_rate": 0.00010576000000000002,
944
+ "loss": 0.208,
945
  "step": 1180
946
  },
947
  {
948
+ "epoch": 1.9,
949
+ "grad_norm": 0.9090007543563843,
950
+ "learning_rate": 0.00010496000000000001,
951
+ "loss": 0.1793,
952
  "step": 1190
953
  },
954
  {
955
+ "epoch": 1.92,
956
+ "grad_norm": 1.1480025053024292,
957
+ "learning_rate": 0.00010416000000000002,
958
+ "loss": 0.2013,
959
  "step": 1200
960
  },
961
  {
962
+ "epoch": 1.92,
963
+ "eval_loss": 0.29365527629852295,
964
+ "eval_na_accuracy": 0.941,
965
+ "eval_ordinal_accuracy": 0.5905420991926182,
966
+ "eval_runtime": 39.6842,
967
+ "eval_samples_per_second": 25.199,
968
+ "eval_steps_per_second": 3.15,
969
  "step": 1200
970
  },
971
  {
972
+ "epoch": 1.94,
973
+ "grad_norm": 0.62380051612854,
974
+ "learning_rate": 0.00010336000000000001,
975
+ "loss": 0.192,
976
  "step": 1210
977
  },
978
  {
979
+ "epoch": 1.95,
980
+ "grad_norm": 0.9949710965156555,
981
+ "learning_rate": 0.00010256000000000001,
982
+ "loss": 0.226,
983
  "step": 1220
984
  },
985
  {
986
+ "epoch": 1.97,
987
+ "grad_norm": 1.0634446144104004,
988
+ "learning_rate": 0.00010176000000000002,
989
+ "loss": 0.142,
990
  "step": 1230
991
  },
992
  {
993
+ "epoch": 1.98,
994
+ "grad_norm": 0.8875225782394409,
995
+ "learning_rate": 0.00010096000000000001,
996
+ "loss": 0.1729,
997
  "step": 1240
998
  },
999
  {
1000
+ "epoch": 2.0,
1001
+ "grad_norm": 0.6193259358406067,
1002
+ "learning_rate": 0.00010016,
1003
+ "loss": 0.102,
1004
  "step": 1250
1005
  },
1006
+ {
1007
+ "epoch": 2.02,
1008
+ "grad_norm": 0.4042517840862274,
1009
+ "learning_rate": 9.936000000000001e-05,
1010
+ "loss": 0.0976,
1011
+ "step": 1260
1012
+ },
1013
+ {
1014
+ "epoch": 2.03,
1015
+ "grad_norm": 0.4051195979118347,
1016
+ "learning_rate": 9.856e-05,
1017
+ "loss": 0.1448,
1018
+ "step": 1270
1019
+ },
1020
+ {
1021
+ "epoch": 2.05,
1022
+ "grad_norm": 0.46061789989471436,
1023
+ "learning_rate": 9.776000000000001e-05,
1024
+ "loss": 0.0768,
1025
+ "step": 1280
1026
+ },
1027
+ {
1028
+ "epoch": 2.06,
1029
+ "grad_norm": 0.5934004783630371,
1030
+ "learning_rate": 9.696000000000001e-05,
1031
+ "loss": 0.1404,
1032
+ "step": 1290
1033
+ },
1034
+ {
1035
+ "epoch": 2.08,
1036
+ "grad_norm": 0.6819984316825867,
1037
+ "learning_rate": 9.616e-05,
1038
+ "loss": 0.0663,
1039
+ "step": 1300
1040
+ },
1041
+ {
1042
+ "epoch": 2.08,
1043
+ "eval_loss": 0.2954486608505249,
1044
+ "eval_na_accuracy": 0.938,
1045
+ "eval_ordinal_accuracy": 0.6251441753171857,
1046
+ "eval_runtime": 39.658,
1047
+ "eval_samples_per_second": 25.216,
1048
+ "eval_steps_per_second": 3.152,
1049
+ "step": 1300
1050
+ },
1051
+ {
1052
+ "epoch": 2.1,
1053
+ "grad_norm": 0.5849266052246094,
1054
+ "learning_rate": 9.536000000000001e-05,
1055
+ "loss": 0.1574,
1056
+ "step": 1310
1057
+ },
1058
+ {
1059
+ "epoch": 2.11,
1060
+ "grad_norm": 0.9393780827522278,
1061
+ "learning_rate": 9.456e-05,
1062
+ "loss": 0.0979,
1063
+ "step": 1320
1064
+ },
1065
+ {
1066
+ "epoch": 2.13,
1067
+ "grad_norm": 0.47529059648513794,
1068
+ "learning_rate": 9.376e-05,
1069
+ "loss": 0.1073,
1070
+ "step": 1330
1071
+ },
1072
+ {
1073
+ "epoch": 2.14,
1074
+ "grad_norm": 0.4079722464084625,
1075
+ "learning_rate": 9.296e-05,
1076
+ "loss": 0.0868,
1077
+ "step": 1340
1078
+ },
1079
+ {
1080
+ "epoch": 2.16,
1081
+ "grad_norm": 0.7292589545249939,
1082
+ "learning_rate": 9.216e-05,
1083
+ "loss": 0.1446,
1084
+ "step": 1350
1085
+ },
1086
+ {
1087
+ "epoch": 2.18,
1088
+ "grad_norm": 0.9205511212348938,
1089
+ "learning_rate": 9.136e-05,
1090
+ "loss": 0.0907,
1091
+ "step": 1360
1092
+ },
1093
+ {
1094
+ "epoch": 2.19,
1095
+ "grad_norm": 0.9218105673789978,
1096
+ "learning_rate": 9.056e-05,
1097
+ "loss": 0.1387,
1098
+ "step": 1370
1099
+ },
1100
+ {
1101
+ "epoch": 2.21,
1102
+ "grad_norm": 0.5730422139167786,
1103
+ "learning_rate": 8.976e-05,
1104
+ "loss": 0.0882,
1105
+ "step": 1380
1106
+ },
1107
+ {
1108
+ "epoch": 2.22,
1109
+ "grad_norm": 0.6922823190689087,
1110
+ "learning_rate": 8.896e-05,
1111
+ "loss": 0.0741,
1112
+ "step": 1390
1113
+ },
1114
+ {
1115
+ "epoch": 2.24,
1116
+ "grad_norm": 1.1872971057891846,
1117
+ "learning_rate": 8.816000000000001e-05,
1118
+ "loss": 0.1503,
1119
+ "step": 1400
1120
+ },
1121
+ {
1122
+ "epoch": 2.24,
1123
+ "eval_loss": 0.3187769651412964,
1124
+ "eval_na_accuracy": 0.937,
1125
+ "eval_ordinal_accuracy": 0.5986159169550173,
1126
+ "eval_runtime": 39.7243,
1127
+ "eval_samples_per_second": 25.173,
1128
+ "eval_steps_per_second": 3.147,
1129
+ "step": 1400
1130
+ },
1131
+ {
1132
+ "epoch": 2.26,
1133
+ "grad_norm": 2.3350443840026855,
1134
+ "learning_rate": 8.736e-05,
1135
+ "loss": 0.1083,
1136
+ "step": 1410
1137
+ },
1138
+ {
1139
+ "epoch": 2.27,
1140
+ "grad_norm": 0.8266046643257141,
1141
+ "learning_rate": 8.656000000000001e-05,
1142
+ "loss": 0.0684,
1143
+ "step": 1420
1144
+ },
1145
+ {
1146
+ "epoch": 2.29,
1147
+ "grad_norm": 0.41480687260627747,
1148
+ "learning_rate": 8.576e-05,
1149
+ "loss": 0.0809,
1150
+ "step": 1430
1151
+ },
1152
+ {
1153
+ "epoch": 2.3,
1154
+ "grad_norm": 0.4657377600669861,
1155
+ "learning_rate": 8.496e-05,
1156
+ "loss": 0.0718,
1157
+ "step": 1440
1158
+ },
1159
+ {
1160
+ "epoch": 2.32,
1161
+ "grad_norm": 0.5419800877571106,
1162
+ "learning_rate": 8.416000000000001e-05,
1163
+ "loss": 0.1322,
1164
+ "step": 1450
1165
+ },
1166
+ {
1167
+ "epoch": 2.34,
1168
+ "grad_norm": 1.167611837387085,
1169
+ "learning_rate": 8.336e-05,
1170
+ "loss": 0.1017,
1171
+ "step": 1460
1172
+ },
1173
+ {
1174
+ "epoch": 2.35,
1175
+ "grad_norm": 0.449034184217453,
1176
+ "learning_rate": 8.256000000000001e-05,
1177
+ "loss": 0.0636,
1178
+ "step": 1470
1179
+ },
1180
+ {
1181
+ "epoch": 2.37,
1182
+ "grad_norm": 0.6716451048851013,
1183
+ "learning_rate": 8.176e-05,
1184
+ "loss": 0.1109,
1185
+ "step": 1480
1186
+ },
1187
+ {
1188
+ "epoch": 2.38,
1189
+ "grad_norm": 4.306596755981445,
1190
+ "learning_rate": 8.096e-05,
1191
+ "loss": 0.0898,
1192
+ "step": 1490
1193
+ },
1194
+ {
1195
+ "epoch": 2.4,
1196
+ "grad_norm": 0.41288742423057556,
1197
+ "learning_rate": 8.016e-05,
1198
+ "loss": 0.0611,
1199
+ "step": 1500
1200
+ },
1201
+ {
1202
+ "epoch": 2.4,
1203
+ "eval_loss": 0.33932703733444214,
1204
+ "eval_na_accuracy": 0.945,
1205
+ "eval_ordinal_accuracy": 0.5997693194925029,
1206
+ "eval_runtime": 39.236,
1207
+ "eval_samples_per_second": 25.487,
1208
+ "eval_steps_per_second": 3.186,
1209
+ "step": 1500
1210
+ },
1211
+ {
1212
+ "epoch": 2.42,
1213
+ "grad_norm": 0.7951626181602478,
1214
+ "learning_rate": 7.936e-05,
1215
+ "loss": 0.0799,
1216
+ "step": 1510
1217
+ },
1218
+ {
1219
+ "epoch": 2.43,
1220
+ "grad_norm": 1.0197049379348755,
1221
+ "learning_rate": 7.856000000000001e-05,
1222
+ "loss": 0.0928,
1223
+ "step": 1520
1224
+ },
1225
+ {
1226
+ "epoch": 2.45,
1227
+ "grad_norm": 0.6486759185791016,
1228
+ "learning_rate": 7.776e-05,
1229
+ "loss": 0.0964,
1230
+ "step": 1530
1231
+ },
1232
+ {
1233
+ "epoch": 2.46,
1234
+ "grad_norm": 1.0220657587051392,
1235
+ "learning_rate": 7.696e-05,
1236
+ "loss": 0.1736,
1237
+ "step": 1540
1238
+ },
1239
+ {
1240
+ "epoch": 2.48,
1241
+ "grad_norm": 2.3006441593170166,
1242
+ "learning_rate": 7.616e-05,
1243
+ "loss": 0.0993,
1244
+ "step": 1550
1245
+ },
1246
+ {
1247
+ "epoch": 2.5,
1248
+ "grad_norm": 0.4701670706272125,
1249
+ "learning_rate": 7.536000000000001e-05,
1250
+ "loss": 0.0779,
1251
+ "step": 1560
1252
+ },
1253
+ {
1254
+ "epoch": 2.51,
1255
+ "grad_norm": 0.641832172870636,
1256
+ "learning_rate": 7.456e-05,
1257
+ "loss": 0.0586,
1258
+ "step": 1570
1259
+ },
1260
+ {
1261
+ "epoch": 2.53,
1262
+ "grad_norm": 0.5836305618286133,
1263
+ "learning_rate": 7.376000000000001e-05,
1264
+ "loss": 0.053,
1265
+ "step": 1580
1266
+ },
1267
+ {
1268
+ "epoch": 2.54,
1269
+ "grad_norm": 0.6500815153121948,
1270
+ "learning_rate": 7.296e-05,
1271
+ "loss": 0.0779,
1272
+ "step": 1590
1273
+ },
1274
+ {
1275
+ "epoch": 2.56,
1276
+ "grad_norm": 0.5682386755943298,
1277
+ "learning_rate": 7.216e-05,
1278
+ "loss": 0.0743,
1279
+ "step": 1600
1280
+ },
1281
+ {
1282
+ "epoch": 2.56,
1283
+ "eval_loss": 0.3182476758956909,
1284
+ "eval_na_accuracy": 0.942,
1285
+ "eval_ordinal_accuracy": 0.6482122260668973,
1286
+ "eval_runtime": 40.5507,
1287
+ "eval_samples_per_second": 24.66,
1288
+ "eval_steps_per_second": 3.083,
1289
+ "step": 1600
1290
+ },
1291
+ {
1292
+ "epoch": 2.58,
1293
+ "grad_norm": 0.5527540445327759,
1294
+ "learning_rate": 7.136000000000001e-05,
1295
+ "loss": 0.094,
1296
+ "step": 1610
1297
+ },
1298
+ {
1299
+ "epoch": 2.59,
1300
+ "grad_norm": 0.8710426092147827,
1301
+ "learning_rate": 7.056e-05,
1302
+ "loss": 0.0839,
1303
+ "step": 1620
1304
+ },
1305
+ {
1306
+ "epoch": 2.61,
1307
+ "grad_norm": 0.9312260746955872,
1308
+ "learning_rate": 6.976000000000001e-05,
1309
+ "loss": 0.155,
1310
+ "step": 1630
1311
+ },
1312
+ {
1313
+ "epoch": 2.62,
1314
+ "grad_norm": 0.48695412278175354,
1315
+ "learning_rate": 6.896e-05,
1316
+ "loss": 0.0606,
1317
+ "step": 1640
1318
+ },
1319
+ {
1320
+ "epoch": 2.64,
1321
+ "grad_norm": 0.525652289390564,
1322
+ "learning_rate": 6.816e-05,
1323
+ "loss": 0.0715,
1324
+ "step": 1650
1325
+ },
1326
+ {
1327
+ "epoch": 2.66,
1328
+ "grad_norm": 0.7670960426330566,
1329
+ "learning_rate": 6.736e-05,
1330
+ "loss": 0.1108,
1331
+ "step": 1660
1332
+ },
1333
+ {
1334
+ "epoch": 2.67,
1335
+ "grad_norm": 1.0041375160217285,
1336
+ "learning_rate": 6.656e-05,
1337
+ "loss": 0.1257,
1338
+ "step": 1670
1339
+ },
1340
+ {
1341
+ "epoch": 2.69,
1342
+ "grad_norm": 0.3819805681705475,
1343
+ "learning_rate": 6.576e-05,
1344
+ "loss": 0.1004,
1345
+ "step": 1680
1346
+ },
1347
+ {
1348
+ "epoch": 2.7,
1349
+ "grad_norm": 0.5372006893157959,
1350
+ "learning_rate": 6.496e-05,
1351
+ "loss": 0.0825,
1352
+ "step": 1690
1353
+ },
1354
+ {
1355
+ "epoch": 2.72,
1356
+ "grad_norm": 0.5835949182510376,
1357
+ "learning_rate": 6.416e-05,
1358
+ "loss": 0.0908,
1359
+ "step": 1700
1360
+ },
1361
+ {
1362
+ "epoch": 2.72,
1363
+ "eval_loss": 0.3332485854625702,
1364
+ "eval_na_accuracy": 0.942,
1365
+ "eval_ordinal_accuracy": 0.6482122260668973,
1366
+ "eval_runtime": 39.9642,
1367
+ "eval_samples_per_second": 25.022,
1368
+ "eval_steps_per_second": 3.128,
1369
+ "step": 1700
1370
+ },
1371
+ {
1372
+ "epoch": 2.74,
1373
+ "grad_norm": 0.677947461605072,
1374
+ "learning_rate": 6.336e-05,
1375
+ "loss": 0.1086,
1376
+ "step": 1710
1377
+ },
1378
+ {
1379
+ "epoch": 2.75,
1380
+ "grad_norm": 0.7373325228691101,
1381
+ "learning_rate": 6.256000000000001e-05,
1382
+ "loss": 0.0698,
1383
+ "step": 1720
1384
+ },
1385
+ {
1386
+ "epoch": 2.77,
1387
+ "grad_norm": 0.7738047242164612,
1388
+ "learning_rate": 6.176e-05,
1389
+ "loss": 0.1118,
1390
+ "step": 1730
1391
+ },
1392
+ {
1393
+ "epoch": 2.78,
1394
+ "grad_norm": 2.052891254425049,
1395
+ "learning_rate": 6.0960000000000006e-05,
1396
+ "loss": 0.1002,
1397
+ "step": 1740
1398
+ },
1399
+ {
1400
+ "epoch": 2.8,
1401
+ "grad_norm": 0.26311352849006653,
1402
+ "learning_rate": 6.016000000000001e-05,
1403
+ "loss": 0.0944,
1404
+ "step": 1750
1405
+ },
1406
+ {
1407
+ "epoch": 2.82,
1408
+ "grad_norm": 0.8190409541130066,
1409
+ "learning_rate": 5.936000000000001e-05,
1410
+ "loss": 0.0717,
1411
+ "step": 1760
1412
+ },
1413
+ {
1414
+ "epoch": 2.83,
1415
+ "grad_norm": 0.5824436545372009,
1416
+ "learning_rate": 5.856e-05,
1417
+ "loss": 0.0746,
1418
+ "step": 1770
1419
+ },
1420
+ {
1421
+ "epoch": 2.85,
1422
+ "grad_norm": 0.5489352941513062,
1423
+ "learning_rate": 5.776e-05,
1424
+ "loss": 0.1063,
1425
+ "step": 1780
1426
+ },
1427
+ {
1428
+ "epoch": 2.86,
1429
+ "grad_norm": 0.656225323677063,
1430
+ "learning_rate": 5.6960000000000004e-05,
1431
+ "loss": 0.0763,
1432
+ "step": 1790
1433
+ },
1434
+ {
1435
+ "epoch": 2.88,
1436
+ "grad_norm": 0.8495000600814819,
1437
+ "learning_rate": 5.6160000000000004e-05,
1438
+ "loss": 0.1108,
1439
+ "step": 1800
1440
+ },
1441
+ {
1442
+ "epoch": 2.88,
1443
+ "eval_loss": 0.32561835646629333,
1444
+ "eval_na_accuracy": 0.943,
1445
+ "eval_ordinal_accuracy": 0.6459054209919262,
1446
+ "eval_runtime": 39.4673,
1447
+ "eval_samples_per_second": 25.337,
1448
+ "eval_steps_per_second": 3.167,
1449
+ "step": 1800
1450
+ },
1451
+ {
1452
+ "epoch": 2.9,
1453
+ "grad_norm": 0.617258608341217,
1454
+ "learning_rate": 5.536e-05,
1455
+ "loss": 0.1203,
1456
+ "step": 1810
1457
+ },
1458
+ {
1459
+ "epoch": 2.91,
1460
+ "grad_norm": 0.4484919011592865,
1461
+ "learning_rate": 5.456e-05,
1462
+ "loss": 0.0573,
1463
+ "step": 1820
1464
+ },
1465
+ {
1466
+ "epoch": 2.93,
1467
+ "grad_norm": 0.533388614654541,
1468
+ "learning_rate": 5.376e-05,
1469
+ "loss": 0.0762,
1470
+ "step": 1830
1471
+ },
1472
+ {
1473
+ "epoch": 2.94,
1474
+ "grad_norm": 0.4078121483325958,
1475
+ "learning_rate": 5.296e-05,
1476
+ "loss": 0.0643,
1477
+ "step": 1840
1478
+ },
1479
+ {
1480
+ "epoch": 2.96,
1481
+ "grad_norm": 0.5678732395172119,
1482
+ "learning_rate": 5.2159999999999995e-05,
1483
+ "loss": 0.1126,
1484
+ "step": 1850
1485
+ },
1486
+ {
1487
+ "epoch": 2.98,
1488
+ "grad_norm": 0.6543716192245483,
1489
+ "learning_rate": 5.1359999999999996e-05,
1490
+ "loss": 0.0763,
1491
+ "step": 1860
1492
+ },
1493
+ {
1494
+ "epoch": 2.99,
1495
+ "grad_norm": 0.6005885601043701,
1496
+ "learning_rate": 5.056000000000001e-05,
1497
+ "loss": 0.0949,
1498
+ "step": 1870
1499
+ },
1500
+ {
1501
+ "epoch": 3.01,
1502
+ "grad_norm": 0.4467845559120178,
1503
+ "learning_rate": 4.976e-05,
1504
+ "loss": 0.0481,
1505
+ "step": 1880
1506
+ },
1507
+ {
1508
+ "epoch": 3.02,
1509
+ "grad_norm": 0.48746606707572937,
1510
+ "learning_rate": 4.896e-05,
1511
+ "loss": 0.0415,
1512
+ "step": 1890
1513
+ },
1514
+ {
1515
+ "epoch": 3.04,
1516
+ "grad_norm": 0.7011713981628418,
1517
+ "learning_rate": 4.816e-05,
1518
+ "loss": 0.0786,
1519
+ "step": 1900
1520
+ },
1521
+ {
1522
+ "epoch": 3.04,
1523
+ "eval_loss": 0.3222349286079407,
1524
+ "eval_na_accuracy": 0.944,
1525
+ "eval_ordinal_accuracy": 0.6539792387543253,
1526
+ "eval_runtime": 39.2271,
1527
+ "eval_samples_per_second": 25.493,
1528
+ "eval_steps_per_second": 3.187,
1529
+ "step": 1900
1530
+ },
1531
+ {
1532
+ "epoch": 3.06,
1533
+ "grad_norm": 0.5153183341026306,
1534
+ "learning_rate": 4.736000000000001e-05,
1535
+ "loss": 0.0387,
1536
+ "step": 1910
1537
+ },
1538
+ {
1539
+ "epoch": 3.07,
1540
+ "grad_norm": 0.43751129508018494,
1541
+ "learning_rate": 4.656e-05,
1542
+ "loss": 0.0364,
1543
+ "step": 1920
1544
+ },
1545
+ {
1546
+ "epoch": 3.09,
1547
+ "grad_norm": 0.7584701776504517,
1548
+ "learning_rate": 4.576e-05,
1549
+ "loss": 0.0397,
1550
+ "step": 1930
1551
+ },
1552
+ {
1553
+ "epoch": 3.1,
1554
+ "grad_norm": 0.20170661807060242,
1555
+ "learning_rate": 4.496e-05,
1556
+ "loss": 0.0288,
1557
+ "step": 1940
1558
+ },
1559
+ {
1560
+ "epoch": 3.12,
1561
+ "grad_norm": 0.2583639919757843,
1562
+ "learning_rate": 4.4160000000000004e-05,
1563
+ "loss": 0.0302,
1564
+ "step": 1950
1565
+ },
1566
+ {
1567
+ "epoch": 3.14,
1568
+ "grad_norm": 3.9720704555511475,
1569
+ "learning_rate": 4.336e-05,
1570
+ "loss": 0.0484,
1571
+ "step": 1960
1572
+ },
1573
+ {
1574
+ "epoch": 3.15,
1575
+ "grad_norm": 0.3367606997489929,
1576
+ "learning_rate": 4.256e-05,
1577
+ "loss": 0.0387,
1578
+ "step": 1970
1579
+ },
1580
+ {
1581
+ "epoch": 3.17,
1582
+ "grad_norm": 0.7610962986946106,
1583
+ "learning_rate": 4.176000000000001e-05,
1584
+ "loss": 0.0424,
1585
+ "step": 1980
1586
+ },
1587
+ {
1588
+ "epoch": 3.18,
1589
+ "grad_norm": 0.6901140213012695,
1590
+ "learning_rate": 4.096e-05,
1591
+ "loss": 0.035,
1592
+ "step": 1990
1593
+ },
1594
+ {
1595
+ "epoch": 3.2,
1596
+ "grad_norm": 0.2873363196849823,
1597
+ "learning_rate": 4.016e-05,
1598
+ "loss": 0.043,
1599
+ "step": 2000
1600
+ },
1601
+ {
1602
+ "epoch": 3.2,
1603
+ "eval_loss": 0.35012441873550415,
1604
+ "eval_na_accuracy": 0.941,
1605
+ "eval_ordinal_accuracy": 0.6482122260668973,
1606
+ "eval_runtime": 39.427,
1607
+ "eval_samples_per_second": 25.363,
1608
+ "eval_steps_per_second": 3.17,
1609
+ "step": 2000
1610
+ },
1611
+ {
1612
+ "epoch": 3.22,
1613
+ "grad_norm": 0.5277103781700134,
1614
+ "learning_rate": 3.936e-05,
1615
+ "loss": 0.0423,
1616
+ "step": 2010
1617
+ },
1618
+ {
1619
+ "epoch": 3.23,
1620
+ "grad_norm": 0.3088182508945465,
1621
+ "learning_rate": 3.8560000000000004e-05,
1622
+ "loss": 0.0305,
1623
+ "step": 2020
1624
+ },
1625
+ {
1626
+ "epoch": 3.25,
1627
+ "grad_norm": 0.3621159791946411,
1628
+ "learning_rate": 3.776e-05,
1629
+ "loss": 0.0398,
1630
+ "step": 2030
1631
+ },
1632
+ {
1633
+ "epoch": 3.26,
1634
+ "grad_norm": 0.6761226654052734,
1635
+ "learning_rate": 3.696e-05,
1636
+ "loss": 0.0606,
1637
+ "step": 2040
1638
+ },
1639
+ {
1640
+ "epoch": 3.28,
1641
+ "grad_norm": 0.9860779047012329,
1642
+ "learning_rate": 3.616e-05,
1643
+ "loss": 0.0437,
1644
+ "step": 2050
1645
+ },
1646
+ {
1647
+ "epoch": 3.3,
1648
+ "grad_norm": 0.6743874549865723,
1649
+ "learning_rate": 3.536000000000001e-05,
1650
+ "loss": 0.0328,
1651
+ "step": 2060
1652
+ },
1653
+ {
1654
+ "epoch": 3.31,
1655
+ "grad_norm": 1.5928354263305664,
1656
+ "learning_rate": 3.456e-05,
1657
+ "loss": 0.0261,
1658
+ "step": 2070
1659
+ },
1660
+ {
1661
+ "epoch": 3.33,
1662
+ "grad_norm": 0.9067389965057373,
1663
+ "learning_rate": 3.376e-05,
1664
+ "loss": 0.0268,
1665
+ "step": 2080
1666
+ },
1667
+ {
1668
+ "epoch": 3.34,
1669
+ "grad_norm": 0.5733221173286438,
1670
+ "learning_rate": 3.296e-05,
1671
+ "loss": 0.0335,
1672
+ "step": 2090
1673
+ },
1674
+ {
1675
+ "epoch": 3.36,
1676
+ "grad_norm": 0.7042862772941589,
1677
+ "learning_rate": 3.2160000000000004e-05,
1678
+ "loss": 0.0472,
1679
+ "step": 2100
1680
+ },
1681
+ {
1682
+ "epoch": 3.36,
1683
+ "eval_loss": 0.34554365277290344,
1684
+ "eval_na_accuracy": 0.943,
1685
+ "eval_ordinal_accuracy": 0.6608996539792388,
1686
+ "eval_runtime": 40.1655,
1687
+ "eval_samples_per_second": 24.897,
1688
+ "eval_steps_per_second": 3.112,
1689
+ "step": 2100
1690
+ },
1691
+ {
1692
+ "epoch": 3.38,
1693
+ "grad_norm": 0.8036783933639526,
1694
+ "learning_rate": 3.136e-05,
1695
+ "loss": 0.0402,
1696
+ "step": 2110
1697
+ },
1698
+ {
1699
+ "epoch": 3.39,
1700
+ "grad_norm": 0.4863825738430023,
1701
+ "learning_rate": 3.056e-05,
1702
+ "loss": 0.0507,
1703
+ "step": 2120
1704
+ },
1705
+ {
1706
+ "epoch": 3.41,
1707
+ "grad_norm": 0.5171158313751221,
1708
+ "learning_rate": 2.976e-05,
1709
+ "loss": 0.0333,
1710
+ "step": 2130
1711
+ },
1712
+ {
1713
+ "epoch": 3.42,
1714
+ "grad_norm": 0.21965381503105164,
1715
+ "learning_rate": 2.8960000000000004e-05,
1716
+ "loss": 0.0277,
1717
+ "step": 2140
1718
+ },
1719
+ {
1720
+ "epoch": 3.44,
1721
+ "grad_norm": 0.20841450989246368,
1722
+ "learning_rate": 2.816e-05,
1723
+ "loss": 0.0259,
1724
+ "step": 2150
1725
+ },
1726
+ {
1727
+ "epoch": 3.46,
1728
+ "grad_norm": 0.5015869736671448,
1729
+ "learning_rate": 2.7360000000000002e-05,
1730
+ "loss": 0.0316,
1731
+ "step": 2160
1732
+ },
1733
+ {
1734
+ "epoch": 3.47,
1735
+ "grad_norm": 0.7938678860664368,
1736
+ "learning_rate": 2.6560000000000003e-05,
1737
+ "loss": 0.0301,
1738
+ "step": 2170
1739
+ },
1740
+ {
1741
+ "epoch": 3.49,
1742
+ "grad_norm": 0.44840845465660095,
1743
+ "learning_rate": 2.576e-05,
1744
+ "loss": 0.0406,
1745
+ "step": 2180
1746
+ },
1747
+ {
1748
+ "epoch": 3.5,
1749
+ "grad_norm": 0.35510167479515076,
1750
+ "learning_rate": 2.496e-05,
1751
+ "loss": 0.0268,
1752
+ "step": 2190
1753
+ },
1754
+ {
1755
+ "epoch": 3.52,
1756
+ "grad_norm": 0.37328681349754333,
1757
+ "learning_rate": 2.4160000000000002e-05,
1758
+ "loss": 0.032,
1759
+ "step": 2200
1760
+ },
1761
+ {
1762
+ "epoch": 3.52,
1763
+ "eval_loss": 0.35616353154182434,
1764
+ "eval_na_accuracy": 0.94,
1765
+ "eval_ordinal_accuracy": 0.6516724336793541,
1766
+ "eval_runtime": 40.3825,
1767
+ "eval_samples_per_second": 24.763,
1768
+ "eval_steps_per_second": 3.095,
1769
+ "step": 2200
1770
+ },
1771
+ {
1772
+ "epoch": 3.54,
1773
+ "grad_norm": 0.24070143699645996,
1774
+ "learning_rate": 2.336e-05,
1775
+ "loss": 0.0262,
1776
+ "step": 2210
1777
+ },
1778
+ {
1779
+ "epoch": 3.55,
1780
+ "grad_norm": 1.0428861379623413,
1781
+ "learning_rate": 2.256e-05,
1782
+ "loss": 0.04,
1783
+ "step": 2220
1784
+ },
1785
+ {
1786
+ "epoch": 3.57,
1787
+ "grad_norm": 0.626348614692688,
1788
+ "learning_rate": 2.176e-05,
1789
+ "loss": 0.0275,
1790
+ "step": 2230
1791
+ },
1792
+ {
1793
+ "epoch": 3.58,
1794
+ "grad_norm": 0.47826460003852844,
1795
+ "learning_rate": 2.0960000000000003e-05,
1796
+ "loss": 0.0379,
1797
+ "step": 2240
1798
+ },
1799
+ {
1800
+ "epoch": 3.6,
1801
+ "grad_norm": 2.685340166091919,
1802
+ "learning_rate": 2.016e-05,
1803
+ "loss": 0.047,
1804
+ "step": 2250
1805
+ },
1806
+ {
1807
+ "epoch": 3.62,
1808
+ "grad_norm": 0.1495877057313919,
1809
+ "learning_rate": 1.936e-05,
1810
+ "loss": 0.0389,
1811
+ "step": 2260
1812
+ },
1813
+ {
1814
+ "epoch": 3.63,
1815
+ "grad_norm": 0.5789759755134583,
1816
+ "learning_rate": 1.856e-05,
1817
+ "loss": 0.0337,
1818
+ "step": 2270
1819
+ },
1820
+ {
1821
+ "epoch": 3.65,
1822
+ "grad_norm": 0.4255303144454956,
1823
+ "learning_rate": 1.7760000000000003e-05,
1824
+ "loss": 0.0282,
1825
+ "step": 2280
1826
+ },
1827
+ {
1828
+ "epoch": 3.66,
1829
+ "grad_norm": 0.7483348846435547,
1830
+ "learning_rate": 1.696e-05,
1831
+ "loss": 0.0668,
1832
+ "step": 2290
1833
+ },
1834
+ {
1835
+ "epoch": 3.68,
1836
+ "grad_norm": 0.23885439336299896,
1837
+ "learning_rate": 1.616e-05,
1838
+ "loss": 0.0434,
1839
+ "step": 2300
1840
+ },
1841
+ {
1842
+ "epoch": 3.68,
1843
+ "eval_loss": 0.34990155696868896,
1844
+ "eval_na_accuracy": 0.94,
1845
+ "eval_ordinal_accuracy": 0.6597462514417531,
1846
+ "eval_runtime": 40.407,
1847
+ "eval_samples_per_second": 24.748,
1848
+ "eval_steps_per_second": 3.094,
1849
+ "step": 2300
1850
+ },
1851
+ {
1852
+ "epoch": 3.7,
1853
+ "grad_norm": 0.3341818153858185,
1854
+ "learning_rate": 1.536e-05,
1855
+ "loss": 0.0358,
1856
+ "step": 2310
1857
+ },
1858
+ {
1859
+ "epoch": 3.71,
1860
+ "grad_norm": 0.6008884310722351,
1861
+ "learning_rate": 1.4560000000000001e-05,
1862
+ "loss": 0.0394,
1863
+ "step": 2320
1864
+ },
1865
+ {
1866
+ "epoch": 3.73,
1867
+ "grad_norm": 0.3966546654701233,
1868
+ "learning_rate": 1.376e-05,
1869
+ "loss": 0.0346,
1870
+ "step": 2330
1871
+ },
1872
+ {
1873
+ "epoch": 3.74,
1874
+ "grad_norm": 0.46933791041374207,
1875
+ "learning_rate": 1.296e-05,
1876
+ "loss": 0.0227,
1877
+ "step": 2340
1878
+ },
1879
+ {
1880
+ "epoch": 3.76,
1881
+ "grad_norm": 0.6652282476425171,
1882
+ "learning_rate": 1.216e-05,
1883
+ "loss": 0.0393,
1884
+ "step": 2350
1885
+ },
1886
+ {
1887
+ "epoch": 3.78,
1888
+ "grad_norm": 0.23938482999801636,
1889
+ "learning_rate": 1.1360000000000001e-05,
1890
+ "loss": 0.0267,
1891
+ "step": 2360
1892
+ },
1893
+ {
1894
+ "epoch": 3.79,
1895
+ "grad_norm": 0.6050881147384644,
1896
+ "learning_rate": 1.056e-05,
1897
+ "loss": 0.0287,
1898
+ "step": 2370
1899
+ },
1900
+ {
1901
+ "epoch": 3.81,
1902
+ "grad_norm": 0.22671189904212952,
1903
+ "learning_rate": 9.760000000000001e-06,
1904
+ "loss": 0.0491,
1905
+ "step": 2380
1906
+ },
1907
+ {
1908
+ "epoch": 3.82,
1909
+ "grad_norm": 0.5296955704689026,
1910
+ "learning_rate": 8.96e-06,
1911
+ "loss": 0.0266,
1912
+ "step": 2390
1913
+ },
1914
+ {
1915
+ "epoch": 3.84,
1916
+ "grad_norm": 0.5424560308456421,
1917
+ "learning_rate": 8.160000000000001e-06,
1918
+ "loss": 0.0341,
1919
+ "step": 2400
1920
+ },
1921
+ {
1922
+ "epoch": 3.84,
1923
+ "eval_loss": 0.3610887825489044,
1924
+ "eval_na_accuracy": 0.94,
1925
+ "eval_ordinal_accuracy": 0.6482122260668973,
1926
+ "eval_runtime": 40.741,
1927
+ "eval_samples_per_second": 24.545,
1928
+ "eval_steps_per_second": 3.068,
1929
+ "step": 2400
1930
+ },
1931
+ {
1932
+ "epoch": 3.86,
1933
+ "grad_norm": 0.4790421724319458,
1934
+ "learning_rate": 7.36e-06,
1935
+ "loss": 0.0319,
1936
+ "step": 2410
1937
+ },
1938
+ {
1939
+ "epoch": 3.87,
1940
+ "grad_norm": 0.4021483063697815,
1941
+ "learning_rate": 6.560000000000001e-06,
1942
+ "loss": 0.0551,
1943
+ "step": 2420
1944
+ },
1945
+ {
1946
+ "epoch": 3.89,
1947
+ "grad_norm": 0.43051794171333313,
1948
+ "learning_rate": 5.76e-06,
1949
+ "loss": 0.0281,
1950
+ "step": 2430
1951
+ },
1952
+ {
1953
+ "epoch": 3.9,
1954
+ "grad_norm": 0.23781944811344147,
1955
+ "learning_rate": 4.96e-06,
1956
+ "loss": 0.0306,
1957
+ "step": 2440
1958
+ },
1959
+ {
1960
+ "epoch": 3.92,
1961
+ "grad_norm": 0.6060004234313965,
1962
+ "learning_rate": 4.16e-06,
1963
+ "loss": 0.0326,
1964
+ "step": 2450
1965
+ },
1966
+ {
1967
+ "epoch": 3.94,
1968
+ "grad_norm": 0.5149852633476257,
1969
+ "learning_rate": 3.36e-06,
1970
+ "loss": 0.0266,
1971
+ "step": 2460
1972
+ },
1973
+ {
1974
+ "epoch": 3.95,
1975
+ "grad_norm": 0.579931914806366,
1976
+ "learning_rate": 2.56e-06,
1977
+ "loss": 0.0236,
1978
+ "step": 2470
1979
+ },
1980
+ {
1981
+ "epoch": 3.97,
1982
+ "grad_norm": 0.14379101991653442,
1983
+ "learning_rate": 1.76e-06,
1984
+ "loss": 0.0221,
1985
+ "step": 2480
1986
+ },
1987
+ {
1988
+ "epoch": 3.98,
1989
+ "grad_norm": 0.5184658765792847,
1990
+ "learning_rate": 9.6e-07,
1991
+ "loss": 0.0281,
1992
+ "step": 2490
1993
+ },
1994
+ {
1995
+ "epoch": 4.0,
1996
+ "grad_norm": 0.5299363732337952,
1997
+ "learning_rate": 1.6e-07,
1998
+ "loss": 0.0305,
1999
+ "step": 2500
2000
+ },
2001
+ {
2002
+ "epoch": 4.0,
2003
+ "eval_loss": 0.36354970932006836,
2004
+ "eval_na_accuracy": 0.939,
2005
+ "eval_ordinal_accuracy": 0.6608996539792388,
2006
+ "eval_runtime": 40.233,
2007
+ "eval_samples_per_second": 24.855,
2008
+ "eval_steps_per_second": 3.107,
2009
+ "step": 2500
2010
+ },
2011
  {
2012
  "epoch": 4.0,
2013
+ "step": 2500,
2014
+ "total_flos": 3.0997907103744e+18,
2015
+ "train_loss": 0.15650403581261635,
2016
+ "train_runtime": 3981.9696,
2017
+ "train_samples_per_second": 10.045,
2018
+ "train_steps_per_second": 0.628
2019
  }
2020
  ],
2021
  "logging_steps": 10,
2022
+ "max_steps": 2500,
2023
  "num_input_tokens_seen": 0,
2024
  "num_train_epochs": 4,
2025
  "save_steps": 100,
2026
+ "total_flos": 3.0997907103744e+18,
2027
  "train_batch_size": 16,
2028
  "trial_name": null,
2029
  "trial_params": null