rshrott commited on
Commit
2a7c83b
1 Parent(s): a77ede2

🍻 cheers

Browse files
README.md CHANGED
@@ -2,6 +2,7 @@
2
  license: apache-2.0
3
  base_model: google/vit-base-patch16-224-in21k
4
  tags:
 
5
  - generated_from_trainer
6
  model-index:
7
  - name: ryan_model3272024
@@ -13,12 +14,12 @@ should probably proofread and complete it, then remove this comment. -->
13
 
14
  # ryan_model3272024
15
 
16
- This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the None dataset.
17
  It achieves the following results on the evaluation set:
18
- - Loss: 0.2868
19
  - Na Accuracy: 0.95
20
- - Ordinal Accuracy: 0.6711
21
- - Ordinal Mae: 1.1626
22
 
23
  ## Model description
24
 
 
2
  license: apache-2.0
3
  base_model: google/vit-base-patch16-224-in21k
4
  tags:
5
+ - image-classification
6
  - generated_from_trainer
7
  model-index:
8
  - name: ryan_model3272024
 
14
 
15
  # ryan_model3272024
16
 
17
+ This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the beans dataset.
18
  It achieves the following results on the evaluation set:
19
+ - Loss: 0.2552
20
  - Na Accuracy: 0.95
21
+ - Ordinal Accuracy: 0.6267
22
+ - Ordinal Mae: 1.1586
23
 
24
  ## Model description
25
 
all_results.json CHANGED
@@ -1,14 +1,14 @@
1
  {
2
  "epoch": 3.83,
3
- "eval_loss": 1.0327022075653076,
4
- "eval_na_accuracy": 0.956,
5
- "eval_ordinal_accuracy": 0.568,
6
- "eval_ordinal_mae": 56.34840202213265,
7
- "eval_runtime": 20.4149,
8
- "eval_samples_per_second": 24.492,
9
- "eval_steps_per_second": 3.086,
10
- "train_loss": 0.6305676238735517,
11
- "train_runtime": 1636.5913,
12
- "train_samples_per_second": 12.221,
13
- "train_steps_per_second": 0.765
14
  }
 
1
  {
2
  "epoch": 3.83,
3
+ "eval_loss": 0.2552177309989929,
4
+ "eval_na_accuracy": 0.95,
5
+ "eval_ordinal_accuracy": 0.6266666666666667,
6
+ "eval_ordinal_mae": 1.158560517811113,
7
+ "eval_runtime": 19.6745,
8
+ "eval_samples_per_second": 25.414,
9
+ "eval_steps_per_second": 3.202,
10
+ "train_loss": 0.17935538868109385,
11
+ "train_runtime": 1702.5744,
12
+ "train_samples_per_second": 11.747,
13
+ "train_steps_per_second": 0.735
14
  }
eval_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "epoch": 3.83,
3
- "eval_loss": 1.0327022075653076,
4
- "eval_na_accuracy": 0.956,
5
- "eval_ordinal_accuracy": 0.568,
6
- "eval_ordinal_mae": 56.34840202213265,
7
- "eval_runtime": 20.4149,
8
- "eval_samples_per_second": 24.492,
9
- "eval_steps_per_second": 3.086
10
  }
 
1
  {
2
  "epoch": 3.83,
3
+ "eval_loss": 0.2552177309989929,
4
+ "eval_na_accuracy": 0.95,
5
+ "eval_ordinal_accuracy": 0.6266666666666667,
6
+ "eval_ordinal_mae": 1.158560517811113,
7
+ "eval_runtime": 19.6745,
8
+ "eval_samples_per_second": 25.414,
9
+ "eval_steps_per_second": 3.202
10
  }
runs/Mar27_20-06-26_ryanserver/events.out.tfevents.1711586196.ryanserver.24064.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7608e0bba49dd02a3edba076af19d5783dcf52171b4e0113a42f37cdb474c484
3
+ size 529
train_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "epoch": 3.83,
3
- "train_loss": 0.6305676238735517,
4
- "train_runtime": 1636.5913,
5
- "train_samples_per_second": 12.221,
6
- "train_steps_per_second": 0.765
7
  }
 
1
  {
2
  "epoch": 3.83,
3
+ "train_loss": 0.17935538868109385,
4
+ "train_runtime": 1702.5744,
5
+ "train_samples_per_second": 11.747,
6
+ "train_steps_per_second": 0.735
7
  }
trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "best_metric": 1.0327022075653076,
3
- "best_model_checkpoint": "./ryan_model3272024/checkpoint-500",
4
  "epoch": 3.8338658146964857,
5
  "eval_steps": 100,
6
  "global_step": 1200,
@@ -10,984 +10,984 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.03,
13
- "grad_norm": 2.4977924823760986,
14
  "learning_rate": 0.00019840255591054313,
15
- "loss": 1.9988,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.06,
20
- "grad_norm": 2.3356950283050537,
21
  "learning_rate": 0.00019680511182108628,
22
- "loss": 1.6981,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.1,
27
- "grad_norm": 2.1436116695404053,
28
  "learning_rate": 0.0001952076677316294,
29
- "loss": 1.6585,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.13,
34
- "grad_norm": 1.9430339336395264,
35
  "learning_rate": 0.00019361022364217253,
36
- "loss": 1.6929,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.16,
41
- "grad_norm": 1.7540597915649414,
42
  "learning_rate": 0.00019201277955271565,
43
- "loss": 1.4822,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.19,
48
- "grad_norm": 2.9073445796966553,
49
  "learning_rate": 0.0001904153354632588,
50
- "loss": 1.5968,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.22,
55
- "grad_norm": 1.6934313774108887,
56
  "learning_rate": 0.00018881789137380192,
57
- "loss": 1.4684,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.26,
62
- "grad_norm": 3.451613426208496,
63
  "learning_rate": 0.00018722044728434505,
64
- "loss": 1.4425,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.29,
69
- "grad_norm": 1.747863531112671,
70
  "learning_rate": 0.0001856230031948882,
71
- "loss": 1.3641,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.32,
76
- "grad_norm": 2.263026714324951,
77
  "learning_rate": 0.00018402555910543132,
78
- "loss": 1.3578,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.32,
83
- "eval_loss": 1.2057933807373047,
84
- "eval_na_accuracy": 0.944,
85
- "eval_ordinal_accuracy": 0.54,
86
- "eval_ordinal_mae": 90.9845891143661,
87
- "eval_runtime": 46.445,
88
- "eval_samples_per_second": 10.765,
89
- "eval_steps_per_second": 1.356,
90
  "step": 100
91
  },
92
  {
93
  "epoch": 0.35,
94
- "grad_norm": 2.7085518836975098,
95
  "learning_rate": 0.00018242811501597444,
96
- "loss": 1.3463,
97
  "step": 110
98
  },
99
  {
100
  "epoch": 0.38,
101
- "grad_norm": 2.2408878803253174,
102
  "learning_rate": 0.00018083067092651756,
103
- "loss": 1.1704,
104
  "step": 120
105
  },
106
  {
107
  "epoch": 0.42,
108
- "grad_norm": 2.1588001251220703,
109
  "learning_rate": 0.00017923322683706071,
110
- "loss": 1.24,
111
  "step": 130
112
  },
113
  {
114
  "epoch": 0.45,
115
- "grad_norm": 2.582127094268799,
116
  "learning_rate": 0.00017763578274760384,
117
- "loss": 1.2735,
118
  "step": 140
119
  },
120
  {
121
  "epoch": 0.48,
122
- "grad_norm": 1.9934537410736084,
123
  "learning_rate": 0.000176038338658147,
124
- "loss": 1.344,
125
  "step": 150
126
  },
127
  {
128
  "epoch": 0.51,
129
- "grad_norm": 2.1636180877685547,
130
  "learning_rate": 0.0001744408945686901,
131
- "loss": 1.1532,
132
  "step": 160
133
  },
134
  {
135
  "epoch": 0.54,
136
- "grad_norm": 3.9455337524414062,
137
  "learning_rate": 0.00017284345047923323,
138
- "loss": 1.1231,
139
  "step": 170
140
  },
141
  {
142
  "epoch": 0.58,
143
- "grad_norm": 3.46232271194458,
144
  "learning_rate": 0.00017124600638977638,
145
- "loss": 1.0945,
146
  "step": 180
147
  },
148
  {
149
  "epoch": 0.61,
150
- "grad_norm": 2.913996696472168,
151
  "learning_rate": 0.00016964856230031948,
152
- "loss": 1.3168,
153
  "step": 190
154
  },
155
  {
156
  "epoch": 0.64,
157
- "grad_norm": 2.16892671585083,
158
  "learning_rate": 0.00016805111821086263,
159
- "loss": 1.089,
160
  "step": 200
161
  },
162
  {
163
  "epoch": 0.64,
164
- "eval_loss": 1.0986785888671875,
165
- "eval_na_accuracy": 0.95,
166
- "eval_ordinal_accuracy": 0.548,
167
- "eval_ordinal_mae": 120.70968111835792,
168
- "eval_runtime": 20.2841,
169
- "eval_samples_per_second": 24.65,
170
- "eval_steps_per_second": 3.106,
171
  "step": 200
172
  },
173
  {
174
  "epoch": 0.67,
175
- "grad_norm": 2.9793784618377686,
176
  "learning_rate": 0.00016645367412140575,
177
- "loss": 1.1788,
178
  "step": 210
179
  },
180
  {
181
  "epoch": 0.7,
182
- "grad_norm": 2.662822723388672,
183
  "learning_rate": 0.0001648562300319489,
184
- "loss": 1.0067,
185
  "step": 220
186
  },
187
  {
188
  "epoch": 0.73,
189
- "grad_norm": 2.3921396732330322,
190
  "learning_rate": 0.00016325878594249202,
191
- "loss": 1.2542,
192
  "step": 230
193
  },
194
  {
195
  "epoch": 0.77,
196
- "grad_norm": 2.670137882232666,
197
  "learning_rate": 0.00016166134185303515,
198
- "loss": 1.1284,
199
  "step": 240
200
  },
201
  {
202
  "epoch": 0.8,
203
- "grad_norm": 3.212069511413574,
204
  "learning_rate": 0.0001600638977635783,
205
- "loss": 1.3407,
206
  "step": 250
207
  },
208
  {
209
  "epoch": 0.83,
210
- "grad_norm": 2.284740447998047,
211
  "learning_rate": 0.00015846645367412142,
212
- "loss": 1.1876,
213
  "step": 260
214
  },
215
  {
216
  "epoch": 0.86,
217
- "grad_norm": 2.98988938331604,
218
  "learning_rate": 0.00015686900958466454,
219
- "loss": 1.0993,
220
  "step": 270
221
  },
222
  {
223
  "epoch": 0.89,
224
- "grad_norm": 3.687721014022827,
225
  "learning_rate": 0.00015527156549520767,
226
- "loss": 1.1273,
227
  "step": 280
228
  },
229
  {
230
  "epoch": 0.93,
231
- "grad_norm": 2.0068211555480957,
232
  "learning_rate": 0.00015367412140575082,
233
- "loss": 1.0204,
234
  "step": 290
235
  },
236
  {
237
  "epoch": 0.96,
238
- "grad_norm": 2.719209909439087,
239
  "learning_rate": 0.00015207667731629394,
240
- "loss": 0.924,
241
  "step": 300
242
  },
243
  {
244
  "epoch": 0.96,
245
- "eval_loss": 1.083807110786438,
246
  "eval_na_accuracy": 0.946,
247
- "eval_ordinal_accuracy": 0.568,
248
- "eval_ordinal_mae": 76.79816162186209,
249
- "eval_runtime": 20.3196,
250
- "eval_samples_per_second": 24.607,
251
- "eval_steps_per_second": 3.1,
252
  "step": 300
253
  },
254
  {
255
  "epoch": 0.99,
256
- "grad_norm": 6.174163341522217,
257
- "learning_rate": 0.00015063897763578277,
258
- "loss": 1.0079,
259
  "step": 310
260
  },
261
  {
262
  "epoch": 1.02,
263
- "grad_norm": 1.8171716928482056,
264
- "learning_rate": 0.0001490415335463259,
265
- "loss": 0.9573,
266
  "step": 320
267
  },
268
  {
269
  "epoch": 1.05,
270
- "grad_norm": 1.7882392406463623,
271
- "learning_rate": 0.00014744408945686902,
272
- "loss": 0.8008,
273
  "step": 330
274
  },
275
  {
276
  "epoch": 1.09,
277
- "grad_norm": 2.8711063861846924,
278
- "learning_rate": 0.00014584664536741214,
279
- "loss": 0.7867,
280
  "step": 340
281
  },
282
  {
283
  "epoch": 1.12,
284
- "grad_norm": 3.8461153507232666,
285
- "learning_rate": 0.00014424920127795526,
286
- "loss": 0.7375,
287
  "step": 350
288
  },
289
  {
290
  "epoch": 1.15,
291
- "grad_norm": 3.599330186843872,
292
- "learning_rate": 0.0001426517571884984,
293
- "loss": 0.8057,
294
  "step": 360
295
  },
296
  {
297
  "epoch": 1.18,
298
- "grad_norm": 1.9864487648010254,
299
- "learning_rate": 0.00014105431309904153,
300
- "loss": 0.7876,
301
  "step": 370
302
  },
303
  {
304
  "epoch": 1.21,
305
- "grad_norm": 3.377739191055298,
306
- "learning_rate": 0.00013945686900958468,
307
- "loss": 0.7835,
308
  "step": 380
309
  },
310
  {
311
  "epoch": 1.25,
312
- "grad_norm": 3.5138072967529297,
313
- "learning_rate": 0.0001378594249201278,
314
- "loss": 0.8265,
315
  "step": 390
316
  },
317
  {
318
  "epoch": 1.28,
319
- "grad_norm": 3.5262341499328613,
320
- "learning_rate": 0.00013626198083067093,
321
- "loss": 0.694,
322
  "step": 400
323
  },
324
  {
325
  "epoch": 1.28,
326
- "eval_loss": 1.068002700805664,
327
- "eval_na_accuracy": 0.942,
328
- "eval_ordinal_accuracy": 0.556,
329
- "eval_ordinal_mae": 105.63124973691814,
330
- "eval_runtime": 19.9749,
331
- "eval_samples_per_second": 25.031,
332
- "eval_steps_per_second": 3.154,
333
  "step": 400
334
  },
335
  {
336
  "epoch": 1.31,
337
- "grad_norm": 2.814100980758667,
338
- "learning_rate": 0.00013466453674121405,
339
- "loss": 0.7246,
340
  "step": 410
341
  },
342
  {
343
  "epoch": 1.34,
344
- "grad_norm": 1.9095064401626587,
345
- "learning_rate": 0.00013306709265175718,
346
- "loss": 0.768,
347
  "step": 420
348
  },
349
  {
350
  "epoch": 1.37,
351
- "grad_norm": 2.849485158920288,
352
- "learning_rate": 0.00013146964856230033,
353
- "loss": 0.6669,
354
  "step": 430
355
  },
356
  {
357
  "epoch": 1.41,
358
- "grad_norm": 2.2935235500335693,
359
- "learning_rate": 0.00012987220447284345,
360
- "loss": 0.6708,
361
  "step": 440
362
  },
363
  {
364
  "epoch": 1.44,
365
- "grad_norm": 2.6063146591186523,
366
- "learning_rate": 0.0001282747603833866,
367
- "loss": 0.7481,
368
  "step": 450
369
  },
370
  {
371
  "epoch": 1.47,
372
- "grad_norm": 2.329845428466797,
373
- "learning_rate": 0.00012667731629392972,
374
- "loss": 0.7341,
375
  "step": 460
376
  },
377
  {
378
  "epoch": 1.5,
379
- "grad_norm": 4.184568881988525,
380
- "learning_rate": 0.00012507987220447287,
381
- "loss": 0.785,
382
  "step": 470
383
  },
384
  {
385
  "epoch": 1.53,
386
- "grad_norm": 2.142719268798828,
387
- "learning_rate": 0.00012348242811501597,
388
- "loss": 0.702,
389
  "step": 480
390
  },
391
  {
392
  "epoch": 1.57,
393
- "grad_norm": 2.7038168907165527,
394
- "learning_rate": 0.0001218849840255591,
395
- "loss": 0.8301,
396
  "step": 490
397
  },
398
  {
399
  "epoch": 1.6,
400
- "grad_norm": 3.093721389770508,
401
- "learning_rate": 0.00012028753993610224,
402
- "loss": 0.7739,
403
  "step": 500
404
  },
405
  {
406
  "epoch": 1.6,
407
- "eval_loss": 1.0327022075653076,
408
  "eval_na_accuracy": 0.956,
409
- "eval_ordinal_accuracy": 0.568,
410
- "eval_ordinal_mae": 56.34840202213265,
411
- "eval_runtime": 20.2051,
412
- "eval_samples_per_second": 24.746,
413
- "eval_steps_per_second": 3.118,
414
  "step": 500
415
  },
416
  {
417
  "epoch": 1.63,
418
- "grad_norm": 3.727419853210449,
419
- "learning_rate": 0.00011869009584664536,
420
- "loss": 0.7536,
421
  "step": 510
422
  },
423
  {
424
  "epoch": 1.66,
425
- "grad_norm": 3.3148770332336426,
426
- "learning_rate": 0.00011709265175718851,
427
- "loss": 0.7675,
428
  "step": 520
429
  },
430
  {
431
  "epoch": 1.69,
432
- "grad_norm": 3.7952725887298584,
433
- "learning_rate": 0.00011549520766773163,
434
- "loss": 0.7543,
435
  "step": 530
436
  },
437
  {
438
  "epoch": 1.73,
439
- "grad_norm": 3.9252896308898926,
440
- "learning_rate": 0.00011389776357827477,
441
- "loss": 0.7754,
442
  "step": 540
443
  },
444
  {
445
  "epoch": 1.76,
446
- "grad_norm": 3.045405387878418,
447
- "learning_rate": 0.0001123003194888179,
448
- "loss": 0.5713,
449
  "step": 550
450
  },
451
  {
452
  "epoch": 1.79,
453
- "grad_norm": 1.8544007539749146,
454
- "learning_rate": 0.00011070287539936102,
455
- "loss": 0.5592,
456
  "step": 560
457
  },
458
  {
459
  "epoch": 1.82,
460
- "grad_norm": 3.626079797744751,
461
- "learning_rate": 0.00010910543130990417,
462
- "loss": 0.6631,
463
  "step": 570
464
  },
465
  {
466
  "epoch": 1.85,
467
- "grad_norm": 4.355627536773682,
468
- "learning_rate": 0.00010750798722044728,
469
- "loss": 0.5981,
470
  "step": 580
471
  },
472
  {
473
  "epoch": 1.88,
474
- "grad_norm": 4.025274276733398,
475
- "learning_rate": 0.00010591054313099043,
476
- "loss": 0.7375,
477
  "step": 590
478
  },
479
  {
480
  "epoch": 1.92,
481
- "grad_norm": 2.506023645401001,
482
- "learning_rate": 0.00010431309904153355,
483
- "loss": 0.5935,
484
  "step": 600
485
  },
486
  {
487
  "epoch": 1.92,
488
- "eval_loss": 1.0479341745376587,
489
- "eval_na_accuracy": 0.932,
490
- "eval_ordinal_accuracy": 0.598,
491
- "eval_ordinal_mae": 50.45202396943141,
492
- "eval_runtime": 20.1021,
493
- "eval_samples_per_second": 24.873,
494
- "eval_steps_per_second": 3.134,
495
  "step": 600
496
  },
497
  {
498
  "epoch": 1.95,
499
- "grad_norm": 4.494864463806152,
500
- "learning_rate": 0.00010271565495207669,
501
- "loss": 0.651,
502
  "step": 610
503
  },
504
  {
505
  "epoch": 1.98,
506
- "grad_norm": 4.595503807067871,
507
- "learning_rate": 0.00010111821086261981,
508
- "loss": 0.6743,
509
  "step": 620
510
  },
511
  {
512
  "epoch": 2.01,
513
- "grad_norm": 3.0126616954803467,
514
- "learning_rate": 9.952076677316294e-05,
515
- "loss": 0.5441,
516
  "step": 630
517
  },
518
  {
519
  "epoch": 2.04,
520
- "grad_norm": 2.2599124908447266,
521
- "learning_rate": 9.792332268370608e-05,
522
- "loss": 0.3816,
523
  "step": 640
524
  },
525
  {
526
  "epoch": 2.08,
527
- "grad_norm": 1.8868279457092285,
528
- "learning_rate": 9.63258785942492e-05,
529
- "loss": 0.4175,
530
  "step": 650
531
  },
532
  {
533
  "epoch": 2.11,
534
- "grad_norm": 2.2594425678253174,
535
- "learning_rate": 9.472843450479234e-05,
536
- "loss": 0.3208,
537
  "step": 660
538
  },
539
  {
540
  "epoch": 2.14,
541
- "grad_norm": 1.4436126947402954,
542
- "learning_rate": 9.313099041533548e-05,
543
- "loss": 0.2459,
544
  "step": 670
545
  },
546
  {
547
  "epoch": 2.17,
548
- "grad_norm": 2.018686532974243,
549
- "learning_rate": 9.15335463258786e-05,
550
- "loss": 0.2556,
551
  "step": 680
552
  },
553
  {
554
  "epoch": 2.2,
555
- "grad_norm": 3.8970091342926025,
556
- "learning_rate": 8.993610223642172e-05,
557
- "loss": 0.2912,
558
  "step": 690
559
  },
560
  {
561
  "epoch": 2.24,
562
- "grad_norm": 7.18654727935791,
563
- "learning_rate": 8.833865814696486e-05,
564
- "loss": 0.3525,
565
  "step": 700
566
  },
567
  {
568
  "epoch": 2.24,
569
- "eval_loss": 1.1914987564086914,
570
- "eval_na_accuracy": 0.94,
571
- "eval_ordinal_accuracy": 0.578,
572
- "eval_ordinal_mae": 68.50985479798727,
573
- "eval_runtime": 20.035,
574
- "eval_samples_per_second": 24.956,
575
- "eval_steps_per_second": 3.145,
576
  "step": 700
577
  },
578
  {
579
  "epoch": 2.27,
580
- "grad_norm": 3.8287854194641113,
581
- "learning_rate": 8.6741214057508e-05,
582
- "loss": 0.3517,
583
  "step": 710
584
  },
585
  {
586
  "epoch": 2.3,
587
- "grad_norm": 4.602190971374512,
588
- "learning_rate": 8.514376996805112e-05,
589
- "loss": 0.3595,
590
  "step": 720
591
  },
592
  {
593
  "epoch": 2.33,
594
- "grad_norm": 4.8052978515625,
595
- "learning_rate": 8.354632587859425e-05,
596
- "loss": 0.3686,
597
  "step": 730
598
  },
599
  {
600
  "epoch": 2.36,
601
- "grad_norm": 1.684538722038269,
602
- "learning_rate": 8.194888178913739e-05,
603
- "loss": 0.3499,
604
  "step": 740
605
  },
606
  {
607
  "epoch": 2.4,
608
- "grad_norm": 2.9729549884796143,
609
- "learning_rate": 8.035143769968051e-05,
610
- "loss": 0.3116,
611
  "step": 750
612
  },
613
  {
614
  "epoch": 2.43,
615
- "grad_norm": 4.066229343414307,
616
- "learning_rate": 7.875399361022364e-05,
617
- "loss": 0.301,
618
  "step": 760
619
  },
620
  {
621
  "epoch": 2.46,
622
- "grad_norm": 4.297142505645752,
623
- "learning_rate": 7.715654952076677e-05,
624
- "loss": 0.3501,
625
  "step": 770
626
  },
627
  {
628
  "epoch": 2.49,
629
- "grad_norm": 3.189189910888672,
630
- "learning_rate": 7.555910543130991e-05,
631
- "loss": 0.4095,
632
  "step": 780
633
  },
634
  {
635
  "epoch": 2.52,
636
- "grad_norm": 4.354482173919678,
637
- "learning_rate": 7.396166134185304e-05,
638
- "loss": 0.2547,
639
  "step": 790
640
  },
641
  {
642
  "epoch": 2.56,
643
- "grad_norm": 2.2913010120391846,
644
- "learning_rate": 7.236421725239617e-05,
645
- "loss": 0.2385,
646
  "step": 800
647
  },
648
  {
649
  "epoch": 2.56,
650
- "eval_loss": 1.1303021907806396,
651
  "eval_na_accuracy": 0.948,
652
- "eval_ordinal_accuracy": 0.586,
653
- "eval_ordinal_mae": 43.02211729006702,
654
- "eval_runtime": 20.2963,
655
- "eval_samples_per_second": 24.635,
656
- "eval_steps_per_second": 3.104,
657
  "step": 800
658
  },
659
  {
660
  "epoch": 2.59,
661
- "grad_norm": 4.907588481903076,
662
- "learning_rate": 7.07667731629393e-05,
663
- "loss": 0.2578,
664
  "step": 810
665
  },
666
  {
667
  "epoch": 2.62,
668
- "grad_norm": 5.770906925201416,
669
- "learning_rate": 6.916932907348244e-05,
670
- "loss": 0.3875,
671
  "step": 820
672
  },
673
  {
674
  "epoch": 2.65,
675
- "grad_norm": 3.4392714500427246,
676
- "learning_rate": 6.757188498402556e-05,
677
- "loss": 0.3435,
678
  "step": 830
679
  },
680
  {
681
  "epoch": 2.68,
682
- "grad_norm": 1.785447359085083,
683
- "learning_rate": 6.597444089456869e-05,
684
- "loss": 0.2681,
685
  "step": 840
686
  },
687
  {
688
  "epoch": 2.72,
689
- "grad_norm": 3.527557373046875,
690
- "learning_rate": 6.437699680511182e-05,
691
- "loss": 0.3525,
692
  "step": 850
693
  },
694
  {
695
  "epoch": 2.75,
696
- "grad_norm": 0.4556908905506134,
697
- "learning_rate": 6.277955271565496e-05,
698
- "loss": 0.2462,
699
  "step": 860
700
  },
701
  {
702
  "epoch": 2.78,
703
- "grad_norm": 2.7756271362304688,
704
- "learning_rate": 6.118210862619808e-05,
705
- "loss": 0.3107,
706
  "step": 870
707
  },
708
  {
709
  "epoch": 2.81,
710
- "grad_norm": 4.53245735168457,
711
- "learning_rate": 5.958466453674122e-05,
712
- "loss": 0.319,
713
  "step": 880
714
  },
715
  {
716
  "epoch": 2.84,
717
- "grad_norm": 1.7556302547454834,
718
- "learning_rate": 5.7987220447284354e-05,
719
- "loss": 0.2754,
720
  "step": 890
721
  },
722
  {
723
  "epoch": 2.88,
724
- "grad_norm": 4.295087814331055,
725
- "learning_rate": 5.6389776357827484e-05,
726
- "loss": 0.3423,
727
  "step": 900
728
  },
729
  {
730
  "epoch": 2.88,
731
- "eval_loss": 1.1767209768295288,
732
- "eval_na_accuracy": 0.94,
733
- "eval_ordinal_accuracy": 0.604,
734
- "eval_ordinal_mae": 72.14369884862379,
735
- "eval_runtime": 19.1966,
736
- "eval_samples_per_second": 26.046,
737
- "eval_steps_per_second": 3.282,
738
  "step": 900
739
  },
740
  {
741
  "epoch": 2.91,
742
- "grad_norm": 3.532813549041748,
743
- "learning_rate": 5.479233226837061e-05,
744
- "loss": 0.2459,
745
  "step": 910
746
  },
747
  {
748
  "epoch": 2.94,
749
- "grad_norm": 2.6231563091278076,
750
- "learning_rate": 5.3194888178913736e-05,
751
- "loss": 0.2713,
752
  "step": 920
753
  },
754
  {
755
  "epoch": 2.97,
756
- "grad_norm": 1.7921823263168335,
757
- "learning_rate": 5.159744408945687e-05,
758
- "loss": 0.1915,
759
  "step": 930
760
  },
761
  {
762
  "epoch": 3.0,
763
- "grad_norm": 0.42470985651016235,
764
- "learning_rate": 5e-05,
765
- "loss": 0.2758,
766
  "step": 940
767
  },
768
  {
769
  "epoch": 3.04,
770
- "grad_norm": 2.1311330795288086,
771
- "learning_rate": 4.840255591054313e-05,
772
- "loss": 0.1447,
773
  "step": 950
774
  },
775
  {
776
  "epoch": 3.07,
777
- "grad_norm": 2.371814489364624,
778
- "learning_rate": 4.680511182108626e-05,
779
- "loss": 0.0985,
780
  "step": 960
781
  },
782
  {
783
  "epoch": 3.1,
784
- "grad_norm": 0.3315708637237549,
785
- "learning_rate": 4.520766773162939e-05,
786
- "loss": 0.0887,
787
  "step": 970
788
  },
789
  {
790
  "epoch": 3.13,
791
- "grad_norm": 5.331906795501709,
792
- "learning_rate": 4.361022364217253e-05,
793
- "loss": 0.1125,
794
  "step": 980
795
  },
796
  {
797
  "epoch": 3.16,
798
- "grad_norm": 1.143639326095581,
799
- "learning_rate": 4.201277955271566e-05,
800
- "loss": 0.1029,
801
  "step": 990
802
  },
803
  {
804
  "epoch": 3.19,
805
- "grad_norm": 0.28226566314697266,
806
- "learning_rate": 4.041533546325879e-05,
807
- "loss": 0.0674,
808
  "step": 1000
809
  },
810
  {
811
  "epoch": 3.19,
812
- "eval_loss": 1.2294036149978638,
813
- "eval_na_accuracy": 0.938,
814
- "eval_ordinal_accuracy": 0.606,
815
- "eval_ordinal_mae": 28.07016432112176,
816
- "eval_runtime": 20.2957,
817
- "eval_samples_per_second": 24.636,
818
- "eval_steps_per_second": 3.104,
819
  "step": 1000
820
  },
821
  {
822
  "epoch": 3.23,
823
- "grad_norm": 0.1891467422246933,
824
- "learning_rate": 3.8817891373801916e-05,
825
- "loss": 0.0911,
826
  "step": 1010
827
  },
828
  {
829
  "epoch": 3.26,
830
- "grad_norm": 0.14374223351478577,
831
- "learning_rate": 3.722044728434505e-05,
832
- "loss": 0.1007,
833
  "step": 1020
834
  },
835
  {
836
  "epoch": 3.29,
837
- "grad_norm": 0.22984538972377777,
838
- "learning_rate": 3.562300319488818e-05,
839
- "loss": 0.0686,
840
  "step": 1030
841
  },
842
  {
843
  "epoch": 3.32,
844
- "grad_norm": 6.988112926483154,
845
- "learning_rate": 3.402555910543131e-05,
846
- "loss": 0.0882,
847
  "step": 1040
848
  },
849
  {
850
  "epoch": 3.35,
851
- "grad_norm": 0.12949563562870026,
852
- "learning_rate": 3.242811501597444e-05,
853
- "loss": 0.1098,
854
  "step": 1050
855
  },
856
  {
857
  "epoch": 3.39,
858
- "grad_norm": 0.40354716777801514,
859
- "learning_rate": 3.083067092651757e-05,
860
- "loss": 0.0809,
861
  "step": 1060
862
  },
863
  {
864
  "epoch": 3.42,
865
- "grad_norm": 0.6469861268997192,
866
- "learning_rate": 2.9233226837060707e-05,
867
- "loss": 0.0998,
868
  "step": 1070
869
  },
870
  {
871
  "epoch": 3.45,
872
- "grad_norm": 2.1838839054107666,
873
- "learning_rate": 2.7635782747603834e-05,
874
- "loss": 0.0923,
875
  "step": 1080
876
  },
877
  {
878
  "epoch": 3.48,
879
- "grad_norm": 1.0453554391860962,
880
- "learning_rate": 2.6038338658146967e-05,
881
- "loss": 0.0669,
882
  "step": 1090
883
  },
884
  {
885
  "epoch": 3.51,
886
- "grad_norm": 0.6662182807922363,
887
- "learning_rate": 2.44408945686901e-05,
888
- "loss": 0.1206,
889
  "step": 1100
890
  },
891
  {
892
  "epoch": 3.51,
893
- "eval_loss": 1.2335716485977173,
894
- "eval_na_accuracy": 0.938,
895
- "eval_ordinal_accuracy": 0.616,
896
- "eval_ordinal_mae": 65.07940023336745,
897
- "eval_runtime": 20.0687,
898
- "eval_samples_per_second": 24.914,
899
- "eval_steps_per_second": 3.139,
900
  "step": 1100
901
  },
902
  {
903
  "epoch": 3.55,
904
- "grad_norm": 3.52840256690979,
905
- "learning_rate": 2.284345047923323e-05,
906
- "loss": 0.1402,
907
  "step": 1110
908
  },
909
  {
910
  "epoch": 3.58,
911
- "grad_norm": 0.1548507809638977,
912
- "learning_rate": 2.124600638977636e-05,
913
- "loss": 0.1038,
914
  "step": 1120
915
  },
916
  {
917
  "epoch": 3.61,
918
- "grad_norm": 3.753204584121704,
919
- "learning_rate": 1.964856230031949e-05,
920
- "loss": 0.1237,
921
  "step": 1130
922
  },
923
  {
924
  "epoch": 3.64,
925
- "grad_norm": 2.5928821563720703,
926
- "learning_rate": 1.805111821086262e-05,
927
- "loss": 0.1653,
928
  "step": 1140
929
  },
930
  {
931
  "epoch": 3.67,
932
- "grad_norm": 0.26777195930480957,
933
- "learning_rate": 1.645367412140575e-05,
934
- "loss": 0.0924,
935
  "step": 1150
936
  },
937
  {
938
  "epoch": 3.71,
939
- "grad_norm": 0.11366520822048187,
940
- "learning_rate": 1.485623003194888e-05,
941
- "loss": 0.0376,
942
  "step": 1160
943
  },
944
  {
945
  "epoch": 3.74,
946
- "grad_norm": 2.3529770374298096,
947
- "learning_rate": 1.3258785942492014e-05,
948
- "loss": 0.1034,
949
  "step": 1170
950
  },
951
  {
952
  "epoch": 3.77,
953
- "grad_norm": 0.16830460727214813,
954
- "learning_rate": 1.1661341853035145e-05,
955
- "loss": 0.107,
956
  "step": 1180
957
  },
958
  {
959
  "epoch": 3.8,
960
- "grad_norm": 0.9607967734336853,
961
- "learning_rate": 1.0063897763578276e-05,
962
- "loss": 0.0751,
963
  "step": 1190
964
  },
965
  {
966
  "epoch": 3.83,
967
- "grad_norm": 7.122812271118164,
968
- "learning_rate": 8.466453674121406e-06,
969
- "loss": 0.1261,
970
  "step": 1200
971
  },
972
  {
973
  "epoch": 3.83,
974
- "eval_loss": 1.2907226085662842,
975
- "eval_na_accuracy": 0.938,
976
- "eval_ordinal_accuracy": 0.604,
977
- "eval_ordinal_mae": 45.83343084012577,
978
- "eval_runtime": 20.016,
979
- "eval_samples_per_second": 24.98,
980
- "eval_steps_per_second": 3.147,
981
  "step": 1200
982
  },
983
  {
984
  "epoch": 3.83,
985
  "step": 1200,
986
  "total_flos": 1.4860396665534874e+18,
987
- "train_loss": 0.6305676238735517,
988
- "train_runtime": 1636.5913,
989
- "train_samples_per_second": 12.221,
990
- "train_steps_per_second": 0.765
991
  }
992
  ],
993
  "logging_steps": 10,
 
1
  {
2
+ "best_metric": 0.2552177309989929,
3
+ "best_model_checkpoint": "./ryan_model3272024/checkpoint-600",
4
  "epoch": 3.8338658146964857,
5
  "eval_steps": 100,
6
  "global_step": 1200,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.03,
13
+ "grad_norm": 1.4023665189743042,
14
  "learning_rate": 0.00019840255591054313,
15
+ "loss": 0.5486,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.06,
20
+ "grad_norm": 1.2863692045211792,
21
  "learning_rate": 0.00019680511182108628,
22
+ "loss": 0.4543,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.1,
27
+ "grad_norm": 0.8842328190803528,
28
  "learning_rate": 0.0001952076677316294,
29
+ "loss": 0.4222,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.13,
34
+ "grad_norm": 0.8728455901145935,
35
  "learning_rate": 0.00019361022364217253,
36
+ "loss": 0.3764,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.16,
41
+ "grad_norm": 0.6641435027122498,
42
  "learning_rate": 0.00019201277955271565,
43
+ "loss": 0.3214,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.19,
48
+ "grad_norm": 1.4344050884246826,
49
  "learning_rate": 0.0001904153354632588,
50
+ "loss": 0.3286,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.22,
55
+ "grad_norm": 0.8919397592544556,
56
  "learning_rate": 0.00018881789137380192,
57
+ "loss": 0.33,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.26,
62
+ "grad_norm": 1.7052876949310303,
63
  "learning_rate": 0.00018722044728434505,
64
+ "loss": 0.3337,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.29,
69
+ "grad_norm": 0.4728272259235382,
70
  "learning_rate": 0.0001856230031948882,
71
+ "loss": 0.3784,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.32,
76
+ "grad_norm": 1.1663854122161865,
77
  "learning_rate": 0.00018402555910543132,
78
+ "loss": 0.3853,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.32,
83
+ "eval_loss": 0.3272034823894501,
84
+ "eval_na_accuracy": 0.924,
85
+ "eval_ordinal_accuracy": 0.52,
86
+ "eval_ordinal_mae": 1.210578082634343,
87
+ "eval_runtime": 52.9914,
88
+ "eval_samples_per_second": 9.435,
89
+ "eval_steps_per_second": 1.189,
90
  "step": 100
91
  },
92
  {
93
  "epoch": 0.35,
94
+ "grad_norm": 0.8579528331756592,
95
  "learning_rate": 0.00018242811501597444,
96
+ "loss": 0.3585,
97
  "step": 110
98
  },
99
  {
100
  "epoch": 0.38,
101
+ "grad_norm": 1.02351975440979,
102
  "learning_rate": 0.00018083067092651756,
103
+ "loss": 0.3621,
104
  "step": 120
105
  },
106
  {
107
  "epoch": 0.42,
108
+ "grad_norm": 1.3286011219024658,
109
  "learning_rate": 0.00017923322683706071,
110
+ "loss": 0.3714,
111
  "step": 130
112
  },
113
  {
114
  "epoch": 0.45,
115
+ "grad_norm": 0.6290095448493958,
116
  "learning_rate": 0.00017763578274760384,
117
+ "loss": 0.3275,
118
  "step": 140
119
  },
120
  {
121
  "epoch": 0.48,
122
+ "grad_norm": 1.269338846206665,
123
  "learning_rate": 0.000176038338658147,
124
+ "loss": 0.4287,
125
  "step": 150
126
  },
127
  {
128
  "epoch": 0.51,
129
+ "grad_norm": 0.6244733333587646,
130
  "learning_rate": 0.0001744408945686901,
131
+ "loss": 0.3067,
132
  "step": 160
133
  },
134
  {
135
  "epoch": 0.54,
136
+ "grad_norm": 1.1287596225738525,
137
  "learning_rate": 0.00017284345047923323,
138
+ "loss": 0.2982,
139
  "step": 170
140
  },
141
  {
142
  "epoch": 0.58,
143
+ "grad_norm": 1.436303734779358,
144
  "learning_rate": 0.00017124600638977638,
145
+ "loss": 0.2946,
146
  "step": 180
147
  },
148
  {
149
  "epoch": 0.61,
150
+ "grad_norm": 0.8159350752830505,
151
  "learning_rate": 0.00016964856230031948,
152
+ "loss": 0.3514,
153
  "step": 190
154
  },
155
  {
156
  "epoch": 0.64,
157
+ "grad_norm": 0.7363901138305664,
158
  "learning_rate": 0.00016805111821086263,
159
+ "loss": 0.3396,
160
  "step": 200
161
  },
162
  {
163
  "epoch": 0.64,
164
+ "eval_loss": 0.27412503957748413,
165
+ "eval_na_accuracy": 0.94,
166
+ "eval_ordinal_accuracy": 0.5644444444444444,
167
+ "eval_ordinal_mae": 1.1640199238227473,
168
+ "eval_runtime": 21.3186,
169
+ "eval_samples_per_second": 23.454,
170
+ "eval_steps_per_second": 2.955,
171
  "step": 200
172
  },
173
  {
174
  "epoch": 0.67,
175
+ "grad_norm": 0.6321592330932617,
176
  "learning_rate": 0.00016645367412140575,
177
+ "loss": 0.3952,
178
  "step": 210
179
  },
180
  {
181
  "epoch": 0.7,
182
+ "grad_norm": 0.6153714656829834,
183
  "learning_rate": 0.0001648562300319489,
184
+ "loss": 0.2947,
185
  "step": 220
186
  },
187
  {
188
  "epoch": 0.73,
189
+ "grad_norm": 1.3031296730041504,
190
  "learning_rate": 0.00016325878594249202,
191
+ "loss": 0.3556,
192
  "step": 230
193
  },
194
  {
195
  "epoch": 0.77,
196
+ "grad_norm": 1.058060646057129,
197
  "learning_rate": 0.00016166134185303515,
198
+ "loss": 0.3432,
199
  "step": 240
200
  },
201
  {
202
  "epoch": 0.8,
203
+ "grad_norm": 0.957135796546936,
204
  "learning_rate": 0.0001600638977635783,
205
+ "loss": 0.3675,
206
  "step": 250
207
  },
208
  {
209
  "epoch": 0.83,
210
+ "grad_norm": 1.6347941160202026,
211
  "learning_rate": 0.00015846645367412142,
212
+ "loss": 0.3008,
213
  "step": 260
214
  },
215
  {
216
  "epoch": 0.86,
217
+ "grad_norm": 1.1190528869628906,
218
  "learning_rate": 0.00015686900958466454,
219
+ "loss": 0.2944,
220
  "step": 270
221
  },
222
  {
223
  "epoch": 0.89,
224
+ "grad_norm": 0.8016924858093262,
225
  "learning_rate": 0.00015527156549520767,
226
+ "loss": 0.2361,
227
  "step": 280
228
  },
229
  {
230
  "epoch": 0.93,
231
+ "grad_norm": 1.3622130155563354,
232
  "learning_rate": 0.00015367412140575082,
233
+ "loss": 0.3569,
234
  "step": 290
235
  },
236
  {
237
  "epoch": 0.96,
238
+ "grad_norm": 0.6603774428367615,
239
  "learning_rate": 0.00015207667731629394,
240
+ "loss": 0.2075,
241
  "step": 300
242
  },
243
  {
244
  "epoch": 0.96,
245
+ "eval_loss": 0.2772314250469208,
246
  "eval_na_accuracy": 0.946,
247
+ "eval_ordinal_accuracy": 0.5933333333333334,
248
+ "eval_ordinal_mae": 1.194209214001894,
249
+ "eval_runtime": 20.7347,
250
+ "eval_samples_per_second": 24.114,
251
+ "eval_steps_per_second": 3.038,
252
  "step": 300
253
  },
254
  {
255
  "epoch": 0.99,
256
+ "grad_norm": 1.3968242406845093,
257
+ "learning_rate": 0.00015047923322683706,
258
+ "loss": 0.2232,
259
  "step": 310
260
  },
261
  {
262
  "epoch": 1.02,
263
+ "grad_norm": 0.7815521359443665,
264
+ "learning_rate": 0.0001488817891373802,
265
+ "loss": 0.3132,
266
  "step": 320
267
  },
268
  {
269
  "epoch": 1.05,
270
+ "grad_norm": 1.1288195848464966,
271
+ "learning_rate": 0.00014728434504792333,
272
+ "loss": 0.255,
273
  "step": 330
274
  },
275
  {
276
  "epoch": 1.09,
277
+ "grad_norm": 0.7704196572303772,
278
+ "learning_rate": 0.00014568690095846646,
279
+ "loss": 0.2415,
280
  "step": 340
281
  },
282
  {
283
  "epoch": 1.12,
284
+ "grad_norm": 1.9226877689361572,
285
+ "learning_rate": 0.00014408945686900958,
286
+ "loss": 0.1975,
287
  "step": 350
288
  },
289
  {
290
  "epoch": 1.15,
291
+ "grad_norm": 0.5694310069084167,
292
+ "learning_rate": 0.00014249201277955273,
293
+ "loss": 0.1722,
294
  "step": 360
295
  },
296
  {
297
  "epoch": 1.18,
298
+ "grad_norm": 1.719147801399231,
299
+ "learning_rate": 0.00014089456869009585,
300
+ "loss": 0.2175,
301
  "step": 370
302
  },
303
  {
304
  "epoch": 1.21,
305
+ "grad_norm": 0.9247463941574097,
306
+ "learning_rate": 0.000139297124600639,
307
+ "loss": 0.2088,
308
  "step": 380
309
  },
310
  {
311
  "epoch": 1.25,
312
+ "grad_norm": 1.0941154956817627,
313
+ "learning_rate": 0.00013769968051118212,
314
+ "loss": 0.2854,
315
  "step": 390
316
  },
317
  {
318
  "epoch": 1.28,
319
+ "grad_norm": 1.0274015665054321,
320
+ "learning_rate": 0.00013610223642172525,
321
+ "loss": 0.196,
322
  "step": 400
323
  },
324
  {
325
  "epoch": 1.28,
326
+ "eval_loss": 0.273777574300766,
327
+ "eval_na_accuracy": 0.95,
328
+ "eval_ordinal_accuracy": 0.6133333333333333,
329
+ "eval_ordinal_mae": 1.198390154937903,
330
+ "eval_runtime": 20.9145,
331
+ "eval_samples_per_second": 23.907,
332
+ "eval_steps_per_second": 3.012,
333
  "step": 400
334
  },
335
  {
336
  "epoch": 1.31,
337
+ "grad_norm": 2.912687063217163,
338
+ "learning_rate": 0.00013450479233226837,
339
+ "loss": 0.2156,
340
  "step": 410
341
  },
342
  {
343
  "epoch": 1.34,
344
+ "grad_norm": 0.6906268000602722,
345
+ "learning_rate": 0.0001329073482428115,
346
+ "loss": 0.1366,
347
  "step": 420
348
  },
349
  {
350
  "epoch": 1.37,
351
+ "grad_norm": 0.43070048093795776,
352
+ "learning_rate": 0.00013130990415335464,
353
+ "loss": 0.2174,
354
  "step": 430
355
  },
356
  {
357
  "epoch": 1.41,
358
+ "grad_norm": 0.5173763632774353,
359
+ "learning_rate": 0.00012971246006389777,
360
+ "loss": 0.2016,
361
  "step": 440
362
  },
363
  {
364
  "epoch": 1.44,
365
+ "grad_norm": 1.04314386844635,
366
+ "learning_rate": 0.00012811501597444092,
367
+ "loss": 0.2233,
368
  "step": 450
369
  },
370
  {
371
  "epoch": 1.47,
372
+ "grad_norm": 0.523073673248291,
373
+ "learning_rate": 0.00012651757188498404,
374
+ "loss": 0.2231,
375
  "step": 460
376
  },
377
  {
378
  "epoch": 1.5,
379
+ "grad_norm": 3.259795904159546,
380
+ "learning_rate": 0.00012492012779552716,
381
+ "loss": 0.2366,
382
  "step": 470
383
  },
384
  {
385
  "epoch": 1.53,
386
+ "grad_norm": 0.6846562027931213,
387
+ "learning_rate": 0.00012332268370607028,
388
+ "loss": 0.2144,
389
  "step": 480
390
  },
391
  {
392
  "epoch": 1.57,
393
+ "grad_norm": 1.2122007608413696,
394
+ "learning_rate": 0.00012172523961661342,
395
+ "loss": 0.2938,
396
  "step": 490
397
  },
398
  {
399
  "epoch": 1.6,
400
+ "grad_norm": 1.3790067434310913,
401
+ "learning_rate": 0.00012012779552715656,
402
+ "loss": 0.2228,
403
  "step": 500
404
  },
405
  {
406
  "epoch": 1.6,
407
+ "eval_loss": 0.26852139830589294,
408
  "eval_na_accuracy": 0.956,
409
+ "eval_ordinal_accuracy": 0.62,
410
+ "eval_ordinal_mae": 1.1989026491012837,
411
+ "eval_runtime": 20.0158,
412
+ "eval_samples_per_second": 24.98,
413
+ "eval_steps_per_second": 3.148,
414
  "step": 500
415
  },
416
  {
417
  "epoch": 1.63,
418
+ "grad_norm": 0.7108421921730042,
419
+ "learning_rate": 0.00011853035143769968,
420
+ "loss": 0.1916,
421
  "step": 510
422
  },
423
  {
424
  "epoch": 1.66,
425
+ "grad_norm": 0.42910462617874146,
426
+ "learning_rate": 0.00011693290734824283,
427
+ "loss": 0.2478,
428
  "step": 520
429
  },
430
  {
431
  "epoch": 1.69,
432
+ "grad_norm": 0.9730465412139893,
433
+ "learning_rate": 0.00011533546325878595,
434
+ "loss": 0.189,
435
  "step": 530
436
  },
437
  {
438
  "epoch": 1.73,
439
+ "grad_norm": 0.9566612243652344,
440
+ "learning_rate": 0.00011373801916932908,
441
+ "loss": 0.1768,
442
  "step": 540
443
  },
444
  {
445
  "epoch": 1.76,
446
+ "grad_norm": 0.5167070627212524,
447
+ "learning_rate": 0.00011214057507987221,
448
+ "loss": 0.1385,
449
  "step": 550
450
  },
451
  {
452
  "epoch": 1.79,
453
+ "grad_norm": 0.5880122780799866,
454
+ "learning_rate": 0.00011054313099041533,
455
+ "loss": 0.1262,
456
  "step": 560
457
  },
458
  {
459
  "epoch": 1.82,
460
+ "grad_norm": 1.202286720275879,
461
+ "learning_rate": 0.00010894568690095847,
462
+ "loss": 0.1721,
463
  "step": 570
464
  },
465
  {
466
  "epoch": 1.85,
467
+ "grad_norm": 2.6997601985931396,
468
+ "learning_rate": 0.0001073482428115016,
469
+ "loss": 0.2128,
470
  "step": 580
471
  },
472
  {
473
  "epoch": 1.88,
474
+ "grad_norm": 1.1591830253601074,
475
+ "learning_rate": 0.00010575079872204474,
476
+ "loss": 0.2402,
477
  "step": 590
478
  },
479
  {
480
  "epoch": 1.92,
481
+ "grad_norm": 0.5840221643447876,
482
+ "learning_rate": 0.00010415335463258787,
483
+ "loss": 0.1816,
484
  "step": 600
485
  },
486
  {
487
  "epoch": 1.92,
488
+ "eval_loss": 0.2552177309989929,
489
+ "eval_na_accuracy": 0.95,
490
+ "eval_ordinal_accuracy": 0.6266666666666667,
491
+ "eval_ordinal_mae": 1.158560517811113,
492
+ "eval_runtime": 19.5011,
493
+ "eval_samples_per_second": 25.64,
494
+ "eval_steps_per_second": 3.231,
495
  "step": 600
496
  },
497
  {
498
  "epoch": 1.95,
499
+ "grad_norm": 0.7560299634933472,
500
+ "learning_rate": 0.000102555910543131,
501
+ "loss": 0.2021,
502
  "step": 610
503
  },
504
  {
505
  "epoch": 1.98,
506
+ "grad_norm": 1.8860361576080322,
507
+ "learning_rate": 0.00010095846645367413,
508
+ "loss": 0.2092,
509
  "step": 620
510
  },
511
  {
512
  "epoch": 2.01,
513
+ "grad_norm": 0.7235255837440491,
514
+ "learning_rate": 9.936102236421726e-05,
515
+ "loss": 0.1131,
516
  "step": 630
517
  },
518
  {
519
  "epoch": 2.04,
520
+ "grad_norm": 0.3656529486179352,
521
+ "learning_rate": 9.77635782747604e-05,
522
+ "loss": 0.0867,
523
  "step": 640
524
  },
525
  {
526
  "epoch": 2.08,
527
+ "grad_norm": 0.3450271785259247,
528
+ "learning_rate": 9.616613418530351e-05,
529
+ "loss": 0.0903,
530
  "step": 650
531
  },
532
  {
533
  "epoch": 2.11,
534
+ "grad_norm": 1.0603750944137573,
535
+ "learning_rate": 9.456869009584664e-05,
536
+ "loss": 0.1234,
537
  "step": 660
538
  },
539
  {
540
  "epoch": 2.14,
541
+ "grad_norm": 0.6790297031402588,
542
+ "learning_rate": 9.297124600638978e-05,
543
+ "loss": 0.0936,
544
  "step": 670
545
  },
546
  {
547
  "epoch": 2.17,
548
+ "grad_norm": 0.5596363544464111,
549
+ "learning_rate": 9.137380191693292e-05,
550
+ "loss": 0.0651,
551
  "step": 680
552
  },
553
  {
554
  "epoch": 2.2,
555
+ "grad_norm": 0.5989049673080444,
556
+ "learning_rate": 8.977635782747604e-05,
557
+ "loss": 0.1218,
558
  "step": 690
559
  },
560
  {
561
  "epoch": 2.24,
562
+ "grad_norm": 0.9003208875656128,
563
+ "learning_rate": 8.817891373801918e-05,
564
+ "loss": 0.0682,
565
  "step": 700
566
  },
567
  {
568
  "epoch": 2.24,
569
+ "eval_loss": 0.27212005853652954,
570
+ "eval_na_accuracy": 0.952,
571
+ "eval_ordinal_accuracy": 0.6577777777777778,
572
+ "eval_ordinal_mae": 1.1557789803379113,
573
+ "eval_runtime": 19.5966,
574
+ "eval_samples_per_second": 25.515,
575
+ "eval_steps_per_second": 3.215,
576
  "step": 700
577
  },
578
  {
579
  "epoch": 2.27,
580
+ "grad_norm": 0.6663013100624084,
581
+ "learning_rate": 8.658146964856231e-05,
582
+ "loss": 0.0714,
583
  "step": 710
584
  },
585
  {
586
  "epoch": 2.3,
587
+ "grad_norm": 1.0458776950836182,
588
+ "learning_rate": 8.498402555910544e-05,
589
+ "loss": 0.102,
590
  "step": 720
591
  },
592
  {
593
  "epoch": 2.33,
594
+ "grad_norm": 0.9246501922607422,
595
+ "learning_rate": 8.338658146964856e-05,
596
+ "loss": 0.1623,
597
  "step": 730
598
  },
599
  {
600
  "epoch": 2.36,
601
+ "grad_norm": 1.0837684869766235,
602
+ "learning_rate": 8.17891373801917e-05,
603
+ "loss": 0.0934,
604
  "step": 740
605
  },
606
  {
607
  "epoch": 2.4,
608
+ "grad_norm": 0.564241349697113,
609
+ "learning_rate": 8.019169329073483e-05,
610
+ "loss": 0.0853,
611
  "step": 750
612
  },
613
  {
614
  "epoch": 2.43,
615
+ "grad_norm": 4.335838794708252,
616
+ "learning_rate": 7.859424920127795e-05,
617
+ "loss": 0.1246,
618
  "step": 760
619
  },
620
  {
621
  "epoch": 2.46,
622
+ "grad_norm": 0.957082211971283,
623
+ "learning_rate": 7.699680511182109e-05,
624
+ "loss": 0.1292,
625
  "step": 770
626
  },
627
  {
628
  "epoch": 2.49,
629
+ "grad_norm": 0.9633702039718628,
630
+ "learning_rate": 7.539936102236423e-05,
631
+ "loss": 0.1916,
632
  "step": 780
633
  },
634
  {
635
  "epoch": 2.52,
636
+ "grad_norm": 0.7254676222801208,
637
+ "learning_rate": 7.380191693290735e-05,
638
+ "loss": 0.1054,
639
  "step": 790
640
  },
641
  {
642
  "epoch": 2.56,
643
+ "grad_norm": 0.5885197520256042,
644
+ "learning_rate": 7.220447284345049e-05,
645
+ "loss": 0.0795,
646
  "step": 800
647
  },
648
  {
649
  "epoch": 2.56,
650
+ "eval_loss": 0.2753521502017975,
651
  "eval_na_accuracy": 0.948,
652
+ "eval_ordinal_accuracy": 0.6333333333333333,
653
+ "eval_ordinal_mae": 1.1599188842872779,
654
+ "eval_runtime": 20.0506,
655
+ "eval_samples_per_second": 24.937,
656
+ "eval_steps_per_second": 3.142,
657
  "step": 800
658
  },
659
  {
660
  "epoch": 2.59,
661
+ "grad_norm": 0.5671622157096863,
662
+ "learning_rate": 7.060702875399361e-05,
663
+ "loss": 0.0948,
664
  "step": 810
665
  },
666
  {
667
  "epoch": 2.62,
668
+ "grad_norm": 0.9914100766181946,
669
+ "learning_rate": 6.900958466453674e-05,
670
+ "loss": 0.0715,
671
  "step": 820
672
  },
673
  {
674
  "epoch": 2.65,
675
+ "grad_norm": 0.4819205105304718,
676
+ "learning_rate": 6.741214057507987e-05,
677
+ "loss": 0.0839,
678
  "step": 830
679
  },
680
  {
681
  "epoch": 2.68,
682
+ "grad_norm": 0.3811684250831604,
683
+ "learning_rate": 6.5814696485623e-05,
684
+ "loss": 0.0825,
685
  "step": 840
686
  },
687
  {
688
  "epoch": 2.72,
689
+ "grad_norm": 0.9750994443893433,
690
+ "learning_rate": 6.421725239616614e-05,
691
+ "loss": 0.0968,
692
  "step": 850
693
  },
694
  {
695
  "epoch": 2.75,
696
+ "grad_norm": 0.35765138268470764,
697
+ "learning_rate": 6.261980830670928e-05,
698
+ "loss": 0.1605,
699
  "step": 860
700
  },
701
  {
702
  "epoch": 2.78,
703
+ "grad_norm": 0.3497343361377716,
704
+ "learning_rate": 6.1022364217252406e-05,
705
+ "loss": 0.0933,
706
  "step": 870
707
  },
708
  {
709
  "epoch": 2.81,
710
+ "grad_norm": 0.4838835299015045,
711
+ "learning_rate": 5.942492012779552e-05,
712
+ "loss": 0.0859,
713
  "step": 880
714
  },
715
  {
716
  "epoch": 2.84,
717
+ "grad_norm": 0.7002846002578735,
718
+ "learning_rate": 5.782747603833866e-05,
719
+ "loss": 0.1021,
720
  "step": 890
721
  },
722
  {
723
  "epoch": 2.88,
724
+ "grad_norm": 2.312203884124756,
725
+ "learning_rate": 5.623003194888179e-05,
726
+ "loss": 0.1367,
727
  "step": 900
728
  },
729
  {
730
  "epoch": 2.88,
731
+ "eval_loss": 0.29526129364967346,
732
+ "eval_na_accuracy": 0.946,
733
+ "eval_ordinal_accuracy": 0.64,
734
+ "eval_ordinal_mae": 1.166716830432415,
735
+ "eval_runtime": 20.0091,
736
+ "eval_samples_per_second": 24.989,
737
+ "eval_steps_per_second": 3.149,
738
  "step": 900
739
  },
740
  {
741
  "epoch": 2.91,
742
+ "grad_norm": 0.44126951694488525,
743
+ "learning_rate": 5.4632587859424925e-05,
744
+ "loss": 0.0854,
745
  "step": 910
746
  },
747
  {
748
  "epoch": 2.94,
749
+ "grad_norm": 1.0075191259384155,
750
+ "learning_rate": 5.3035143769968054e-05,
751
+ "loss": 0.0823,
752
  "step": 920
753
  },
754
  {
755
  "epoch": 2.97,
756
+ "grad_norm": 0.9991279244422913,
757
+ "learning_rate": 5.1437699680511184e-05,
758
+ "loss": 0.1156,
759
  "step": 930
760
  },
761
  {
762
  "epoch": 3.0,
763
+ "grad_norm": 0.8888081312179565,
764
+ "learning_rate": 4.984025559105431e-05,
765
+ "loss": 0.0876,
766
  "step": 940
767
  },
768
  {
769
  "epoch": 3.04,
770
+ "grad_norm": 0.3761376738548279,
771
+ "learning_rate": 4.824281150159744e-05,
772
+ "loss": 0.0452,
773
  "step": 950
774
  },
775
  {
776
  "epoch": 3.07,
777
+ "grad_norm": 0.365622341632843,
778
+ "learning_rate": 4.664536741214058e-05,
779
+ "loss": 0.0428,
780
  "step": 960
781
  },
782
  {
783
  "epoch": 3.1,
784
+ "grad_norm": 0.35657036304473877,
785
+ "learning_rate": 4.504792332268371e-05,
786
+ "loss": 0.033,
787
  "step": 970
788
  },
789
  {
790
  "epoch": 3.13,
791
+ "grad_norm": 0.5636401176452637,
792
+ "learning_rate": 4.345047923322684e-05,
793
+ "loss": 0.0356,
794
  "step": 980
795
  },
796
  {
797
  "epoch": 3.16,
798
+ "grad_norm": 0.431383341550827,
799
+ "learning_rate": 4.185303514376997e-05,
800
+ "loss": 0.0463,
801
  "step": 990
802
  },
803
  {
804
  "epoch": 3.19,
805
+ "grad_norm": 0.583328127861023,
806
+ "learning_rate": 4.0255591054313104e-05,
807
+ "loss": 0.0387,
808
  "step": 1000
809
  },
810
  {
811
  "epoch": 3.19,
812
+ "eval_loss": 0.2923290431499481,
813
+ "eval_na_accuracy": 0.944,
814
+ "eval_ordinal_accuracy": 0.6377777777777778,
815
+ "eval_ordinal_mae": 1.2024743282463815,
816
+ "eval_runtime": 19.3226,
817
+ "eval_samples_per_second": 25.876,
818
+ "eval_steps_per_second": 3.26,
819
  "step": 1000
820
  },
821
  {
822
  "epoch": 3.23,
823
+ "grad_norm": 2.440162420272827,
824
+ "learning_rate": 3.8658146964856234e-05,
825
+ "loss": 0.0607,
826
  "step": 1010
827
  },
828
  {
829
  "epoch": 3.26,
830
+ "grad_norm": 0.29546236991882324,
831
+ "learning_rate": 3.7060702875399364e-05,
832
+ "loss": 0.0515,
833
  "step": 1020
834
  },
835
  {
836
  "epoch": 3.29,
837
+ "grad_norm": 0.44689303636550903,
838
+ "learning_rate": 3.546325878594249e-05,
839
+ "loss": 0.0273,
840
  "step": 1030
841
  },
842
  {
843
  "epoch": 3.32,
844
+ "grad_norm": 0.3288978040218353,
845
+ "learning_rate": 3.386581469648562e-05,
846
+ "loss": 0.0352,
847
  "step": 1040
848
  },
849
  {
850
  "epoch": 3.35,
851
+ "grad_norm": 0.41706767678260803,
852
+ "learning_rate": 3.226837060702875e-05,
853
+ "loss": 0.0345,
854
  "step": 1050
855
  },
856
  {
857
  "epoch": 3.39,
858
+ "grad_norm": 0.31060507893562317,
859
+ "learning_rate": 3.067092651757188e-05,
860
+ "loss": 0.0294,
861
  "step": 1060
862
  },
863
  {
864
  "epoch": 3.42,
865
+ "grad_norm": 0.2541821599006653,
866
+ "learning_rate": 2.907348242811502e-05,
867
+ "loss": 0.0354,
868
  "step": 1070
869
  },
870
  {
871
  "epoch": 3.45,
872
+ "grad_norm": 0.574343740940094,
873
+ "learning_rate": 2.747603833865815e-05,
874
+ "loss": 0.0443,
875
  "step": 1080
876
  },
877
  {
878
  "epoch": 3.48,
879
+ "grad_norm": 0.47532182931900024,
880
+ "learning_rate": 2.5878594249201278e-05,
881
+ "loss": 0.0605,
882
  "step": 1090
883
  },
884
  {
885
  "epoch": 3.51,
886
+ "grad_norm": 0.45276594161987305,
887
+ "learning_rate": 2.428115015974441e-05,
888
+ "loss": 0.0293,
889
  "step": 1100
890
  },
891
  {
892
  "epoch": 3.51,
893
+ "eval_loss": 0.2884800434112549,
894
+ "eval_na_accuracy": 0.948,
895
+ "eval_ordinal_accuracy": 0.6644444444444444,
896
+ "eval_ordinal_mae": 1.1666180535654227,
897
+ "eval_runtime": 19.9365,
898
+ "eval_samples_per_second": 25.08,
899
+ "eval_steps_per_second": 3.16,
900
  "step": 1100
901
  },
902
  {
903
  "epoch": 3.55,
904
+ "grad_norm": 0.655549168586731,
905
+ "learning_rate": 2.268370607028754e-05,
906
+ "loss": 0.034,
907
  "step": 1110
908
  },
909
  {
910
  "epoch": 3.58,
911
+ "grad_norm": 0.16610193252563477,
912
+ "learning_rate": 2.108626198083067e-05,
913
+ "loss": 0.0319,
914
  "step": 1120
915
  },
916
  {
917
  "epoch": 3.61,
918
+ "grad_norm": 0.26889652013778687,
919
+ "learning_rate": 1.9488817891373803e-05,
920
+ "loss": 0.0479,
921
  "step": 1130
922
  },
923
  {
924
  "epoch": 3.64,
925
+ "grad_norm": 0.2418793886899948,
926
+ "learning_rate": 1.7891373801916932e-05,
927
+ "loss": 0.0322,
928
  "step": 1140
929
  },
930
  {
931
  "epoch": 3.67,
932
+ "grad_norm": 0.5379694104194641,
933
+ "learning_rate": 1.6293929712460065e-05,
934
+ "loss": 0.0393,
935
  "step": 1150
936
  },
937
  {
938
  "epoch": 3.71,
939
+ "grad_norm": 0.19815516471862793,
940
+ "learning_rate": 1.4696485623003195e-05,
941
+ "loss": 0.0217,
942
  "step": 1160
943
  },
944
  {
945
  "epoch": 3.74,
946
+ "grad_norm": 0.889312207698822,
947
+ "learning_rate": 1.3099041533546328e-05,
948
+ "loss": 0.0332,
949
  "step": 1170
950
  },
951
  {
952
  "epoch": 3.77,
953
+ "grad_norm": 0.2865816652774811,
954
+ "learning_rate": 1.1501597444089457e-05,
955
+ "loss": 0.0313,
956
  "step": 1180
957
  },
958
  {
959
  "epoch": 3.8,
960
+ "grad_norm": 0.5947129726409912,
961
+ "learning_rate": 9.904153354632589e-06,
962
+ "loss": 0.034,
963
  "step": 1190
964
  },
965
  {
966
  "epoch": 3.83,
967
+ "grad_norm": 0.44885268807411194,
968
+ "learning_rate": 8.306709265175718e-06,
969
+ "loss": 0.0286,
970
  "step": 1200
971
  },
972
  {
973
  "epoch": 3.83,
974
+ "eval_loss": 0.28681233525276184,
975
+ "eval_na_accuracy": 0.95,
976
+ "eval_ordinal_accuracy": 0.6711111111111111,
977
+ "eval_ordinal_mae": 1.1625636271304554,
978
+ "eval_runtime": 19.7259,
979
+ "eval_samples_per_second": 25.347,
980
+ "eval_steps_per_second": 3.194,
981
  "step": 1200
982
  },
983
  {
984
  "epoch": 3.83,
985
  "step": 1200,
986
  "total_flos": 1.4860396665534874e+18,
987
+ "train_loss": 0.17935538868109385,
988
+ "train_runtime": 1702.5744,
989
+ "train_samples_per_second": 11.747,
990
+ "train_steps_per_second": 0.735
991
  }
992
  ],
993
  "logging_steps": 10,