rshrott commited on
Commit
c809e8e
1 Parent(s): d352ff6

🍻 cheers

Browse files
README.md CHANGED
@@ -2,6 +2,7 @@
2
  license: apache-2.0
3
  base_model: google/vit-base-patch16-224-in21k
4
  tags:
 
5
  - generated_from_trainer
6
  model-index:
7
  - name: ryan_model3272024
@@ -13,12 +14,12 @@ should probably proofread and complete it, then remove this comment. -->
13
 
14
  # ryan_model3272024
15
 
16
- This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the None dataset.
17
  It achieves the following results on the evaluation set:
18
- - Loss: 0.3366
19
- - Na Accuracy: 0.5946
20
- - Ordinal Accuracy: 0.5171
21
- - Ordinal Mae: 0.6727
22
 
23
  ## Model description
24
 
 
2
  license: apache-2.0
3
  base_model: google/vit-base-patch16-224-in21k
4
  tags:
5
+ - image-classification
6
  - generated_from_trainer
7
  model-index:
8
  - name: ryan_model3272024
 
14
 
15
  # ryan_model3272024
16
 
17
+ This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the beans dataset.
18
  It achieves the following results on the evaluation set:
19
+ - Loss: 0.3037
20
+ - Na Accuracy: 0.7297
21
+ - Ordinal Accuracy: 0.5285
22
+ - Ordinal Mae: 0.6723
23
 
24
  ## Model description
25
 
all_results.json CHANGED
@@ -1,14 +1,14 @@
1
  {
2
- "epoch": 3.83,
3
- "eval_loss": 0.2552177309989929,
4
- "eval_na_accuracy": 0.95,
5
- "eval_ordinal_accuracy": 0.6266666666666667,
6
- "eval_ordinal_mae": 1.158560517811113,
7
- "eval_runtime": 19.6745,
8
- "eval_samples_per_second": 25.414,
9
- "eval_steps_per_second": 3.202,
10
- "train_loss": 0.17935538868109385,
11
- "train_runtime": 1702.5744,
12
- "train_samples_per_second": 11.747,
13
- "train_steps_per_second": 0.735
14
  }
 
1
  {
2
+ "epoch": 1.46,
3
+ "eval_loss": 0.3037484288215637,
4
+ "eval_na_accuracy": 0.7297297120094299,
5
+ "eval_ordinal_accuracy": 0.5285171270370483,
6
+ "eval_ordinal_mae": 0.6723113059997559,
7
+ "eval_runtime": 11.1069,
8
+ "eval_samples_per_second": 27.01,
9
+ "eval_steps_per_second": 3.421,
10
+ "train_loss": 0.32911812565543436,
11
+ "train_runtime": 761.2747,
12
+ "train_samples_per_second": 15.763,
13
+ "train_steps_per_second": 0.988
14
  }
eval_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "epoch": 3.83,
3
- "eval_loss": 0.2552177309989929,
4
- "eval_na_accuracy": 0.95,
5
- "eval_ordinal_accuracy": 0.6266666666666667,
6
- "eval_ordinal_mae": 1.158560517811113,
7
- "eval_runtime": 19.6745,
8
- "eval_samples_per_second": 25.414,
9
- "eval_steps_per_second": 3.202
10
  }
 
1
  {
2
+ "epoch": 1.46,
3
+ "eval_loss": 0.3037484288215637,
4
+ "eval_na_accuracy": 0.7297297120094299,
5
+ "eval_ordinal_accuracy": 0.5285171270370483,
6
+ "eval_ordinal_mae": 0.6723113059997559,
7
+ "eval_runtime": 11.1069,
8
+ "eval_samples_per_second": 27.01,
9
+ "eval_steps_per_second": 3.421
10
  }
runs/Mar27_22-12-30_ryanserver/events.out.tfevents.1711592832.ryanserver.19364.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98d46e1427ce12b776ce603a453568088afd1258d1e6b7d00e4608b6c15502c1
3
+ size 529
train_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "epoch": 3.83,
3
- "train_loss": 0.17935538868109385,
4
- "train_runtime": 1702.5744,
5
- "train_samples_per_second": 11.747,
6
- "train_steps_per_second": 0.735
7
  }
 
1
  {
2
+ "epoch": 1.46,
3
+ "train_loss": 0.32911812565543436,
4
+ "train_runtime": 761.2747,
5
+ "train_samples_per_second": 15.763,
6
+ "train_steps_per_second": 0.988
7
  }
trainer_state.json CHANGED
@@ -1,1001 +1,535 @@
1
  {
2
- "best_metric": 0.2552177309989929,
3
- "best_model_checkpoint": "./ryan_model3272024/checkpoint-600",
4
- "epoch": 3.8338658146964857,
5
- "eval_steps": 100,
6
- "global_step": 1200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.03,
13
- "grad_norm": 1.4023665189743042,
14
- "learning_rate": 0.00019840255591054313,
15
- "loss": 0.5486,
 
 
 
 
 
 
 
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.06,
20
- "grad_norm": 1.2863692045211792,
21
- "learning_rate": 0.00019680511182108628,
22
- "loss": 0.4543,
 
 
 
 
 
 
 
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.1,
27
- "grad_norm": 0.8842328190803528,
28
- "learning_rate": 0.0001952076677316294,
29
- "loss": 0.4222,
30
- "step": 30
31
  },
32
  {
33
  "epoch": 0.13,
34
- "grad_norm": 0.8728455901145935,
35
- "learning_rate": 0.00019361022364217253,
36
- "loss": 0.3764,
37
- "step": 40
 
 
 
 
38
  },
39
  {
40
  "epoch": 0.16,
41
- "grad_norm": 0.6641435027122498,
42
- "learning_rate": 0.00019201277955271565,
43
- "loss": 0.3214,
44
- "step": 50
45
  },
46
  {
47
  "epoch": 0.19,
48
- "grad_norm": 1.4344050884246826,
49
- "learning_rate": 0.0001904153354632588,
50
- "loss": 0.3286,
51
- "step": 60
52
- },
53
- {
54
- "epoch": 0.22,
55
- "grad_norm": 0.8919397592544556,
56
- "learning_rate": 0.00018881789137380192,
57
- "loss": 0.33,
58
- "step": 70
59
- },
60
- {
61
- "epoch": 0.26,
62
- "grad_norm": 1.7052876949310303,
63
- "learning_rate": 0.00018722044728434505,
64
- "loss": 0.3337,
65
- "step": 80
66
  },
67
  {
68
- "epoch": 0.29,
69
- "grad_norm": 0.4728272259235382,
70
- "learning_rate": 0.0001856230031948882,
71
- "loss": 0.3784,
72
- "step": 90
73
- },
74
- {
75
- "epoch": 0.32,
76
- "grad_norm": 1.1663854122161865,
77
- "learning_rate": 0.00018402555910543132,
78
- "loss": 0.3853,
79
- "step": 100
80
- },
81
- {
82
- "epoch": 0.32,
83
- "eval_loss": 0.3272034823894501,
84
- "eval_na_accuracy": 0.924,
85
- "eval_ordinal_accuracy": 0.52,
86
- "eval_ordinal_mae": 1.210578082634343,
87
- "eval_runtime": 52.9914,
88
- "eval_samples_per_second": 9.435,
89
- "eval_steps_per_second": 1.189,
90
- "step": 100
91
- },
92
- {
93
- "epoch": 0.35,
94
- "grad_norm": 0.8579528331756592,
95
- "learning_rate": 0.00018242811501597444,
96
- "loss": 0.3585,
97
- "step": 110
98
- },
99
- {
100
- "epoch": 0.38,
101
- "grad_norm": 1.02351975440979,
102
- "learning_rate": 0.00018083067092651756,
103
- "loss": 0.3621,
104
- "step": 120
105
- },
106
- {
107
- "epoch": 0.42,
108
- "grad_norm": 1.3286011219024658,
109
- "learning_rate": 0.00017923322683706071,
110
- "loss": 0.3714,
111
- "step": 130
112
- },
113
- {
114
- "epoch": 0.45,
115
- "grad_norm": 0.6290095448493958,
116
- "learning_rate": 0.00017763578274760384,
117
- "loss": 0.3275,
118
- "step": 140
119
- },
120
- {
121
- "epoch": 0.48,
122
- "grad_norm": 1.269338846206665,
123
- "learning_rate": 0.000176038338658147,
124
- "loss": 0.4287,
125
- "step": 150
126
- },
127
- {
128
- "epoch": 0.51,
129
- "grad_norm": 0.6244733333587646,
130
- "learning_rate": 0.0001744408945686901,
131
- "loss": 0.3067,
132
- "step": 160
133
- },
134
- {
135
- "epoch": 0.54,
136
- "grad_norm": 1.1287596225738525,
137
- "learning_rate": 0.00017284345047923323,
138
- "loss": 0.2982,
139
- "step": 170
140
- },
141
- {
142
- "epoch": 0.58,
143
- "grad_norm": 1.436303734779358,
144
- "learning_rate": 0.00017124600638977638,
145
- "loss": 0.2946,
146
- "step": 180
147
- },
148
- {
149
- "epoch": 0.61,
150
- "grad_norm": 0.8159350752830505,
151
- "learning_rate": 0.00016964856230031948,
152
- "loss": 0.3514,
153
- "step": 190
154
- },
155
- {
156
- "epoch": 0.64,
157
- "grad_norm": 0.7363901138305664,
158
- "learning_rate": 0.00016805111821086263,
159
- "loss": 0.3396,
160
- "step": 200
161
- },
162
- {
163
- "epoch": 0.64,
164
- "eval_loss": 0.27412503957748413,
165
- "eval_na_accuracy": 0.94,
166
- "eval_ordinal_accuracy": 0.5644444444444444,
167
- "eval_ordinal_mae": 1.1640199238227473,
168
- "eval_runtime": 21.3186,
169
- "eval_samples_per_second": 23.454,
170
- "eval_steps_per_second": 2.955,
171
- "step": 200
172
- },
173
- {
174
- "epoch": 0.67,
175
- "grad_norm": 0.6321592330932617,
176
- "learning_rate": 0.00016645367412140575,
177
- "loss": 0.3952,
178
- "step": 210
179
- },
180
- {
181
- "epoch": 0.7,
182
- "grad_norm": 0.6153714656829834,
183
- "learning_rate": 0.0001648562300319489,
184
- "loss": 0.2947,
185
- "step": 220
186
- },
187
- {
188
- "epoch": 0.73,
189
- "grad_norm": 1.3031296730041504,
190
- "learning_rate": 0.00016325878594249202,
191
- "loss": 0.3556,
192
- "step": 230
193
- },
194
- {
195
- "epoch": 0.77,
196
- "grad_norm": 1.058060646057129,
197
- "learning_rate": 0.00016166134185303515,
198
- "loss": 0.3432,
199
- "step": 240
200
- },
201
- {
202
- "epoch": 0.8,
203
- "grad_norm": 0.957135796546936,
204
- "learning_rate": 0.0001600638977635783,
205
- "loss": 0.3675,
206
- "step": 250
207
- },
208
- {
209
- "epoch": 0.83,
210
- "grad_norm": 1.6347941160202026,
211
- "learning_rate": 0.00015846645367412142,
212
- "loss": 0.3008,
213
- "step": 260
214
  },
215
  {
216
- "epoch": 0.86,
217
- "grad_norm": 1.1190528869628906,
218
- "learning_rate": 0.00015686900958466454,
219
- "loss": 0.2944,
220
- "step": 270
221
  },
222
  {
223
- "epoch": 0.89,
224
- "grad_norm": 0.8016924858093262,
225
- "learning_rate": 0.00015527156549520767,
226
- "loss": 0.2361,
227
- "step": 280
228
  },
229
  {
230
- "epoch": 0.93,
231
- "grad_norm": 1.3622130155563354,
232
- "learning_rate": 0.00015367412140575082,
233
- "loss": 0.3569,
234
- "step": 290
 
 
 
 
235
  },
236
  {
237
- "epoch": 0.96,
238
- "grad_norm": 0.6603774428367615,
239
- "learning_rate": 0.00015207667731629394,
240
- "loss": 0.2075,
241
- "step": 300
242
  },
243
  {
244
- "epoch": 0.96,
245
- "eval_loss": 0.2772314250469208,
246
- "eval_na_accuracy": 0.946,
247
- "eval_ordinal_accuracy": 0.5933333333333334,
248
- "eval_ordinal_mae": 1.194209214001894,
249
- "eval_runtime": 20.7347,
250
- "eval_samples_per_second": 24.114,
251
- "eval_steps_per_second": 3.038,
252
- "step": 300
253
  },
254
  {
255
- "epoch": 0.99,
256
- "grad_norm": 1.3968242406845093,
257
- "learning_rate": 0.00015047923322683706,
258
- "loss": 0.2232,
259
- "step": 310
260
  },
261
  {
262
- "epoch": 1.02,
263
- "grad_norm": 0.7815521359443665,
264
- "learning_rate": 0.0001488817891373802,
265
  "loss": 0.3132,
266
- "step": 320
267
- },
268
- {
269
- "epoch": 1.05,
270
- "grad_norm": 1.1288195848464966,
271
- "learning_rate": 0.00014728434504792333,
272
- "loss": 0.255,
273
- "step": 330
274
- },
275
- {
276
- "epoch": 1.09,
277
- "grad_norm": 0.7704196572303772,
278
- "learning_rate": 0.00014568690095846646,
279
- "loss": 0.2415,
280
- "step": 340
281
- },
282
- {
283
- "epoch": 1.12,
284
- "grad_norm": 1.9226877689361572,
285
- "learning_rate": 0.00014408945686900958,
286
- "loss": 0.1975,
287
- "step": 350
288
- },
289
- {
290
- "epoch": 1.15,
291
- "grad_norm": 0.5694310069084167,
292
- "learning_rate": 0.00014249201277955273,
293
- "loss": 0.1722,
294
- "step": 360
295
- },
296
- {
297
- "epoch": 1.18,
298
- "grad_norm": 1.719147801399231,
299
- "learning_rate": 0.00014089456869009585,
300
- "loss": 0.2175,
301
- "step": 370
302
- },
303
- {
304
- "epoch": 1.21,
305
- "grad_norm": 0.9247463941574097,
306
- "learning_rate": 0.000139297124600639,
307
- "loss": 0.2088,
308
- "step": 380
309
- },
310
- {
311
- "epoch": 1.25,
312
- "grad_norm": 1.0941154956817627,
313
- "learning_rate": 0.00013769968051118212,
314
- "loss": 0.2854,
315
- "step": 390
316
- },
317
- {
318
- "epoch": 1.28,
319
- "grad_norm": 1.0274015665054321,
320
- "learning_rate": 0.00013610223642172525,
321
- "loss": 0.196,
322
- "step": 400
323
- },
324
- {
325
- "epoch": 1.28,
326
- "eval_loss": 0.273777574300766,
327
- "eval_na_accuracy": 0.95,
328
- "eval_ordinal_accuracy": 0.6133333333333333,
329
- "eval_ordinal_mae": 1.198390154937903,
330
- "eval_runtime": 20.9145,
331
- "eval_samples_per_second": 23.907,
332
- "eval_steps_per_second": 3.012,
333
- "step": 400
334
- },
335
- {
336
- "epoch": 1.31,
337
- "grad_norm": 2.912687063217163,
338
- "learning_rate": 0.00013450479233226837,
339
- "loss": 0.2156,
340
- "step": 410
341
- },
342
- {
343
- "epoch": 1.34,
344
- "grad_norm": 0.6906268000602722,
345
- "learning_rate": 0.0001329073482428115,
346
- "loss": 0.1366,
347
- "step": 420
348
- },
349
- {
350
- "epoch": 1.37,
351
- "grad_norm": 0.43070048093795776,
352
- "learning_rate": 0.00013130990415335464,
353
- "loss": 0.2174,
354
- "step": 430
355
- },
356
- {
357
- "epoch": 1.41,
358
- "grad_norm": 0.5173763632774353,
359
- "learning_rate": 0.00012971246006389777,
360
- "loss": 0.2016,
361
- "step": 440
362
- },
363
- {
364
- "epoch": 1.44,
365
- "grad_norm": 1.04314386844635,
366
- "learning_rate": 0.00012811501597444092,
367
- "loss": 0.2233,
368
- "step": 450
369
- },
370
- {
371
- "epoch": 1.47,
372
- "grad_norm": 0.523073673248291,
373
- "learning_rate": 0.00012651757188498404,
374
- "loss": 0.2231,
375
- "step": 460
376
- },
377
- {
378
- "epoch": 1.5,
379
- "grad_norm": 3.259795904159546,
380
- "learning_rate": 0.00012492012779552716,
381
- "loss": 0.2366,
382
- "step": 470
383
- },
384
- {
385
- "epoch": 1.53,
386
- "grad_norm": 0.6846562027931213,
387
- "learning_rate": 0.00012332268370607028,
388
- "loss": 0.2144,
389
- "step": 480
390
- },
391
- {
392
- "epoch": 1.57,
393
- "grad_norm": 1.2122007608413696,
394
- "learning_rate": 0.00012172523961661342,
395
- "loss": 0.2938,
396
- "step": 490
397
- },
398
- {
399
- "epoch": 1.6,
400
- "grad_norm": 1.3790067434310913,
401
- "learning_rate": 0.00012012779552715656,
402
- "loss": 0.2228,
403
- "step": 500
404
- },
405
- {
406
- "epoch": 1.6,
407
- "eval_loss": 0.26852139830589294,
408
- "eval_na_accuracy": 0.956,
409
- "eval_ordinal_accuracy": 0.62,
410
- "eval_ordinal_mae": 1.1989026491012837,
411
- "eval_runtime": 20.0158,
412
- "eval_samples_per_second": 24.98,
413
- "eval_steps_per_second": 3.148,
414
- "step": 500
415
- },
416
- {
417
- "epoch": 1.63,
418
- "grad_norm": 0.7108421921730042,
419
- "learning_rate": 0.00011853035143769968,
420
- "loss": 0.1916,
421
- "step": 510
422
- },
423
- {
424
- "epoch": 1.66,
425
- "grad_norm": 0.42910462617874146,
426
- "learning_rate": 0.00011693290734824283,
427
- "loss": 0.2478,
428
- "step": 520
429
- },
430
- {
431
- "epoch": 1.69,
432
- "grad_norm": 0.9730465412139893,
433
- "learning_rate": 0.00011533546325878595,
434
- "loss": 0.189,
435
- "step": 530
436
- },
437
- {
438
- "epoch": 1.73,
439
- "grad_norm": 0.9566612243652344,
440
- "learning_rate": 0.00011373801916932908,
441
- "loss": 0.1768,
442
- "step": 540
443
- },
444
- {
445
- "epoch": 1.76,
446
- "grad_norm": 0.5167070627212524,
447
- "learning_rate": 0.00011214057507987221,
448
- "loss": 0.1385,
449
- "step": 550
450
- },
451
- {
452
- "epoch": 1.79,
453
- "grad_norm": 0.5880122780799866,
454
- "learning_rate": 0.00011054313099041533,
455
- "loss": 0.1262,
456
- "step": 560
457
- },
458
- {
459
- "epoch": 1.82,
460
- "grad_norm": 1.202286720275879,
461
- "learning_rate": 0.00010894568690095847,
462
- "loss": 0.1721,
463
- "step": 570
464
- },
465
- {
466
- "epoch": 1.85,
467
- "grad_norm": 2.6997601985931396,
468
- "learning_rate": 0.0001073482428115016,
469
- "loss": 0.2128,
470
- "step": 580
471
- },
472
- {
473
- "epoch": 1.88,
474
- "grad_norm": 1.1591830253601074,
475
- "learning_rate": 0.00010575079872204474,
476
- "loss": 0.2402,
477
- "step": 590
478
- },
479
- {
480
- "epoch": 1.92,
481
- "grad_norm": 0.5840221643447876,
482
- "learning_rate": 0.00010415335463258787,
483
- "loss": 0.1816,
484
- "step": 600
485
- },
486
- {
487
- "epoch": 1.92,
488
- "eval_loss": 0.2552177309989929,
489
- "eval_na_accuracy": 0.95,
490
- "eval_ordinal_accuracy": 0.6266666666666667,
491
- "eval_ordinal_mae": 1.158560517811113,
492
- "eval_runtime": 19.5011,
493
- "eval_samples_per_second": 25.64,
494
- "eval_steps_per_second": 3.231,
495
- "step": 600
496
- },
497
- {
498
- "epoch": 1.95,
499
- "grad_norm": 0.7560299634933472,
500
- "learning_rate": 0.000102555910543131,
501
- "loss": 0.2021,
502
- "step": 610
503
- },
504
- {
505
- "epoch": 1.98,
506
- "grad_norm": 1.8860361576080322,
507
- "learning_rate": 0.00010095846645367413,
508
- "loss": 0.2092,
509
- "step": 620
510
- },
511
- {
512
- "epoch": 2.01,
513
- "grad_norm": 0.7235255837440491,
514
- "learning_rate": 9.936102236421726e-05,
515
- "loss": 0.1131,
516
- "step": 630
517
- },
518
- {
519
- "epoch": 2.04,
520
- "grad_norm": 0.3656529486179352,
521
- "learning_rate": 9.77635782747604e-05,
522
- "loss": 0.0867,
523
- "step": 640
524
- },
525
- {
526
- "epoch": 2.08,
527
- "grad_norm": 0.3450271785259247,
528
- "learning_rate": 9.616613418530351e-05,
529
- "loss": 0.0903,
530
- "step": 650
531
- },
532
- {
533
- "epoch": 2.11,
534
- "grad_norm": 1.0603750944137573,
535
- "learning_rate": 9.456869009584664e-05,
536
- "loss": 0.1234,
537
- "step": 660
538
- },
539
- {
540
- "epoch": 2.14,
541
- "grad_norm": 0.6790297031402588,
542
- "learning_rate": 9.297124600638978e-05,
543
- "loss": 0.0936,
544
- "step": 670
545
- },
546
- {
547
- "epoch": 2.17,
548
- "grad_norm": 0.5596363544464111,
549
- "learning_rate": 9.137380191693292e-05,
550
- "loss": 0.0651,
551
- "step": 680
552
- },
553
- {
554
- "epoch": 2.2,
555
- "grad_norm": 0.5989049673080444,
556
- "learning_rate": 8.977635782747604e-05,
557
- "loss": 0.1218,
558
- "step": 690
559
- },
560
- {
561
- "epoch": 2.24,
562
- "grad_norm": 0.9003208875656128,
563
- "learning_rate": 8.817891373801918e-05,
564
- "loss": 0.0682,
565
- "step": 700
566
- },
567
- {
568
- "epoch": 2.24,
569
- "eval_loss": 0.27212005853652954,
570
- "eval_na_accuracy": 0.952,
571
- "eval_ordinal_accuracy": 0.6577777777777778,
572
- "eval_ordinal_mae": 1.1557789803379113,
573
- "eval_runtime": 19.5966,
574
- "eval_samples_per_second": 25.515,
575
- "eval_steps_per_second": 3.215,
576
- "step": 700
577
- },
578
- {
579
- "epoch": 2.27,
580
- "grad_norm": 0.6663013100624084,
581
- "learning_rate": 8.658146964856231e-05,
582
- "loss": 0.0714,
583
- "step": 710
584
- },
585
- {
586
- "epoch": 2.3,
587
- "grad_norm": 1.0458776950836182,
588
- "learning_rate": 8.498402555910544e-05,
589
- "loss": 0.102,
590
- "step": 720
591
- },
592
- {
593
- "epoch": 2.33,
594
- "grad_norm": 0.9246501922607422,
595
- "learning_rate": 8.338658146964856e-05,
596
- "loss": 0.1623,
597
- "step": 730
598
- },
599
- {
600
- "epoch": 2.36,
601
- "grad_norm": 1.0837684869766235,
602
- "learning_rate": 8.17891373801917e-05,
603
- "loss": 0.0934,
604
- "step": 740
605
- },
606
- {
607
- "epoch": 2.4,
608
- "grad_norm": 0.564241349697113,
609
- "learning_rate": 8.019169329073483e-05,
610
- "loss": 0.0853,
611
- "step": 750
612
  },
613
  {
614
- "epoch": 2.43,
615
- "grad_norm": 4.335838794708252,
616
- "learning_rate": 7.859424920127795e-05,
617
- "loss": 0.1246,
618
- "step": 760
619
  },
620
  {
621
- "epoch": 2.46,
622
- "grad_norm": 0.957082211971283,
623
- "learning_rate": 7.699680511182109e-05,
624
- "loss": 0.1292,
625
- "step": 770
 
 
 
 
626
  },
627
  {
628
- "epoch": 2.49,
629
- "grad_norm": 0.9633702039718628,
630
- "learning_rate": 7.539936102236423e-05,
631
- "loss": 0.1916,
632
- "step": 780
633
  },
634
  {
635
- "epoch": 2.52,
636
- "grad_norm": 0.7254676222801208,
637
- "learning_rate": 7.380191693290735e-05,
638
- "loss": 0.1054,
639
- "step": 790
640
  },
641
  {
642
- "epoch": 2.56,
643
- "grad_norm": 0.5885197520256042,
644
- "learning_rate": 7.220447284345049e-05,
645
- "loss": 0.0795,
646
- "step": 800
647
  },
648
  {
649
- "epoch": 2.56,
650
- "eval_loss": 0.2753521502017975,
651
- "eval_na_accuracy": 0.948,
652
- "eval_ordinal_accuracy": 0.6333333333333333,
653
- "eval_ordinal_mae": 1.1599188842872779,
654
- "eval_runtime": 20.0506,
655
- "eval_samples_per_second": 24.937,
656
- "eval_steps_per_second": 3.142,
657
- "step": 800
658
  },
659
  {
660
- "epoch": 2.59,
661
- "grad_norm": 0.5671622157096863,
662
- "learning_rate": 7.060702875399361e-05,
663
- "loss": 0.0948,
664
- "step": 810
665
  },
666
  {
667
- "epoch": 2.62,
668
- "grad_norm": 0.9914100766181946,
669
- "learning_rate": 6.900958466453674e-05,
670
- "loss": 0.0715,
671
- "step": 820
 
 
 
 
672
  },
673
  {
674
- "epoch": 2.65,
675
- "grad_norm": 0.4819205105304718,
676
- "learning_rate": 6.741214057507987e-05,
677
- "loss": 0.0839,
678
- "step": 830
679
  },
680
  {
681
- "epoch": 2.68,
682
- "grad_norm": 0.3811684250831604,
683
- "learning_rate": 6.5814696485623e-05,
684
- "loss": 0.0825,
685
- "step": 840
686
  },
687
  {
688
- "epoch": 2.72,
689
- "grad_norm": 0.9750994443893433,
690
- "learning_rate": 6.421725239616614e-05,
691
- "loss": 0.0968,
692
- "step": 850
693
  },
694
  {
695
- "epoch": 2.75,
696
- "grad_norm": 0.35765138268470764,
697
- "learning_rate": 6.261980830670928e-05,
698
- "loss": 0.1605,
699
- "step": 860
700
  },
701
  {
702
- "epoch": 2.78,
703
- "grad_norm": 0.3497343361377716,
704
- "learning_rate": 6.1022364217252406e-05,
705
- "loss": 0.0933,
706
- "step": 870
707
  },
708
  {
709
- "epoch": 2.81,
710
- "grad_norm": 0.4838835299015045,
711
- "learning_rate": 5.942492012779552e-05,
712
- "loss": 0.0859,
713
- "step": 880
 
 
 
 
714
  },
715
  {
716
- "epoch": 2.84,
717
- "grad_norm": 0.7002846002578735,
718
- "learning_rate": 5.782747603833866e-05,
719
- "loss": 0.1021,
720
- "step": 890
721
  },
722
  {
723
- "epoch": 2.88,
724
- "grad_norm": 2.312203884124756,
725
- "learning_rate": 5.623003194888179e-05,
726
- "loss": 0.1367,
727
- "step": 900
728
  },
729
  {
730
- "epoch": 2.88,
731
- "eval_loss": 0.29526129364967346,
732
- "eval_na_accuracy": 0.946,
733
- "eval_ordinal_accuracy": 0.64,
734
- "eval_ordinal_mae": 1.166716830432415,
735
- "eval_runtime": 20.0091,
736
- "eval_samples_per_second": 24.989,
737
- "eval_steps_per_second": 3.149,
738
- "step": 900
739
  },
740
  {
741
- "epoch": 2.91,
742
- "grad_norm": 0.44126951694488525,
743
- "learning_rate": 5.4632587859424925e-05,
744
- "loss": 0.0854,
745
- "step": 910
746
  },
747
  {
748
- "epoch": 2.94,
749
- "grad_norm": 1.0075191259384155,
750
- "learning_rate": 5.3035143769968054e-05,
751
- "loss": 0.0823,
752
- "step": 920
753
  },
754
  {
755
- "epoch": 2.97,
756
- "grad_norm": 0.9991279244422913,
757
- "learning_rate": 5.1437699680511184e-05,
758
- "loss": 0.1156,
759
- "step": 930
 
 
 
 
760
  },
761
  {
762
- "epoch": 3.0,
763
- "grad_norm": 0.8888081312179565,
764
- "learning_rate": 4.984025559105431e-05,
765
- "loss": 0.0876,
766
- "step": 940
767
  },
768
  {
769
- "epoch": 3.04,
770
- "grad_norm": 0.3761376738548279,
771
- "learning_rate": 4.824281150159744e-05,
772
- "loss": 0.0452,
773
- "step": 950
774
  },
775
  {
776
- "epoch": 3.07,
777
- "grad_norm": 0.365622341632843,
778
- "learning_rate": 4.664536741214058e-05,
779
- "loss": 0.0428,
780
- "step": 960
781
  },
782
  {
783
- "epoch": 3.1,
784
- "grad_norm": 0.35657036304473877,
785
- "learning_rate": 4.504792332268371e-05,
786
- "loss": 0.033,
787
- "step": 970
788
  },
789
  {
790
- "epoch": 3.13,
791
- "grad_norm": 0.5636401176452637,
792
- "learning_rate": 4.345047923322684e-05,
793
- "loss": 0.0356,
794
- "step": 980
795
  },
796
  {
797
- "epoch": 3.16,
798
- "grad_norm": 0.431383341550827,
799
- "learning_rate": 4.185303514376997e-05,
800
- "loss": 0.0463,
801
- "step": 990
 
 
 
 
802
  },
803
  {
804
- "epoch": 3.19,
805
- "grad_norm": 0.583328127861023,
806
- "learning_rate": 4.0255591054313104e-05,
807
- "loss": 0.0387,
808
- "step": 1000
809
  },
810
  {
811
- "epoch": 3.19,
812
- "eval_loss": 0.2923290431499481,
813
- "eval_na_accuracy": 0.944,
814
- "eval_ordinal_accuracy": 0.6377777777777778,
815
- "eval_ordinal_mae": 1.2024743282463815,
816
- "eval_runtime": 19.3226,
817
- "eval_samples_per_second": 25.876,
818
- "eval_steps_per_second": 3.26,
819
- "step": 1000
820
  },
821
  {
822
- "epoch": 3.23,
823
- "grad_norm": 2.440162420272827,
824
- "learning_rate": 3.8658146964856234e-05,
825
- "loss": 0.0607,
826
- "step": 1010
827
  },
828
  {
829
- "epoch": 3.26,
830
- "grad_norm": 0.29546236991882324,
831
- "learning_rate": 3.7060702875399364e-05,
832
- "loss": 0.0515,
833
- "step": 1020
834
  },
835
  {
836
- "epoch": 3.29,
837
- "grad_norm": 0.44689303636550903,
838
- "learning_rate": 3.546325878594249e-05,
839
- "loss": 0.0273,
840
- "step": 1030
841
  },
842
  {
843
- "epoch": 3.32,
844
- "grad_norm": 0.3288978040218353,
845
- "learning_rate": 3.386581469648562e-05,
846
- "loss": 0.0352,
847
- "step": 1040
 
 
 
 
848
  },
849
  {
850
- "epoch": 3.35,
851
- "grad_norm": 0.41706767678260803,
852
- "learning_rate": 3.226837060702875e-05,
853
- "loss": 0.0345,
854
- "step": 1050
855
  },
856
  {
857
- "epoch": 3.39,
858
- "grad_norm": 0.31060507893562317,
859
- "learning_rate": 3.067092651757188e-05,
860
- "loss": 0.0294,
861
- "step": 1060
862
  },
863
  {
864
- "epoch": 3.42,
865
- "grad_norm": 0.2541821599006653,
866
- "learning_rate": 2.907348242811502e-05,
867
- "loss": 0.0354,
868
- "step": 1070
869
  },
870
  {
871
- "epoch": 3.45,
872
- "grad_norm": 0.574343740940094,
873
- "learning_rate": 2.747603833865815e-05,
874
- "loss": 0.0443,
875
- "step": 1080
876
  },
877
  {
878
- "epoch": 3.48,
879
- "grad_norm": 0.47532182931900024,
880
- "learning_rate": 2.5878594249201278e-05,
881
- "loss": 0.0605,
882
- "step": 1090
883
  },
884
  {
885
- "epoch": 3.51,
886
- "grad_norm": 0.45276594161987305,
887
- "learning_rate": 2.428115015974441e-05,
888
- "loss": 0.0293,
889
- "step": 1100
 
 
 
 
890
  },
891
  {
892
- "epoch": 3.51,
893
- "eval_loss": 0.2884800434112549,
894
- "eval_na_accuracy": 0.948,
895
- "eval_ordinal_accuracy": 0.6644444444444444,
896
- "eval_ordinal_mae": 1.1666180535654227,
897
- "eval_runtime": 19.9365,
898
- "eval_samples_per_second": 25.08,
899
- "eval_steps_per_second": 3.16,
900
- "step": 1100
901
  },
902
  {
903
- "epoch": 3.55,
904
- "grad_norm": 0.655549168586731,
905
- "learning_rate": 2.268370607028754e-05,
906
- "loss": 0.034,
907
- "step": 1110
908
  },
909
  {
910
- "epoch": 3.58,
911
- "grad_norm": 0.16610193252563477,
912
- "learning_rate": 2.108626198083067e-05,
913
- "loss": 0.0319,
914
- "step": 1120
915
  },
916
  {
917
- "epoch": 3.61,
918
- "grad_norm": 0.26889652013778687,
919
- "learning_rate": 1.9488817891373803e-05,
920
- "loss": 0.0479,
921
- "step": 1130
922
  },
923
  {
924
- "epoch": 3.64,
925
- "grad_norm": 0.2418793886899948,
926
- "learning_rate": 1.7891373801916932e-05,
927
- "loss": 0.0322,
928
- "step": 1140
929
  },
930
  {
931
- "epoch": 3.67,
932
- "grad_norm": 0.5379694104194641,
933
- "learning_rate": 1.6293929712460065e-05,
934
- "loss": 0.0393,
935
- "step": 1150
 
 
 
 
936
  },
937
  {
938
- "epoch": 3.71,
939
- "grad_norm": 0.19815516471862793,
940
- "learning_rate": 1.4696485623003195e-05,
941
- "loss": 0.0217,
942
- "step": 1160
943
  },
944
  {
945
- "epoch": 3.74,
946
- "grad_norm": 0.889312207698822,
947
- "learning_rate": 1.3099041533546328e-05,
948
- "loss": 0.0332,
949
- "step": 1170
950
  },
951
  {
952
- "epoch": 3.77,
953
- "grad_norm": 0.2865816652774811,
954
- "learning_rate": 1.1501597444089457e-05,
955
- "loss": 0.0313,
956
- "step": 1180
957
  },
958
  {
959
- "epoch": 3.8,
960
- "grad_norm": 0.5947129726409912,
961
- "learning_rate": 9.904153354632589e-06,
962
- "loss": 0.034,
963
- "step": 1190
964
  },
965
  {
966
- "epoch": 3.83,
967
- "grad_norm": 0.44885268807411194,
968
- "learning_rate": 8.306709265175718e-06,
969
- "loss": 0.0286,
970
- "step": 1200
971
  },
972
  {
973
- "epoch": 3.83,
974
- "eval_loss": 0.28681233525276184,
975
- "eval_na_accuracy": 0.95,
976
- "eval_ordinal_accuracy": 0.6711111111111111,
977
- "eval_ordinal_mae": 1.1625636271304554,
978
- "eval_runtime": 19.7259,
979
- "eval_samples_per_second": 25.347,
980
- "eval_steps_per_second": 3.194,
981
- "step": 1200
982
  },
983
  {
984
- "epoch": 3.83,
985
- "step": 1200,
986
- "total_flos": 1.4860396665534874e+18,
987
- "train_loss": 0.17935538868109385,
988
- "train_runtime": 1702.5744,
989
- "train_samples_per_second": 11.747,
990
- "train_steps_per_second": 0.735
991
  }
992
  ],
993
- "logging_steps": 10,
994
- "max_steps": 1252,
995
  "num_input_tokens_seen": 0,
996
  "num_train_epochs": 4,
997
- "save_steps": 100,
998
- "total_flos": 1.4860396665534874e+18,
999
  "train_batch_size": 16,
1000
  "trial_name": null,
1001
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.3037484288215637,
3
+ "best_model_checkpoint": "./ryan_model3272024/checkpoint-250",
4
+ "epoch": 1.4627659574468086,
5
+ "eval_steps": 25,
6
+ "global_step": 275,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.03,
13
+ "grad_norm": 0.5357832312583923,
14
+ "learning_rate": 0.00019867021276595746,
15
+ "loss": 0.4838,
16
+ "step": 5
17
+ },
18
+ {
19
+ "epoch": 0.05,
20
+ "grad_norm": 0.3301275372505188,
21
+ "learning_rate": 0.00019734042553191489,
22
+ "loss": 0.3457,
23
  "step": 10
24
  },
25
  {
26
+ "epoch": 0.08,
27
+ "grad_norm": 0.829723060131073,
28
+ "learning_rate": 0.00019601063829787234,
29
+ "loss": 0.4508,
30
+ "step": 15
31
+ },
32
+ {
33
+ "epoch": 0.11,
34
+ "grad_norm": 0.43481916189193726,
35
+ "learning_rate": 0.00019468085106382982,
36
+ "loss": 0.3205,
37
  "step": 20
38
  },
39
  {
40
+ "epoch": 0.13,
41
+ "grad_norm": 0.9304245114326477,
42
+ "learning_rate": 0.00019335106382978724,
43
+ "loss": 0.4062,
44
+ "step": 25
45
  },
46
  {
47
  "epoch": 0.13,
48
+ "eval_loss": 0.3798711597919464,
49
+ "eval_na_accuracy": 0.6216216087341309,
50
+ "eval_ordinal_accuracy": 0.23954372107982635,
51
+ "eval_ordinal_mae": 0.9244347810745239,
52
+ "eval_runtime": 31.9197,
53
+ "eval_samples_per_second": 9.399,
54
+ "eval_steps_per_second": 1.19,
55
+ "step": 25
56
  },
57
  {
58
  "epoch": 0.16,
59
+ "grad_norm": 0.8552330136299133,
60
+ "learning_rate": 0.0001920212765957447,
61
+ "loss": 0.3029,
62
+ "step": 30
63
  },
64
  {
65
  "epoch": 0.19,
66
+ "grad_norm": 0.43845850229263306,
67
+ "learning_rate": 0.00019069148936170214,
68
+ "loss": 0.3261,
69
+ "step": 35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  },
71
  {
72
+ "epoch": 0.21,
73
+ "grad_norm": 0.18216854333877563,
74
+ "learning_rate": 0.00018936170212765957,
75
+ "loss": 0.2895,
76
+ "step": 40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  },
78
  {
79
+ "epoch": 0.24,
80
+ "grad_norm": 0.7537259459495544,
81
+ "learning_rate": 0.00018829787234042554,
82
+ "loss": 0.4943,
83
+ "step": 45
84
  },
85
  {
86
+ "epoch": 0.27,
87
+ "grad_norm": 0.4183604419231415,
88
+ "learning_rate": 0.000186968085106383,
89
+ "loss": 0.3536,
90
+ "step": 50
91
  },
92
  {
93
+ "epoch": 0.27,
94
+ "eval_loss": 0.3699643015861511,
95
+ "eval_na_accuracy": 0.6756756901741028,
96
+ "eval_ordinal_accuracy": 0.38403043150901794,
97
+ "eval_ordinal_mae": 0.9066693782806396,
98
+ "eval_runtime": 12.0769,
99
+ "eval_samples_per_second": 24.841,
100
+ "eval_steps_per_second": 3.147,
101
+ "step": 50
102
  },
103
  {
104
+ "epoch": 0.29,
105
+ "grad_norm": 0.8079000115394592,
106
+ "learning_rate": 0.00018563829787234044,
107
+ "loss": 0.3601,
108
+ "step": 55
109
  },
110
  {
111
+ "epoch": 0.32,
112
+ "grad_norm": 0.9893763065338135,
113
+ "learning_rate": 0.0001843085106382979,
114
+ "loss": 0.4152,
115
+ "step": 60
 
 
 
 
116
  },
117
  {
118
+ "epoch": 0.35,
119
+ "grad_norm": 1.3993595838546753,
120
+ "learning_rate": 0.00018297872340425532,
121
+ "loss": 0.448,
122
+ "step": 65
123
  },
124
  {
125
+ "epoch": 0.37,
126
+ "grad_norm": 0.45827603340148926,
127
+ "learning_rate": 0.00018164893617021277,
128
  "loss": 0.3132,
129
+ "step": 70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  },
131
  {
132
+ "epoch": 0.4,
133
+ "grad_norm": 1.4677202701568604,
134
+ "learning_rate": 0.00018031914893617022,
135
+ "loss": 0.4295,
136
+ "step": 75
137
  },
138
  {
139
+ "epoch": 0.4,
140
+ "eval_loss": 0.34051504731178284,
141
+ "eval_na_accuracy": 0.7837837934494019,
142
+ "eval_ordinal_accuracy": 0.29657796025276184,
143
+ "eval_ordinal_mae": 0.8797782063484192,
144
+ "eval_runtime": 11.5094,
145
+ "eval_samples_per_second": 26.066,
146
+ "eval_steps_per_second": 3.302,
147
+ "step": 75
148
  },
149
  {
150
+ "epoch": 0.43,
151
+ "grad_norm": 1.1136772632598877,
152
+ "learning_rate": 0.00017898936170212767,
153
+ "loss": 0.2444,
154
+ "step": 80
155
  },
156
  {
157
+ "epoch": 0.45,
158
+ "grad_norm": 3.5127644538879395,
159
+ "learning_rate": 0.00017765957446808512,
160
+ "loss": 0.4542,
161
+ "step": 85
162
  },
163
  {
164
+ "epoch": 0.48,
165
+ "grad_norm": 0.6393898725509644,
166
+ "learning_rate": 0.00017632978723404257,
167
+ "loss": 0.3411,
168
+ "step": 90
169
  },
170
  {
171
+ "epoch": 0.51,
172
+ "grad_norm": 0.8198608160018921,
173
+ "learning_rate": 0.000175,
174
+ "loss": 0.3184,
175
+ "step": 95
 
 
 
 
176
  },
177
  {
178
+ "epoch": 0.53,
179
+ "grad_norm": 1.1194807291030884,
180
+ "learning_rate": 0.00017367021276595745,
181
+ "loss": 0.4114,
182
+ "step": 100
183
  },
184
  {
185
+ "epoch": 0.53,
186
+ "eval_loss": 0.39057785272598267,
187
+ "eval_na_accuracy": 0.7297297120094299,
188
+ "eval_ordinal_accuracy": 0.35361215472221375,
189
+ "eval_ordinal_mae": 0.8806185126304626,
190
+ "eval_runtime": 12.2664,
191
+ "eval_samples_per_second": 24.457,
192
+ "eval_steps_per_second": 3.098,
193
+ "step": 100
194
  },
195
  {
196
+ "epoch": 0.56,
197
+ "grad_norm": 3.365068197250366,
198
+ "learning_rate": 0.0001723404255319149,
199
+ "loss": 0.469,
200
+ "step": 105
201
  },
202
  {
203
+ "epoch": 0.59,
204
+ "grad_norm": 0.6197894215583801,
205
+ "learning_rate": 0.00017101063829787233,
206
+ "loss": 0.2738,
207
+ "step": 110
208
  },
209
  {
210
+ "epoch": 0.61,
211
+ "grad_norm": 0.5341880917549133,
212
+ "learning_rate": 0.0001696808510638298,
213
+ "loss": 0.2867,
214
+ "step": 115
215
  },
216
  {
217
+ "epoch": 0.64,
218
+ "grad_norm": 1.0934885740280151,
219
+ "learning_rate": 0.00016835106382978726,
220
+ "loss": 0.4105,
221
+ "step": 120
222
  },
223
  {
224
+ "epoch": 0.66,
225
+ "grad_norm": 0.5764520764350891,
226
+ "learning_rate": 0.00016702127659574468,
227
+ "loss": 0.3521,
228
+ "step": 125
229
  },
230
  {
231
+ "epoch": 0.66,
232
+ "eval_loss": 0.35300251841545105,
233
+ "eval_na_accuracy": 0.8108108043670654,
234
+ "eval_ordinal_accuracy": 0.42585551738739014,
235
+ "eval_ordinal_mae": 0.8441764116287231,
236
+ "eval_runtime": 12.3015,
237
+ "eval_samples_per_second": 24.387,
238
+ "eval_steps_per_second": 3.089,
239
+ "step": 125
240
  },
241
  {
242
+ "epoch": 0.69,
243
+ "grad_norm": 0.2090584933757782,
244
+ "learning_rate": 0.00016569148936170213,
245
+ "loss": 0.294,
246
+ "step": 130
247
  },
248
  {
249
+ "epoch": 0.72,
250
+ "grad_norm": 0.2995198369026184,
251
+ "learning_rate": 0.00016436170212765958,
252
+ "loss": 0.3099,
253
+ "step": 135
254
  },
255
  {
256
+ "epoch": 0.74,
257
+ "grad_norm": 0.41820451617240906,
258
+ "learning_rate": 0.00016303191489361703,
259
+ "loss": 0.4392,
260
+ "step": 140
 
 
 
 
261
  },
262
  {
263
+ "epoch": 0.77,
264
+ "grad_norm": 1.1886084079742432,
265
+ "learning_rate": 0.00016170212765957446,
266
+ "loss": 0.3725,
267
+ "step": 145
268
  },
269
  {
270
+ "epoch": 0.8,
271
+ "grad_norm": 0.8490511178970337,
272
+ "learning_rate": 0.00016037234042553194,
273
+ "loss": 0.3349,
274
+ "step": 150
275
  },
276
  {
277
+ "epoch": 0.8,
278
+ "eval_loss": 0.34123122692108154,
279
+ "eval_na_accuracy": 0.7297297120094299,
280
+ "eval_ordinal_accuracy": 0.4752851724624634,
281
+ "eval_ordinal_mae": 0.8015652298927307,
282
+ "eval_runtime": 12.509,
283
+ "eval_samples_per_second": 23.983,
284
+ "eval_steps_per_second": 3.038,
285
+ "step": 150
286
  },
287
  {
288
+ "epoch": 0.82,
289
+ "grad_norm": 0.4098907709121704,
290
+ "learning_rate": 0.00015904255319148936,
291
+ "loss": 0.2306,
292
+ "step": 155
293
  },
294
  {
295
+ "epoch": 0.85,
296
+ "grad_norm": 1.5454349517822266,
297
+ "learning_rate": 0.0001577127659574468,
298
+ "loss": 0.2382,
299
+ "step": 160
300
  },
301
  {
302
+ "epoch": 0.88,
303
+ "grad_norm": 0.61043381690979,
304
+ "learning_rate": 0.00015638297872340426,
305
+ "loss": 0.3448,
306
+ "step": 165
307
  },
308
  {
309
+ "epoch": 0.9,
310
+ "grad_norm": 0.7741652727127075,
311
+ "learning_rate": 0.00015505319148936171,
312
+ "loss": 0.2037,
313
+ "step": 170
314
  },
315
  {
316
+ "epoch": 0.93,
317
+ "grad_norm": 0.5108156204223633,
318
+ "learning_rate": 0.00015372340425531914,
319
+ "loss": 0.4612,
320
+ "step": 175
321
  },
322
  {
323
+ "epoch": 0.93,
324
+ "eval_loss": 0.36386463046073914,
325
+ "eval_na_accuracy": 0.5405405163764954,
326
+ "eval_ordinal_accuracy": 0.46768060326576233,
327
+ "eval_ordinal_mae": 0.7603853940963745,
328
+ "eval_runtime": 12.3807,
329
+ "eval_samples_per_second": 24.231,
330
+ "eval_steps_per_second": 3.069,
331
+ "step": 175
332
  },
333
  {
334
+ "epoch": 0.96,
335
+ "grad_norm": 0.8889250159263611,
336
+ "learning_rate": 0.00015239361702127662,
337
+ "loss": 0.4401,
338
+ "step": 180
339
  },
340
  {
341
+ "epoch": 0.98,
342
+ "grad_norm": 1.047706127166748,
343
+ "learning_rate": 0.00015106382978723407,
344
+ "loss": 0.2771,
345
+ "step": 185
 
 
 
 
346
  },
347
  {
348
+ "epoch": 1.01,
349
+ "grad_norm": 0.6727350354194641,
350
+ "learning_rate": 0.0001497340425531915,
351
+ "loss": 0.3726,
352
+ "step": 190
353
  },
354
  {
355
+ "epoch": 1.04,
356
+ "grad_norm": 0.4043642282485962,
357
+ "learning_rate": 0.00014840425531914894,
358
+ "loss": 0.3504,
359
+ "step": 195
360
  },
361
  {
362
+ "epoch": 1.06,
363
+ "grad_norm": 1.089250922203064,
364
+ "learning_rate": 0.0001470744680851064,
365
+ "loss": 0.2424,
366
+ "step": 200
367
  },
368
  {
369
+ "epoch": 1.06,
370
+ "eval_loss": 0.32972484827041626,
371
+ "eval_na_accuracy": 0.7027027010917664,
372
+ "eval_ordinal_accuracy": 0.48669201135635376,
373
+ "eval_ordinal_mae": 0.7117426991462708,
374
+ "eval_runtime": 11.8333,
375
+ "eval_samples_per_second": 25.352,
376
+ "eval_steps_per_second": 3.211,
377
+ "step": 200
378
  },
379
  {
380
+ "epoch": 1.09,
381
+ "grad_norm": 0.6453298926353455,
382
+ "learning_rate": 0.00014574468085106382,
383
+ "loss": 0.2115,
384
+ "step": 205
385
  },
386
  {
387
+ "epoch": 1.12,
388
+ "grad_norm": 0.7021524310112,
389
+ "learning_rate": 0.00014441489361702127,
390
+ "loss": 0.2281,
391
+ "step": 210
392
  },
393
  {
394
+ "epoch": 1.14,
395
+ "grad_norm": 0.7665510773658752,
396
+ "learning_rate": 0.00014308510638297875,
397
+ "loss": 0.2048,
398
+ "step": 215
399
  },
400
  {
401
+ "epoch": 1.17,
402
+ "grad_norm": 1.2339574098587036,
403
+ "learning_rate": 0.00014175531914893617,
404
+ "loss": 0.2344,
405
+ "step": 220
406
  },
407
  {
408
+ "epoch": 1.2,
409
+ "grad_norm": 2.540107011795044,
410
+ "learning_rate": 0.00014042553191489363,
411
+ "loss": 0.2928,
412
+ "step": 225
413
  },
414
  {
415
+ "epoch": 1.2,
416
+ "eval_loss": 0.3493916690349579,
417
+ "eval_na_accuracy": 0.6756756901741028,
418
+ "eval_ordinal_accuracy": 0.5285171270370483,
419
+ "eval_ordinal_mae": 0.6955077052116394,
420
+ "eval_runtime": 10.8667,
421
+ "eval_samples_per_second": 27.607,
422
+ "eval_steps_per_second": 3.497,
423
+ "step": 225
424
  },
425
  {
426
+ "epoch": 1.22,
427
+ "grad_norm": 0.7262032628059387,
428
+ "learning_rate": 0.00013909574468085108,
429
+ "loss": 0.3973,
430
+ "step": 230
 
 
 
 
431
  },
432
  {
433
+ "epoch": 1.25,
434
+ "grad_norm": 0.28402724862098694,
435
+ "learning_rate": 0.0001377659574468085,
436
+ "loss": 0.218,
437
+ "step": 235
438
  },
439
  {
440
+ "epoch": 1.28,
441
+ "grad_norm": 1.5170676708221436,
442
+ "learning_rate": 0.00013643617021276595,
443
+ "loss": 0.227,
444
+ "step": 240
445
  },
446
  {
447
+ "epoch": 1.3,
448
+ "grad_norm": 0.35739636421203613,
449
+ "learning_rate": 0.0001351063829787234,
450
+ "loss": 0.2275,
451
+ "step": 245
452
  },
453
  {
454
+ "epoch": 1.33,
455
+ "grad_norm": 0.5471745133399963,
456
+ "learning_rate": 0.00013377659574468086,
457
+ "loss": 0.2436,
458
+ "step": 250
459
  },
460
  {
461
+ "epoch": 1.33,
462
+ "eval_loss": 0.3037484288215637,
463
+ "eval_na_accuracy": 0.7297297120094299,
464
+ "eval_ordinal_accuracy": 0.5285171270370483,
465
+ "eval_ordinal_mae": 0.6723113059997559,
466
+ "eval_runtime": 12.062,
467
+ "eval_samples_per_second": 24.871,
468
+ "eval_steps_per_second": 3.15,
469
+ "step": 250
470
  },
471
  {
472
+ "epoch": 1.36,
473
+ "grad_norm": 0.41173043847084045,
474
+ "learning_rate": 0.0001324468085106383,
475
+ "loss": 0.2408,
476
+ "step": 255
477
  },
478
  {
479
+ "epoch": 1.38,
480
+ "grad_norm": 0.8529615998268127,
481
+ "learning_rate": 0.00013111702127659576,
482
+ "loss": 0.2935,
483
+ "step": 260
484
  },
485
  {
486
+ "epoch": 1.41,
487
+ "grad_norm": 1.3896653652191162,
488
+ "learning_rate": 0.00012978723404255318,
489
+ "loss": 0.2547,
490
+ "step": 265
491
  },
492
  {
493
+ "epoch": 1.44,
494
+ "grad_norm": 0.30819597840309143,
495
+ "learning_rate": 0.00012845744680851063,
496
+ "loss": 0.3372,
497
+ "step": 270
498
  },
499
  {
500
+ "epoch": 1.46,
501
+ "grad_norm": 2.0342652797698975,
502
+ "learning_rate": 0.00012712765957446809,
503
+ "loss": 0.2776,
504
+ "step": 275
505
  },
506
  {
507
+ "epoch": 1.46,
508
+ "eval_loss": 0.3365646004676819,
509
+ "eval_na_accuracy": 0.5945945978164673,
510
+ "eval_ordinal_accuracy": 0.517110288143158,
511
+ "eval_ordinal_mae": 0.672748863697052,
512
+ "eval_runtime": 11.4536,
513
+ "eval_samples_per_second": 26.193,
514
+ "eval_steps_per_second": 3.318,
515
+ "step": 275
516
  },
517
  {
518
+ "epoch": 1.46,
519
+ "step": 275,
520
+ "total_flos": 3.403570199991091e+17,
521
+ "train_loss": 0.32911812565543436,
522
+ "train_runtime": 761.2747,
523
+ "train_samples_per_second": 15.763,
524
+ "train_steps_per_second": 0.988
525
  }
526
  ],
527
+ "logging_steps": 5,
528
+ "max_steps": 752,
529
  "num_input_tokens_seen": 0,
530
  "num_train_epochs": 4,
531
+ "save_steps": 25,
532
+ "total_flos": 3.403570199991091e+17,
533
  "train_batch_size": 16,
534
  "trial_name": null,
535
  "trial_params": null