d071696 committed on
Commit
b22dd59
1 Parent(s): 212bb0f

🍻 cheers

Browse files
README.md CHANGED
@@ -1,6 +1,9 @@
1
  ---
2
  base_model: d071696/vit-finetune-scrap
3
  tags:
 
 
 
4
  - generated_from_trainer
5
  datasets:
6
  - arrow
@@ -13,7 +16,7 @@ model-index:
13
  name: Image Classification
14
  type: image-classification
15
  dataset:
16
- name: arrow
17
  type: arrow
18
  config: default
19
  split: train
@@ -21,7 +24,7 @@ model-index:
21
  metrics:
22
  - name: Accuracy
23
  type: accuracy
24
- value: 0.9485530546623794
25
  ---
26
 
27
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -29,10 +32,10 @@ should probably proofread and complete it, then remove this comment. -->
29
 
30
  # vit-finetune-scrap
31
 
32
- This model is a fine-tuned version of [d071696/vit-finetune-scrap](https://huggingface.co/d071696/vit-finetune-scrap) on the arrow dataset.
33
  It achieves the following results on the evaluation set:
34
- - Loss: 0.2143
35
- - Accuracy: 0.9486
36
 
37
  ## Model description
38
 
 
1
  ---
2
  base_model: d071696/vit-finetune-scrap
3
  tags:
4
+ - image-classification
5
+ - image-feature-extraction
6
+ - image-to-text
7
  - generated_from_trainer
8
  datasets:
9
  - arrow
 
16
  name: Image Classification
17
  type: image-classification
18
  dataset:
19
+ name: d071696/scraps1
20
  type: arrow
21
  config: default
22
  split: train
 
24
  metrics:
25
  - name: Accuracy
26
  type: accuracy
27
+ value: 0.9963782696177063
28
  ---
29
 
30
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
32
 
33
  # vit-finetune-scrap
34
 
35
+ This model is a fine-tuned version of [d071696/vit-finetune-scrap](https://huggingface.co/d071696/vit-finetune-scrap) on the d071696/scraps1 dataset.
36
  It achieves the following results on the evaluation set:
37
+ - Loss: 0.0129
38
+ - Accuracy: 0.9964
39
 
40
  ## Model description
41
 
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 4.0,
3
- "eval_accuracy": 1.0,
4
- "eval_loss": 0.9895318150520325,
5
- "eval_runtime": 1.234,
6
- "eval_samples_per_second": 24.312,
7
- "eval_steps_per_second": 3.242,
8
- "total_flos": 9.63148132192297e+17,
9
- "train_loss": 0.48303211886426173,
10
- "train_runtime": 1041.8456,
11
- "train_samples_per_second": 11.929,
12
- "train_steps_per_second": 1.494
13
  }
 
1
  {
2
  "epoch": 4.0,
3
+ "eval_accuracy": 0.9963782696177063,
4
+ "eval_loss": 0.012937591411173344,
5
+ "eval_runtime": 48.4901,
6
+ "eval_samples_per_second": 51.248,
7
+ "eval_steps_per_second": 6.414,
8
+ "total_flos": 7.703325099767808e+17,
9
+ "train_loss": 0.1579143282675882,
10
+ "train_runtime": 491.3754,
11
+ "train_samples_per_second": 20.229,
12
+ "train_steps_per_second": 2.532
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 4.0,
3
- "eval_accuracy": 1.0,
4
- "eval_loss": 0.9895318150520325,
5
- "eval_runtime": 1.234,
6
- "eval_samples_per_second": 24.312,
7
- "eval_steps_per_second": 3.242
8
  }
 
1
  {
2
  "epoch": 4.0,
3
+ "eval_accuracy": 0.9963782696177063,
4
+ "eval_loss": 0.012937591411173344,
5
+ "eval_runtime": 48.4901,
6
+ "eval_samples_per_second": 51.248,
7
+ "eval_steps_per_second": 6.414
8
  }
runs/Mar29_16-45-17_X5C922065N/events.out.tfevents.1711732953.X5C922065N.53009.5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23139dbd44575df7213bf6b823555c99387ed9f6983bbe6670af966d47b34125
3
+ size 734
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 4.0,
3
- "total_flos": 9.63148132192297e+17,
4
- "train_loss": 0.48303211886426173,
5
- "train_runtime": 1041.8456,
6
- "train_samples_per_second": 11.929,
7
- "train_steps_per_second": 1.494
8
  }
 
1
  {
2
  "epoch": 4.0,
3
+ "total_flos": 7.703325099767808e+17,
4
+ "train_loss": 0.1579143282675882,
5
+ "train_runtime": 491.3754,
6
+ "train_samples_per_second": 20.229,
7
+ "train_steps_per_second": 2.532
8
  }
trainer_state.json CHANGED
@@ -1,1123 +1,906 @@
1
  {
2
- "best_metric": 0.11155818402767181,
3
  "best_model_checkpoint": "./vit-finetune-scrap/checkpoint-1000",
4
  "epoch": 4.0,
5
  "eval_steps": 1000,
6
- "global_step": 1556,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.03,
13
- "grad_norm": 2.721052646636963,
14
- "learning_rate": 0.0001987146529562982,
15
- "loss": 2.3309,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.05,
20
- "grad_norm": 2.6302196979522705,
21
- "learning_rate": 0.0001974293059125964,
22
- "loss": 2.1693,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.08,
27
- "grad_norm": 3.2131187915802,
28
- "learning_rate": 0.0001961439588688946,
29
- "loss": 1.914,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.1,
34
- "grad_norm": 3.520822525024414,
35
- "learning_rate": 0.00019485861182519281,
36
- "loss": 1.6374,
37
  "step": 40
38
  },
39
  {
40
- "epoch": 0.13,
41
- "grad_norm": 4.047000408172607,
42
- "learning_rate": 0.000193573264781491,
43
- "loss": 1.6214,
44
  "step": 50
45
  },
46
  {
47
- "epoch": 0.15,
48
- "grad_norm": 3.1879024505615234,
49
- "learning_rate": 0.0001922879177377892,
50
- "loss": 1.6056,
51
  "step": 60
52
  },
53
  {
54
- "epoch": 0.18,
55
- "grad_norm": 3.4971909523010254,
56
- "learning_rate": 0.00019100257069408743,
57
- "loss": 1.4073,
58
  "step": 70
59
  },
60
  {
61
- "epoch": 0.21,
62
- "grad_norm": 5.953548908233643,
63
- "learning_rate": 0.00018971722365038562,
64
- "loss": 1.2913,
65
  "step": 80
66
  },
67
  {
68
- "epoch": 0.23,
69
- "grad_norm": 3.639462471008301,
70
- "learning_rate": 0.00018843187660668383,
71
- "loss": 1.1548,
72
  "step": 90
73
  },
74
  {
75
- "epoch": 0.26,
76
- "grad_norm": 5.055682182312012,
77
- "learning_rate": 0.00018714652956298202,
78
- "loss": 1.3169,
79
  "step": 100
80
  },
81
  {
82
- "epoch": 0.28,
83
- "grad_norm": 2.787602186203003,
84
- "learning_rate": 0.0001858611825192802,
85
- "loss": 1.2729,
86
  "step": 110
87
  },
88
  {
89
- "epoch": 0.31,
90
- "grad_norm": 5.495873928070068,
91
- "learning_rate": 0.00018457583547557842,
92
- "loss": 1.1431,
93
  "step": 120
94
  },
95
  {
96
- "epoch": 0.33,
97
- "grad_norm": 2.6707160472869873,
98
- "learning_rate": 0.0001832904884318766,
99
- "loss": 1.0606,
100
  "step": 130
101
  },
102
  {
103
- "epoch": 0.36,
104
- "grad_norm": 5.753376483917236,
105
- "learning_rate": 0.00018200514138817483,
106
- "loss": 1.0528,
107
  "step": 140
108
  },
109
  {
110
- "epoch": 0.39,
111
- "grad_norm": 4.965968132019043,
112
- "learning_rate": 0.000180719794344473,
113
- "loss": 1.3918,
114
  "step": 150
115
  },
116
  {
117
- "epoch": 0.41,
118
- "grad_norm": 4.477539539337158,
119
- "learning_rate": 0.0001794344473007712,
120
- "loss": 1.2681,
121
  "step": 160
122
  },
123
  {
124
- "epoch": 0.44,
125
- "grad_norm": 6.264174938201904,
126
- "learning_rate": 0.00017814910025706942,
127
- "loss": 1.0713,
128
  "step": 170
129
  },
130
  {
131
- "epoch": 0.46,
132
- "grad_norm": 3.289985179901123,
133
- "learning_rate": 0.0001768637532133676,
134
- "loss": 1.1408,
135
  "step": 180
136
  },
137
  {
138
- "epoch": 0.49,
139
- "grad_norm": 4.64877986907959,
140
- "learning_rate": 0.00017557840616966582,
141
- "loss": 1.3037,
142
  "step": 190
143
  },
144
  {
145
- "epoch": 0.51,
146
- "grad_norm": 3.4218943119049072,
147
- "learning_rate": 0.000174293059125964,
148
- "loss": 0.9621,
149
  "step": 200
150
  },
151
  {
152
- "epoch": 0.54,
153
- "grad_norm": 5.507615566253662,
154
- "learning_rate": 0.00017300771208226222,
155
- "loss": 1.0243,
156
  "step": 210
157
  },
158
  {
159
- "epoch": 0.57,
160
- "grad_norm": 4.309627532958984,
161
- "learning_rate": 0.00017172236503856043,
162
- "loss": 0.9739,
163
  "step": 220
164
  },
165
  {
166
- "epoch": 0.59,
167
- "grad_norm": 4.056205749511719,
168
- "learning_rate": 0.00017043701799485862,
169
- "loss": 0.9158,
170
  "step": 230
171
  },
172
  {
173
- "epoch": 0.62,
174
- "grad_norm": 3.1590564250946045,
175
- "learning_rate": 0.00016915167095115684,
176
- "loss": 0.8557,
177
  "step": 240
178
  },
179
  {
180
- "epoch": 0.64,
181
- "grad_norm": 1.6367921829223633,
182
- "learning_rate": 0.00016786632390745502,
183
- "loss": 1.0898,
184
  "step": 250
185
  },
186
  {
187
- "epoch": 0.67,
188
- "grad_norm": 5.508506774902344,
189
- "learning_rate": 0.0001665809768637532,
190
- "loss": 1.0173,
191
  "step": 260
192
  },
193
  {
194
- "epoch": 0.69,
195
- "grad_norm": 5.602323532104492,
196
- "learning_rate": 0.00016529562982005143,
197
- "loss": 0.9706,
198
  "step": 270
199
  },
200
  {
201
- "epoch": 0.72,
202
- "grad_norm": 8.27458381652832,
203
- "learning_rate": 0.00016401028277634961,
204
- "loss": 1.1064,
205
  "step": 280
206
  },
207
  {
208
- "epoch": 0.75,
209
- "grad_norm": 3.5698864459991455,
210
- "learning_rate": 0.00016272493573264783,
211
- "loss": 0.9979,
212
  "step": 290
213
  },
214
  {
215
- "epoch": 0.77,
216
- "grad_norm": 5.842220783233643,
217
- "learning_rate": 0.00016143958868894602,
218
- "loss": 1.0221,
219
  "step": 300
220
  },
221
  {
222
- "epoch": 0.8,
223
- "grad_norm": 3.458761692047119,
224
- "learning_rate": 0.00016015424164524423,
225
- "loss": 0.9931,
226
  "step": 310
227
  },
228
  {
229
- "epoch": 0.82,
230
- "grad_norm": 5.971825122833252,
231
- "learning_rate": 0.00015886889460154242,
232
- "loss": 1.1686,
233
  "step": 320
234
  },
235
  {
236
- "epoch": 0.85,
237
- "grad_norm": 5.68731689453125,
238
- "learning_rate": 0.0001575835475578406,
239
- "loss": 0.9805,
240
  "step": 330
241
  },
242
  {
243
- "epoch": 0.87,
244
- "grad_norm": 5.103214263916016,
245
- "learning_rate": 0.00015629820051413882,
246
- "loss": 0.8668,
247
  "step": 340
248
  },
249
  {
250
- "epoch": 0.9,
251
- "grad_norm": 4.177506923675537,
252
- "learning_rate": 0.00015501285347043704,
253
- "loss": 1.1952,
254
  "step": 350
255
  },
256
  {
257
- "epoch": 0.93,
258
- "grad_norm": 1.9655299186706543,
259
- "learning_rate": 0.00015372750642673522,
260
- "loss": 0.8981,
261
  "step": 360
262
  },
263
  {
264
- "epoch": 0.95,
265
- "grad_norm": 4.982448577880859,
266
- "learning_rate": 0.00015244215938303344,
267
- "loss": 0.7721,
268
  "step": 370
269
  },
270
  {
271
- "epoch": 0.98,
272
- "grad_norm": 5.1874775886535645,
273
- "learning_rate": 0.00015115681233933163,
274
- "loss": 0.97,
275
  "step": 380
276
  },
277
  {
278
- "epoch": 1.0,
279
- "grad_norm": 4.942078590393066,
280
- "learning_rate": 0.00014987146529562984,
281
- "loss": 0.8415,
282
  "step": 390
283
  },
284
  {
285
- "epoch": 1.03,
286
- "grad_norm": 3.160961389541626,
287
- "learning_rate": 0.00014858611825192803,
288
- "loss": 0.5367,
289
  "step": 400
290
  },
291
  {
292
- "epoch": 1.05,
293
- "grad_norm": 5.394630432128906,
294
- "learning_rate": 0.00014730077120822622,
295
- "loss": 0.561,
296
  "step": 410
297
  },
298
  {
299
- "epoch": 1.08,
300
- "grad_norm": 2.2095775604248047,
301
- "learning_rate": 0.00014601542416452443,
302
- "loss": 0.5548,
303
  "step": 420
304
  },
305
  {
306
- "epoch": 1.11,
307
- "grad_norm": 4.9532575607299805,
308
- "learning_rate": 0.00014473007712082262,
309
- "loss": 0.6005,
310
  "step": 430
311
  },
312
  {
313
- "epoch": 1.13,
314
- "grad_norm": 5.503066062927246,
315
- "learning_rate": 0.00014344473007712083,
316
- "loss": 0.514,
317
  "step": 440
318
  },
319
  {
320
- "epoch": 1.16,
321
- "grad_norm": 5.952071189880371,
322
- "learning_rate": 0.00014215938303341902,
323
- "loss": 0.5386,
324
  "step": 450
325
  },
326
  {
327
- "epoch": 1.18,
328
- "grad_norm": 4.198409557342529,
329
- "learning_rate": 0.00014087403598971724,
330
- "loss": 0.5937,
331
  "step": 460
332
  },
333
  {
334
- "epoch": 1.21,
335
- "grad_norm": 4.768213272094727,
336
- "learning_rate": 0.00013958868894601542,
337
- "loss": 0.6504,
338
  "step": 470
339
  },
340
  {
341
- "epoch": 1.23,
342
- "grad_norm": 4.068699359893799,
343
- "learning_rate": 0.0001383033419023136,
344
- "loss": 0.421,
345
  "step": 480
346
  },
347
  {
348
- "epoch": 1.26,
349
- "grad_norm": 4.887763500213623,
350
- "learning_rate": 0.00013701799485861185,
351
- "loss": 0.5566,
352
  "step": 490
353
  },
354
  {
355
- "epoch": 1.29,
356
- "grad_norm": 5.322113037109375,
357
- "learning_rate": 0.00013573264781491004,
358
- "loss": 0.514,
359
  "step": 500
360
  },
361
  {
362
- "epoch": 1.31,
363
- "grad_norm": 3.480942726135254,
364
- "learning_rate": 0.00013444730077120823,
365
- "loss": 0.5012,
366
  "step": 510
367
  },
368
  {
369
- "epoch": 1.34,
370
- "grad_norm": 3.2392122745513916,
371
- "learning_rate": 0.00013316195372750644,
372
- "loss": 0.5065,
373
  "step": 520
374
  },
375
  {
376
- "epoch": 1.36,
377
- "grad_norm": 1.8148912191390991,
378
- "learning_rate": 0.00013187660668380463,
379
- "loss": 0.4932,
380
  "step": 530
381
  },
382
  {
383
- "epoch": 1.39,
384
- "grad_norm": 1.9780988693237305,
385
- "learning_rate": 0.00013059125964010284,
386
- "loss": 0.6036,
387
  "step": 540
388
  },
389
  {
390
- "epoch": 1.41,
391
- "grad_norm": 5.625373840332031,
392
- "learning_rate": 0.00012930591259640103,
393
- "loss": 0.564,
394
  "step": 550
395
  },
396
  {
397
- "epoch": 1.44,
398
- "grad_norm": 9.524807929992676,
399
- "learning_rate": 0.00012802056555269925,
400
- "loss": 0.5695,
401
  "step": 560
402
  },
403
  {
404
- "epoch": 1.47,
405
- "grad_norm": 1.463976263999939,
406
- "learning_rate": 0.00012673521850899743,
407
- "loss": 0.3198,
408
  "step": 570
409
  },
410
  {
411
- "epoch": 1.49,
412
- "grad_norm": 6.108857154846191,
413
- "learning_rate": 0.00012544987146529562,
414
- "loss": 0.6759,
415
  "step": 580
416
  },
417
  {
418
- "epoch": 1.52,
419
- "grad_norm": 1.5109316110610962,
420
- "learning_rate": 0.00012416452442159384,
421
- "loss": 0.4468,
422
  "step": 590
423
  },
424
  {
425
- "epoch": 1.54,
426
- "grad_norm": 1.9603605270385742,
427
- "learning_rate": 0.00012287917737789202,
428
- "loss": 0.3569,
429
  "step": 600
430
  },
431
  {
432
- "epoch": 1.57,
433
- "grad_norm": 7.527422904968262,
434
- "learning_rate": 0.00012159383033419023,
435
- "loss": 0.6518,
436
  "step": 610
437
  },
438
  {
439
- "epoch": 1.59,
440
- "grad_norm": 5.3868255615234375,
441
- "learning_rate": 0.00012030848329048843,
442
- "loss": 0.5278,
443
  "step": 620
444
  },
445
  {
446
- "epoch": 1.62,
447
- "grad_norm": 8.257445335388184,
448
- "learning_rate": 0.00011902313624678665,
449
- "loss": 0.6488,
450
  "step": 630
451
  },
452
  {
453
- "epoch": 1.65,
454
- "grad_norm": 8.786994934082031,
455
- "learning_rate": 0.00011773778920308484,
456
- "loss": 0.6637,
457
  "step": 640
458
  },
459
  {
460
- "epoch": 1.67,
461
- "grad_norm": 11.612885475158691,
462
- "learning_rate": 0.00011645244215938304,
463
- "loss": 0.5637,
464
  "step": 650
465
  },
466
  {
467
- "epoch": 1.7,
468
- "grad_norm": 4.953100204467773,
469
- "learning_rate": 0.00011516709511568124,
470
- "loss": 0.3346,
471
  "step": 660
472
  },
473
  {
474
- "epoch": 1.72,
475
- "grad_norm": 8.756507873535156,
476
- "learning_rate": 0.00011388174807197945,
477
- "loss": 0.5318,
478
  "step": 670
479
  },
480
  {
481
- "epoch": 1.75,
482
- "grad_norm": 5.3309760093688965,
483
- "learning_rate": 0.00011259640102827765,
484
- "loss": 0.433,
485
  "step": 680
486
  },
487
  {
488
- "epoch": 1.77,
489
- "grad_norm": 0.4981166422367096,
490
- "learning_rate": 0.00011131105398457585,
491
- "loss": 0.4548,
492
  "step": 690
493
  },
494
  {
495
- "epoch": 1.8,
496
- "grad_norm": 7.036471366882324,
497
- "learning_rate": 0.00011002570694087404,
498
- "loss": 0.6301,
499
  "step": 700
500
  },
501
  {
502
- "epoch": 1.83,
503
- "grad_norm": 5.0402021408081055,
504
- "learning_rate": 0.00010874035989717224,
505
- "loss": 0.6178,
506
  "step": 710
507
  },
508
  {
509
- "epoch": 1.85,
510
- "grad_norm": 0.2094542682170868,
511
- "learning_rate": 0.00010745501285347044,
512
- "loss": 0.3818,
513
  "step": 720
514
  },
515
  {
516
- "epoch": 1.88,
517
- "grad_norm": 5.399072647094727,
518
- "learning_rate": 0.00010616966580976864,
519
- "loss": 0.5482,
520
  "step": 730
521
  },
522
  {
523
- "epoch": 1.9,
524
- "grad_norm": 9.017058372497559,
525
- "learning_rate": 0.00010488431876606684,
526
- "loss": 0.5286,
527
  "step": 740
528
  },
529
  {
530
- "epoch": 1.93,
531
- "grad_norm": 2.5559568405151367,
532
- "learning_rate": 0.00010359897172236503,
533
- "loss": 0.4894,
534
  "step": 750
535
  },
536
  {
537
- "epoch": 1.95,
538
- "grad_norm": 1.3460350036621094,
539
- "learning_rate": 0.00010231362467866323,
540
- "loss": 0.369,
541
  "step": 760
542
  },
543
  {
544
- "epoch": 1.98,
545
- "grad_norm": 0.5879113078117371,
546
- "learning_rate": 0.00010102827763496146,
547
- "loss": 0.3088,
548
  "step": 770
549
  },
550
  {
551
- "epoch": 2.01,
552
- "grad_norm": 1.1561224460601807,
553
- "learning_rate": 9.974293059125965e-05,
554
- "loss": 0.573,
555
  "step": 780
556
  },
557
  {
558
- "epoch": 2.03,
559
- "grad_norm": 2.361337900161743,
560
- "learning_rate": 9.845758354755785e-05,
561
- "loss": 0.1348,
562
  "step": 790
563
  },
564
  {
565
- "epoch": 2.06,
566
- "grad_norm": 2.3323395252227783,
567
- "learning_rate": 9.717223650385605e-05,
568
- "loss": 0.1292,
569
  "step": 800
570
  },
571
  {
572
- "epoch": 2.08,
573
- "grad_norm": 0.5499300956726074,
574
- "learning_rate": 9.588688946015425e-05,
575
- "loss": 0.1817,
576
  "step": 810
577
  },
578
  {
579
- "epoch": 2.11,
580
- "grad_norm": 0.2054494023323059,
581
- "learning_rate": 9.460154241645245e-05,
582
- "loss": 0.2232,
583
  "step": 820
584
  },
585
  {
586
- "epoch": 2.13,
587
- "grad_norm": 2.15979266166687,
588
- "learning_rate": 9.331619537275065e-05,
589
- "loss": 0.2153,
590
  "step": 830
591
  },
592
  {
593
- "epoch": 2.16,
594
- "grad_norm": 3.1036410331726074,
595
- "learning_rate": 9.203084832904885e-05,
596
- "loss": 0.1692,
597
  "step": 840
598
  },
599
  {
600
- "epoch": 2.19,
601
- "grad_norm": 2.084644317626953,
602
- "learning_rate": 9.074550128534704e-05,
603
- "loss": 0.2034,
604
  "step": 850
605
  },
606
  {
607
- "epoch": 2.21,
608
- "grad_norm": 2.1689724922180176,
609
- "learning_rate": 8.946015424164524e-05,
610
- "loss": 0.2217,
611
  "step": 860
612
  },
613
  {
614
- "epoch": 2.24,
615
- "grad_norm": 1.0331225395202637,
616
- "learning_rate": 8.817480719794346e-05,
617
- "loss": 0.1232,
618
  "step": 870
619
  },
620
  {
621
- "epoch": 2.26,
622
- "grad_norm": 3.129354953765869,
623
- "learning_rate": 8.688946015424166e-05,
624
- "loss": 0.1363,
625
  "step": 880
626
  },
627
  {
628
- "epoch": 2.29,
629
- "grad_norm": 0.653751015663147,
630
- "learning_rate": 8.560411311053986e-05,
631
- "loss": 0.1845,
632
  "step": 890
633
  },
634
  {
635
- "epoch": 2.31,
636
- "grad_norm": 0.20718339085578918,
637
- "learning_rate": 8.431876606683805e-05,
638
- "loss": 0.1638,
639
  "step": 900
640
  },
641
  {
642
- "epoch": 2.34,
643
- "grad_norm": 5.0227274894714355,
644
- "learning_rate": 8.303341902313625e-05,
645
- "loss": 0.1423,
646
  "step": 910
647
  },
648
  {
649
- "epoch": 2.37,
650
- "grad_norm": 0.7187924385070801,
651
- "learning_rate": 8.174807197943445e-05,
652
- "loss": 0.1702,
653
  "step": 920
654
  },
655
  {
656
- "epoch": 2.39,
657
- "grad_norm": 1.2977266311645508,
658
- "learning_rate": 8.046272493573265e-05,
659
- "loss": 0.1354,
660
  "step": 930
661
  },
662
  {
663
- "epoch": 2.42,
664
- "grad_norm": 2.3543667793273926,
665
- "learning_rate": 7.917737789203086e-05,
666
- "loss": 0.2209,
667
  "step": 940
668
  },
669
  {
670
- "epoch": 2.44,
671
- "grad_norm": 0.8430781364440918,
672
- "learning_rate": 7.789203084832905e-05,
673
- "loss": 0.1487,
674
  "step": 950
675
  },
676
  {
677
- "epoch": 2.47,
678
- "grad_norm": 0.08762349933385849,
679
- "learning_rate": 7.660668380462725e-05,
680
- "loss": 0.1038,
681
  "step": 960
682
  },
683
  {
684
- "epoch": 2.49,
685
- "grad_norm": 8.408522605895996,
686
- "learning_rate": 7.532133676092545e-05,
687
- "loss": 0.2402,
688
  "step": 970
689
  },
690
  {
691
- "epoch": 2.52,
692
- "grad_norm": 1.173913836479187,
693
- "learning_rate": 7.403598971722365e-05,
694
- "loss": 0.0641,
695
  "step": 980
696
  },
697
  {
698
- "epoch": 2.54,
699
- "grad_norm": 7.908231735229492,
700
- "learning_rate": 7.275064267352186e-05,
701
- "loss": 0.2347,
702
  "step": 990
703
  },
704
  {
705
- "epoch": 2.57,
706
- "grad_norm": 3.18058180809021,
707
- "learning_rate": 7.146529562982006e-05,
708
- "loss": 0.1326,
709
  "step": 1000
710
  },
711
  {
712
- "epoch": 2.57,
713
- "eval_accuracy": 0.9694238815577728,
714
- "eval_loss": 0.11155818402767181,
715
- "eval_runtime": 52.5851,
716
- "eval_samples_per_second": 59.085,
717
- "eval_steps_per_second": 7.398,
718
  "step": 1000
719
  },
720
  {
721
- "epoch": 2.6,
722
- "grad_norm": 0.4821953773498535,
723
- "learning_rate": 7.017994858611826e-05,
724
- "loss": 0.258,
725
  "step": 1010
726
  },
727
  {
728
- "epoch": 2.62,
729
- "grad_norm": 4.647073268890381,
730
- "learning_rate": 6.889460154241646e-05,
731
- "loss": 0.1106,
732
  "step": 1020
733
  },
734
  {
735
- "epoch": 2.65,
736
- "grad_norm": 0.07687141746282578,
737
- "learning_rate": 6.760925449871466e-05,
738
- "loss": 0.1768,
739
  "step": 1030
740
  },
741
  {
742
- "epoch": 2.67,
743
- "grad_norm": 0.8537989854812622,
744
- "learning_rate": 6.632390745501286e-05,
745
- "loss": 0.1321,
746
  "step": 1040
747
  },
748
  {
749
- "epoch": 2.7,
750
- "grad_norm": 1.6428909301757812,
751
- "learning_rate": 6.503856041131106e-05,
752
- "loss": 0.2679,
753
  "step": 1050
754
  },
755
  {
756
- "epoch": 2.72,
757
- "grad_norm": 0.4707659184932709,
758
- "learning_rate": 6.375321336760925e-05,
759
- "loss": 0.192,
760
  "step": 1060
761
  },
762
  {
763
- "epoch": 2.75,
764
- "grad_norm": 0.09739229083061218,
765
- "learning_rate": 6.246786632390745e-05,
766
- "loss": 0.2501,
767
  "step": 1070
768
  },
769
  {
770
- "epoch": 2.78,
771
- "grad_norm": 2.0249221324920654,
772
- "learning_rate": 6.118251928020567e-05,
773
- "loss": 0.1988,
774
  "step": 1080
775
  },
776
  {
777
- "epoch": 2.8,
778
- "grad_norm": 0.08042796701192856,
779
- "learning_rate": 5.989717223650386e-05,
780
- "loss": 0.04,
781
  "step": 1090
782
  },
783
  {
784
- "epoch": 2.83,
785
- "grad_norm": 0.40489840507507324,
786
- "learning_rate": 5.861182519280206e-05,
787
- "loss": 0.1326,
788
  "step": 1100
789
  },
790
  {
791
- "epoch": 2.85,
792
- "grad_norm": 8.32421875,
793
- "learning_rate": 5.732647814910026e-05,
794
- "loss": 0.1881,
795
  "step": 1110
796
  },
797
  {
798
- "epoch": 2.88,
799
- "grad_norm": 0.3356345295906067,
800
- "learning_rate": 5.604113110539846e-05,
801
- "loss": 0.1638,
802
  "step": 1120
803
  },
804
  {
805
- "epoch": 2.9,
806
- "grad_norm": 2.0262017250061035,
807
- "learning_rate": 5.475578406169666e-05,
808
- "loss": 0.0901,
809
  "step": 1130
810
  },
811
  {
812
- "epoch": 2.93,
813
- "grad_norm": 5.13381290435791,
814
- "learning_rate": 5.347043701799486e-05,
815
- "loss": 0.1947,
816
  "step": 1140
817
  },
818
  {
819
- "epoch": 2.96,
820
- "grad_norm": 4.401228904724121,
821
- "learning_rate": 5.218508997429307e-05,
822
- "loss": 0.1105,
823
  "step": 1150
824
  },
825
  {
826
- "epoch": 2.98,
827
- "grad_norm": 3.711754083633423,
828
- "learning_rate": 5.089974293059127e-05,
829
- "loss": 0.082,
830
  "step": 1160
831
  },
832
  {
833
- "epoch": 3.01,
834
- "grad_norm": 0.4783603847026825,
835
- "learning_rate": 4.961439588688946e-05,
836
- "loss": 0.1223,
837
  "step": 1170
838
  },
839
  {
840
- "epoch": 3.03,
841
- "grad_norm": 6.101786136627197,
842
- "learning_rate": 4.8329048843187664e-05,
843
- "loss": 0.0386,
844
  "step": 1180
845
  },
846
  {
847
- "epoch": 3.06,
848
- "grad_norm": 0.09219735115766525,
849
- "learning_rate": 4.7043701799485865e-05,
850
- "loss": 0.046,
851
  "step": 1190
852
  },
853
  {
854
- "epoch": 3.08,
855
- "grad_norm": 0.09228511899709702,
856
- "learning_rate": 4.5758354755784066e-05,
857
- "loss": 0.0179,
858
  "step": 1200
859
  },
860
  {
861
- "epoch": 3.11,
862
- "grad_norm": 0.06705611199140549,
863
- "learning_rate": 4.447300771208227e-05,
864
- "loss": 0.0401,
865
  "step": 1210
866
  },
867
  {
868
- "epoch": 3.14,
869
- "grad_norm": 0.05702489614486694,
870
- "learning_rate": 4.318766066838046e-05,
871
- "loss": 0.0222,
872
  "step": 1220
873
  },
874
  {
875
- "epoch": 3.16,
876
- "grad_norm": 0.051934726536273956,
877
- "learning_rate": 4.190231362467866e-05,
878
- "loss": 0.0249,
879
  "step": 1230
880
  },
881
  {
882
- "epoch": 3.19,
883
- "grad_norm": 0.05382351949810982,
884
- "learning_rate": 4.0616966580976864e-05,
885
- "loss": 0.017,
886
  "step": 1240
887
  },
888
- {
889
- "epoch": 3.21,
890
- "grad_norm": 0.10244094580411911,
891
- "learning_rate": 3.9331619537275065e-05,
892
- "loss": 0.1425,
893
- "step": 1250
894
- },
895
- {
896
- "epoch": 3.24,
897
- "grad_norm": 0.04559057578444481,
898
- "learning_rate": 3.8046272493573266e-05,
899
- "loss": 0.0188,
900
- "step": 1260
901
- },
902
- {
903
- "epoch": 3.26,
904
- "grad_norm": 1.9016327857971191,
905
- "learning_rate": 3.676092544987147e-05,
906
- "loss": 0.0196,
907
- "step": 1270
908
- },
909
- {
910
- "epoch": 3.29,
911
- "grad_norm": 0.06497751176357269,
912
- "learning_rate": 3.547557840616967e-05,
913
- "loss": 0.0161,
914
- "step": 1280
915
- },
916
- {
917
- "epoch": 3.32,
918
- "grad_norm": 0.05229075625538826,
919
- "learning_rate": 3.419023136246787e-05,
920
- "loss": 0.0165,
921
- "step": 1290
922
- },
923
- {
924
- "epoch": 3.34,
925
- "grad_norm": 0.04599655419588089,
926
- "learning_rate": 3.2904884318766064e-05,
927
- "loss": 0.0338,
928
- "step": 1300
929
- },
930
- {
931
- "epoch": 3.37,
932
- "grad_norm": 0.054148729890584946,
933
- "learning_rate": 3.161953727506427e-05,
934
- "loss": 0.0401,
935
- "step": 1310
936
- },
937
- {
938
- "epoch": 3.39,
939
- "grad_norm": 0.135112926363945,
940
- "learning_rate": 3.033419023136247e-05,
941
- "loss": 0.0386,
942
- "step": 1320
943
- },
944
- {
945
- "epoch": 3.42,
946
- "grad_norm": 0.05881468951702118,
947
- "learning_rate": 2.9048843187660668e-05,
948
- "loss": 0.0526,
949
- "step": 1330
950
- },
951
- {
952
- "epoch": 3.44,
953
- "grad_norm": 0.11401781439781189,
954
- "learning_rate": 2.7763496143958872e-05,
955
- "loss": 0.0652,
956
- "step": 1340
957
- },
958
- {
959
- "epoch": 3.47,
960
- "grad_norm": 0.6476575136184692,
961
- "learning_rate": 2.647814910025707e-05,
962
- "loss": 0.0772,
963
- "step": 1350
964
- },
965
- {
966
- "epoch": 3.5,
967
- "grad_norm": 0.0521862767636776,
968
- "learning_rate": 2.519280205655527e-05,
969
- "loss": 0.0166,
970
- "step": 1360
971
- },
972
- {
973
- "epoch": 3.52,
974
- "grad_norm": 0.05607061833143234,
975
- "learning_rate": 2.3907455012853472e-05,
976
- "loss": 0.0138,
977
- "step": 1370
978
- },
979
- {
980
- "epoch": 3.55,
981
- "grad_norm": 0.05825699120759964,
982
- "learning_rate": 2.262210796915167e-05,
983
- "loss": 0.015,
984
- "step": 1380
985
- },
986
- {
987
- "epoch": 3.57,
988
- "grad_norm": 4.6812334060668945,
989
- "learning_rate": 2.133676092544987e-05,
990
- "loss": 0.1053,
991
- "step": 1390
992
- },
993
- {
994
- "epoch": 3.6,
995
- "grad_norm": 0.6198139786720276,
996
- "learning_rate": 2.0051413881748076e-05,
997
- "loss": 0.0178,
998
- "step": 1400
999
- },
1000
- {
1001
- "epoch": 3.62,
1002
- "grad_norm": 0.05886732041835785,
1003
- "learning_rate": 1.8766066838046273e-05,
1004
- "loss": 0.0278,
1005
- "step": 1410
1006
- },
1007
- {
1008
- "epoch": 3.65,
1009
- "grad_norm": 0.673959493637085,
1010
- "learning_rate": 1.7480719794344475e-05,
1011
- "loss": 0.0769,
1012
- "step": 1420
1013
- },
1014
- {
1015
- "epoch": 3.68,
1016
- "grad_norm": 0.31164562702178955,
1017
- "learning_rate": 1.6195372750642672e-05,
1018
- "loss": 0.1499,
1019
- "step": 1430
1020
- },
1021
- {
1022
- "epoch": 3.7,
1023
- "grad_norm": 0.2713916599750519,
1024
- "learning_rate": 1.4910025706940875e-05,
1025
- "loss": 0.0444,
1026
- "step": 1440
1027
- },
1028
- {
1029
- "epoch": 3.73,
1030
- "grad_norm": 2.0257036685943604,
1031
- "learning_rate": 1.3624678663239075e-05,
1032
- "loss": 0.071,
1033
- "step": 1450
1034
- },
1035
- {
1036
- "epoch": 3.75,
1037
- "grad_norm": 0.24306029081344604,
1038
- "learning_rate": 1.2339331619537276e-05,
1039
- "loss": 0.0114,
1040
- "step": 1460
1041
- },
1042
- {
1043
- "epoch": 3.78,
1044
- "grad_norm": 0.2247108817100525,
1045
- "learning_rate": 1.1053984575835475e-05,
1046
- "loss": 0.0132,
1047
- "step": 1470
1048
- },
1049
- {
1050
- "epoch": 3.8,
1051
- "grad_norm": 0.056268274784088135,
1052
- "learning_rate": 9.768637532133676e-06,
1053
- "loss": 0.0827,
1054
- "step": 1480
1055
- },
1056
- {
1057
- "epoch": 3.83,
1058
- "grad_norm": 0.04434029012918472,
1059
- "learning_rate": 8.483290488431877e-06,
1060
- "loss": 0.0138,
1061
- "step": 1490
1062
- },
1063
- {
1064
- "epoch": 3.86,
1065
- "grad_norm": 0.06419169157743454,
1066
- "learning_rate": 7.197943444730078e-06,
1067
- "loss": 0.077,
1068
- "step": 1500
1069
- },
1070
- {
1071
- "epoch": 3.88,
1072
- "grad_norm": 0.11697979271411896,
1073
- "learning_rate": 5.912596401028278e-06,
1074
- "loss": 0.0206,
1075
- "step": 1510
1076
- },
1077
- {
1078
- "epoch": 3.91,
1079
- "grad_norm": 1.25772225856781,
1080
- "learning_rate": 4.627249357326478e-06,
1081
- "loss": 0.02,
1082
- "step": 1520
1083
- },
1084
- {
1085
- "epoch": 3.93,
1086
- "grad_norm": 0.12491010874509811,
1087
- "learning_rate": 3.3419023136246787e-06,
1088
- "loss": 0.0257,
1089
- "step": 1530
1090
- },
1091
- {
1092
- "epoch": 3.96,
1093
- "grad_norm": 3.4478936195373535,
1094
- "learning_rate": 2.056555269922879e-06,
1095
- "loss": 0.0366,
1096
- "step": 1540
1097
- },
1098
- {
1099
- "epoch": 3.98,
1100
- "grad_norm": 0.0410100519657135,
1101
- "learning_rate": 7.712082262210797e-07,
1102
- "loss": 0.0525,
1103
- "step": 1550
1104
- },
1105
  {
1106
  "epoch": 4.0,
1107
- "step": 1556,
1108
- "total_flos": 9.63148132192297e+17,
1109
- "train_loss": 0.48303211886426173,
1110
- "train_runtime": 1041.8456,
1111
- "train_samples_per_second": 11.929,
1112
- "train_steps_per_second": 1.494
1113
  }
1114
  ],
1115
  "logging_steps": 10,
1116
- "max_steps": 1556,
1117
  "num_input_tokens_seen": 0,
1118
  "num_train_epochs": 4,
1119
  "save_steps": 1000,
1120
- "total_flos": 9.63148132192297e+17,
1121
  "train_batch_size": 8,
1122
  "trial_name": null,
1123
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.21430718898773193,
3
  "best_model_checkpoint": "./vit-finetune-scrap/checkpoint-1000",
4
  "epoch": 4.0,
5
  "eval_steps": 1000,
6
+ "global_step": 1244,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.03,
13
+ "grad_norm": 10.849635124206543,
14
+ "learning_rate": 0.00019839228295819936,
15
+ "loss": 0.1258,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.06,
20
+ "grad_norm": 29.82047462463379,
21
+ "learning_rate": 0.00019678456591639874,
22
+ "loss": 0.2394,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.1,
27
+ "grad_norm": 4.707005500793457,
28
+ "learning_rate": 0.00019517684887459809,
29
+ "loss": 0.234,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.13,
34
+ "grad_norm": 13.66462516784668,
35
+ "learning_rate": 0.00019356913183279743,
36
+ "loss": 0.705,
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 0.16,
41
+ "grad_norm": 4.419419765472412,
42
+ "learning_rate": 0.00019196141479099678,
43
+ "loss": 0.6657,
44
  "step": 50
45
  },
46
  {
47
+ "epoch": 0.19,
48
+ "grad_norm": 0.14946621656417847,
49
+ "learning_rate": 0.00019035369774919616,
50
+ "loss": 0.2407,
51
  "step": 60
52
  },
53
  {
54
+ "epoch": 0.23,
55
+ "grad_norm": 2.8290460109710693,
56
+ "learning_rate": 0.0001887459807073955,
57
+ "loss": 0.3973,
58
  "step": 70
59
  },
60
  {
61
+ "epoch": 0.26,
62
+ "grad_norm": 15.848897933959961,
63
+ "learning_rate": 0.00018713826366559486,
64
+ "loss": 0.2432,
65
  "step": 80
66
  },
67
  {
68
+ "epoch": 0.29,
69
+ "grad_norm": 0.30860471725463867,
70
+ "learning_rate": 0.0001855305466237942,
71
+ "loss": 0.2732,
72
  "step": 90
73
  },
74
  {
75
+ "epoch": 0.32,
76
+ "grad_norm": 14.233210563659668,
77
+ "learning_rate": 0.0001839228295819936,
78
+ "loss": 0.261,
79
  "step": 100
80
  },
81
  {
82
+ "epoch": 0.35,
83
+ "grad_norm": 9.140750885009766,
84
+ "learning_rate": 0.00018231511254019294,
85
+ "loss": 0.1776,
86
  "step": 110
87
  },
88
  {
89
+ "epoch": 0.39,
90
+ "grad_norm": 0.9528696537017822,
91
+ "learning_rate": 0.00018070739549839229,
92
+ "loss": 0.3849,
93
  "step": 120
94
  },
95
  {
96
+ "epoch": 0.42,
97
+ "grad_norm": 21.716726303100586,
98
+ "learning_rate": 0.00017909967845659166,
99
+ "loss": 0.3328,
100
  "step": 130
101
  },
102
  {
103
+ "epoch": 0.45,
104
+ "grad_norm": 7.960571765899658,
105
+ "learning_rate": 0.000177491961414791,
106
+ "loss": 0.5052,
107
  "step": 140
108
  },
109
  {
110
+ "epoch": 0.48,
111
+ "grad_norm": 3.9136505126953125,
112
+ "learning_rate": 0.00017588424437299036,
113
+ "loss": 0.5026,
114
  "step": 150
115
  },
116
  {
117
+ "epoch": 0.51,
118
+ "grad_norm": 14.131813049316406,
119
+ "learning_rate": 0.0001742765273311897,
120
+ "loss": 0.3997,
121
  "step": 160
122
  },
123
  {
124
+ "epoch": 0.55,
125
+ "grad_norm": 13.529720306396484,
126
+ "learning_rate": 0.0001726688102893891,
127
+ "loss": 0.2877,
128
  "step": 170
129
  },
130
  {
131
+ "epoch": 0.58,
132
+ "grad_norm": 6.182504653930664,
133
+ "learning_rate": 0.00017106109324758844,
134
+ "loss": 0.3408,
135
  "step": 180
136
  },
137
  {
138
+ "epoch": 0.61,
139
+ "grad_norm": 0.17119653522968292,
140
+ "learning_rate": 0.0001694533762057878,
141
+ "loss": 0.2916,
142
  "step": 190
143
  },
144
  {
145
+ "epoch": 0.64,
146
+ "grad_norm": 13.307029724121094,
147
+ "learning_rate": 0.00016784565916398716,
148
+ "loss": 0.3485,
149
  "step": 200
150
  },
151
  {
152
+ "epoch": 0.68,
153
+ "grad_norm": 4.883426666259766,
154
+ "learning_rate": 0.0001662379421221865,
155
+ "loss": 0.3939,
156
  "step": 210
157
  },
158
  {
159
+ "epoch": 0.71,
160
+ "grad_norm": 5.17271614074707,
161
+ "learning_rate": 0.00016463022508038586,
162
+ "loss": 0.4001,
163
  "step": 220
164
  },
165
  {
166
+ "epoch": 0.74,
167
+ "grad_norm": 0.18887023627758026,
168
+ "learning_rate": 0.0001630225080385852,
169
+ "loss": 0.2459,
170
  "step": 230
171
  },
172
  {
173
+ "epoch": 0.77,
174
+ "grad_norm": 0.3397394120693207,
175
+ "learning_rate": 0.0001614147909967846,
176
+ "loss": 0.3813,
177
  "step": 240
178
  },
179
  {
180
+ "epoch": 0.8,
181
+ "grad_norm": 7.221404075622559,
182
+ "learning_rate": 0.00015980707395498394,
183
+ "loss": 0.2913,
184
  "step": 250
185
  },
186
  {
187
+ "epoch": 0.84,
188
+ "grad_norm": 3.0032007694244385,
189
+ "learning_rate": 0.0001581993569131833,
190
+ "loss": 0.273,
191
  "step": 260
192
  },
193
  {
194
+ "epoch": 0.87,
195
+ "grad_norm": 3.486640691757202,
196
+ "learning_rate": 0.00015659163987138264,
197
+ "loss": 0.5797,
198
  "step": 270
199
  },
200
  {
201
+ "epoch": 0.9,
202
+ "grad_norm": 0.3199945092201233,
203
+ "learning_rate": 0.00015498392282958201,
204
+ "loss": 0.4904,
205
  "step": 280
206
  },
207
  {
208
+ "epoch": 0.93,
209
+ "grad_norm": 38.1386833190918,
210
+ "learning_rate": 0.00015337620578778136,
211
+ "loss": 0.2789,
212
  "step": 290
213
  },
214
  {
215
+ "epoch": 0.96,
216
+ "grad_norm": 3.608177661895752,
217
+ "learning_rate": 0.0001517684887459807,
218
+ "loss": 0.5587,
219
  "step": 300
220
  },
221
  {
222
+ "epoch": 1.0,
223
+ "grad_norm": 0.1488448977470398,
224
+ "learning_rate": 0.0001501607717041801,
225
+ "loss": 0.3405,
226
  "step": 310
227
  },
228
  {
229
+ "epoch": 1.03,
230
+ "grad_norm": 1.542035460472107,
231
+ "learning_rate": 0.00014855305466237944,
232
+ "loss": 0.2688,
233
  "step": 320
234
  },
235
  {
236
+ "epoch": 1.06,
237
+ "grad_norm": 2.1089909076690674,
238
+ "learning_rate": 0.0001469453376205788,
239
+ "loss": 0.2085,
240
  "step": 330
241
  },
242
  {
243
+ "epoch": 1.09,
244
+ "grad_norm": 11.16602897644043,
245
+ "learning_rate": 0.00014533762057877814,
246
+ "loss": 0.3437,
247
  "step": 340
248
  },
249
  {
250
+ "epoch": 1.13,
251
+ "grad_norm": 2.2596559524536133,
252
+ "learning_rate": 0.00014372990353697752,
253
+ "loss": 0.337,
254
  "step": 350
255
  },
256
  {
257
+ "epoch": 1.16,
258
+ "grad_norm": 2.616323947906494,
259
+ "learning_rate": 0.00014212218649517686,
260
+ "loss": 0.2074,
261
  "step": 360
262
  },
263
  {
264
+ "epoch": 1.19,
265
+ "grad_norm": 0.5269195437431335,
266
+ "learning_rate": 0.00014051446945337621,
267
+ "loss": 0.0913,
268
  "step": 370
269
  },
270
  {
271
+ "epoch": 1.22,
272
+ "grad_norm": 7.34785270690918,
273
+ "learning_rate": 0.0001389067524115756,
274
+ "loss": 0.1462,
275
  "step": 380
276
  },
277
  {
278
+ "epoch": 1.25,
279
+ "grad_norm": 0.13304546475410461,
280
+ "learning_rate": 0.00013729903536977494,
281
+ "loss": 0.0303,
282
  "step": 390
283
  },
284
  {
285
+ "epoch": 1.29,
286
+ "grad_norm": 0.42307400703430176,
287
+ "learning_rate": 0.0001356913183279743,
288
+ "loss": 0.2195,
289
  "step": 400
290
  },
291
  {
292
+ "epoch": 1.32,
293
+ "grad_norm": 0.16662320494651794,
294
+ "learning_rate": 0.00013408360128617364,
295
+ "loss": 0.2999,
296
  "step": 410
297
  },
298
  {
299
+ "epoch": 1.35,
300
+ "grad_norm": 35.579891204833984,
301
+ "learning_rate": 0.00013247588424437302,
302
+ "loss": 0.1194,
303
  "step": 420
304
  },
305
  {
306
+ "epoch": 1.38,
307
+ "grad_norm": 3.4818050861358643,
308
+ "learning_rate": 0.00013086816720257237,
309
+ "loss": 0.1469,
310
  "step": 430
311
  },
312
  {
313
+ "epoch": 1.41,
314
+ "grad_norm": 6.36860466003418,
315
+ "learning_rate": 0.00012926045016077172,
316
+ "loss": 0.2234,
317
  "step": 440
318
  },
319
  {
320
+ "epoch": 1.45,
321
+ "grad_norm": 7.359828948974609,
322
+ "learning_rate": 0.00012765273311897106,
323
+ "loss": 0.2114,
324
  "step": 450
325
  },
326
  {
327
+ "epoch": 1.48,
328
+ "grad_norm": 0.11759760975837708,
329
+ "learning_rate": 0.00012604501607717044,
330
+ "loss": 0.1059,
331
  "step": 460
332
  },
333
  {
334
+ "epoch": 1.51,
335
+ "grad_norm": 0.049188051372766495,
336
+ "learning_rate": 0.0001244372990353698,
337
+ "loss": 0.207,
338
  "step": 470
339
  },
340
  {
341
+ "epoch": 1.54,
342
+ "grad_norm": 0.06988845020532608,
343
+ "learning_rate": 0.00012282958199356914,
344
+ "loss": 0.1319,
345
  "step": 480
346
  },
347
  {
348
+ "epoch": 1.58,
349
+ "grad_norm": 10.857504844665527,
350
+ "learning_rate": 0.0001212218649517685,
351
+ "loss": 0.3497,
352
  "step": 490
353
  },
354
  {
355
+ "epoch": 1.61,
356
+ "grad_norm": 0.04112955555319786,
357
+ "learning_rate": 0.00011961414790996785,
358
+ "loss": 0.0711,
359
  "step": 500
360
  },
361
  {
362
+ "epoch": 1.64,
363
+ "grad_norm": 20.134990692138672,
364
+ "learning_rate": 0.0001180064308681672,
365
+ "loss": 0.2654,
366
  "step": 510
367
  },
368
  {
369
+ "epoch": 1.67,
370
+ "grad_norm": 0.03998303785920143,
371
+ "learning_rate": 0.00011639871382636655,
372
+ "loss": 0.0911,
373
  "step": 520
374
  },
375
  {
376
+ "epoch": 1.7,
377
+ "grad_norm": 10.199617385864258,
378
+ "learning_rate": 0.00011479099678456593,
379
+ "loss": 0.1106,
380
  "step": 530
381
  },
382
  {
383
+ "epoch": 1.74,
384
+ "grad_norm": 2.3347342014312744,
385
+ "learning_rate": 0.00011318327974276528,
386
+ "loss": 0.1948,
387
  "step": 540
388
  },
389
  {
390
+ "epoch": 1.77,
391
+ "grad_norm": 15.492130279541016,
392
+ "learning_rate": 0.00011157556270096463,
393
+ "loss": 0.2999,
394
  "step": 550
395
  },
396
  {
397
+ "epoch": 1.8,
398
+ "grad_norm": 16.2156982421875,
399
+ "learning_rate": 0.00010996784565916398,
400
+ "loss": 0.1792,
401
  "step": 560
402
  },
403
  {
404
+ "epoch": 1.83,
405
+ "grad_norm": 3.9076225757598877,
406
+ "learning_rate": 0.00010836012861736335,
407
+ "loss": 0.4599,
408
  "step": 570
409
  },
410
  {
411
+ "epoch": 1.86,
412
+ "grad_norm": 0.0662955567240715,
413
+ "learning_rate": 0.0001067524115755627,
414
+ "loss": 0.0834,
415
  "step": 580
416
  },
417
  {
418
+ "epoch": 1.9,
419
+ "grad_norm": 0.43734121322631836,
420
+ "learning_rate": 0.00010514469453376205,
421
+ "loss": 0.1804,
422
  "step": 590
423
  },
424
  {
425
+ "epoch": 1.93,
426
+ "grad_norm": 0.23478691279888153,
427
+ "learning_rate": 0.00010353697749196143,
428
+ "loss": 0.0831,
429
  "step": 600
430
  },
431
  {
432
+ "epoch": 1.96,
433
+ "grad_norm": 8.97579574584961,
434
+ "learning_rate": 0.00010192926045016078,
435
+ "loss": 0.2141,
436
  "step": 610
437
  },
438
  {
439
+ "epoch": 1.99,
440
+ "grad_norm": 5.947574615478516,
441
+ "learning_rate": 0.00010032154340836013,
442
+ "loss": 0.1059,
443
  "step": 620
444
  },
445
  {
446
+ "epoch": 2.03,
447
+ "grad_norm": 0.3693161904811859,
448
+ "learning_rate": 9.871382636655949e-05,
449
+ "loss": 0.0478,
450
  "step": 630
451
  },
452
  {
453
+ "epoch": 2.06,
454
+ "grad_norm": 0.33773139119148254,
455
+ "learning_rate": 9.710610932475884e-05,
456
+ "loss": 0.1512,
457
  "step": 640
458
  },
459
  {
460
+ "epoch": 2.09,
461
+ "grad_norm": 0.07303290069103241,
462
+ "learning_rate": 9.54983922829582e-05,
463
+ "loss": 0.0746,
464
  "step": 650
465
  },
466
  {
467
+ "epoch": 2.12,
468
+ "grad_norm": 0.021892189979553223,
469
+ "learning_rate": 9.389067524115757e-05,
470
+ "loss": 0.0071,
471
  "step": 660
472
  },
473
  {
474
+ "epoch": 2.15,
475
+ "grad_norm": 0.699686586856842,
476
+ "learning_rate": 9.228295819935692e-05,
477
+ "loss": 0.08,
478
  "step": 670
479
  },
480
  {
481
+ "epoch": 2.19,
482
+ "grad_norm": 1.7835339307785034,
483
+ "learning_rate": 9.067524115755628e-05,
484
+ "loss": 0.092,
485
  "step": 680
486
  },
487
  {
488
+ "epoch": 2.22,
489
+ "grad_norm": 0.025796858593821526,
490
+ "learning_rate": 8.906752411575563e-05,
491
+ "loss": 0.041,
492
  "step": 690
493
  },
494
  {
495
+ "epoch": 2.25,
496
+ "grad_norm": 11.788249969482422,
497
+ "learning_rate": 8.7459807073955e-05,
498
+ "loss": 0.0269,
499
  "step": 700
500
  },
501
  {
502
+ "epoch": 2.28,
503
+ "grad_norm": 6.836824893951416,
504
+ "learning_rate": 8.585209003215434e-05,
505
+ "loss": 0.2086,
506
  "step": 710
507
  },
508
  {
509
+ "epoch": 2.32,
510
+ "grad_norm": 0.02837471477687359,
511
+ "learning_rate": 8.42443729903537e-05,
512
+ "loss": 0.0828,
513
  "step": 720
514
  },
515
  {
516
+ "epoch": 2.35,
517
+ "grad_norm": 0.04012266919016838,
518
+ "learning_rate": 8.263665594855306e-05,
519
+ "loss": 0.0057,
520
  "step": 730
521
  },
522
  {
523
+ "epoch": 2.38,
524
+ "grad_norm": 0.05077001079916954,
525
+ "learning_rate": 8.102893890675242e-05,
526
+ "loss": 0.0116,
527
  "step": 740
528
  },
529
  {
530
+ "epoch": 2.41,
531
+ "grad_norm": 0.08000744879245758,
532
+ "learning_rate": 7.942122186495177e-05,
533
+ "loss": 0.0305,
534
  "step": 750
535
  },
536
  {
537
+ "epoch": 2.44,
538
+ "grad_norm": 0.03205496072769165,
539
+ "learning_rate": 7.781350482315113e-05,
540
+ "loss": 0.0451,
541
  "step": 760
542
  },
543
  {
544
+ "epoch": 2.48,
545
+ "grad_norm": 0.027969710528850555,
546
+ "learning_rate": 7.62057877813505e-05,
547
+ "loss": 0.0779,
548
  "step": 770
549
  },
550
  {
551
+ "epoch": 2.51,
552
+ "grad_norm": 3.297053098678589,
553
+ "learning_rate": 7.459807073954984e-05,
554
+ "loss": 0.028,
555
  "step": 780
556
  },
557
  {
558
+ "epoch": 2.54,
559
+ "grad_norm": 1.8469219207763672,
560
+ "learning_rate": 7.299035369774921e-05,
561
+ "loss": 0.1222,
562
  "step": 790
563
  },
564
  {
565
+ "epoch": 2.57,
566
+ "grad_norm": 0.5228595733642578,
567
+ "learning_rate": 7.138263665594856e-05,
568
+ "loss": 0.0087,
569
  "step": 800
570
  },
571
  {
572
+ "epoch": 2.6,
573
+ "grad_norm": 0.028694279491901398,
574
+ "learning_rate": 6.977491961414792e-05,
575
+ "loss": 0.0053,
576
  "step": 810
577
  },
578
  {
579
+ "epoch": 2.64,
580
+ "grad_norm": 0.026992863044142723,
581
+ "learning_rate": 6.816720257234727e-05,
582
+ "loss": 0.0065,
583
  "step": 820
584
  },
585
  {
586
+ "epoch": 2.67,
587
+ "grad_norm": 1.996466040611267,
588
+ "learning_rate": 6.655948553054663e-05,
589
+ "loss": 0.0955,
590
  "step": 830
591
  },
592
  {
593
+ "epoch": 2.7,
594
+ "grad_norm": 0.01983807235956192,
595
+ "learning_rate": 6.495176848874598e-05,
596
+ "loss": 0.1021,
597
  "step": 840
598
  },
599
  {
600
+ "epoch": 2.73,
601
+ "grad_norm": 0.03182640299201012,
602
+ "learning_rate": 6.334405144694535e-05,
603
+ "loss": 0.1796,
604
  "step": 850
605
  },
606
  {
607
+ "epoch": 2.77,
608
+ "grad_norm": 0.049088891595602036,
609
+ "learning_rate": 6.173633440514471e-05,
610
+ "loss": 0.0907,
611
  "step": 860
612
  },
613
  {
614
+ "epoch": 2.8,
615
+ "grad_norm": 0.11043746769428253,
616
+ "learning_rate": 6.012861736334405e-05,
617
+ "loss": 0.0627,
618
  "step": 870
619
  },
620
  {
621
+ "epoch": 2.83,
622
+ "grad_norm": 0.2206079512834549,
623
+ "learning_rate": 5.8520900321543414e-05,
624
+ "loss": 0.0412,
625
  "step": 880
626
  },
627
  {
628
+ "epoch": 2.86,
629
+ "grad_norm": 0.02966146729886532,
630
+ "learning_rate": 5.6913183279742764e-05,
631
+ "loss": 0.1015,
632
  "step": 890
633
  },
634
  {
635
+ "epoch": 2.89,
636
+ "grad_norm": 0.0345352403819561,
637
+ "learning_rate": 5.530546623794213e-05,
638
+ "loss": 0.0629,
639
  "step": 900
640
  },
641
  {
642
+ "epoch": 2.93,
643
+ "grad_norm": 0.06348275393247604,
644
+ "learning_rate": 5.369774919614148e-05,
645
+ "loss": 0.0064,
646
  "step": 910
647
  },
648
  {
649
+ "epoch": 2.96,
650
+ "grad_norm": 0.06559421122074127,
651
+ "learning_rate": 5.209003215434084e-05,
652
+ "loss": 0.0191,
653
  "step": 920
654
  },
655
  {
656
+ "epoch": 2.99,
657
+ "grad_norm": 1.113765835762024,
658
+ "learning_rate": 5.048231511254019e-05,
659
+ "loss": 0.0259,
660
  "step": 930
661
  },
662
  {
663
+ "epoch": 3.02,
664
+ "grad_norm": 0.02486424334347248,
665
+ "learning_rate": 4.887459807073955e-05,
666
+ "loss": 0.0049,
667
  "step": 940
668
  },
669
  {
670
+ "epoch": 3.05,
671
+ "grad_norm": 0.7845320701599121,
672
+ "learning_rate": 4.726688102893891e-05,
673
+ "loss": 0.0043,
674
  "step": 950
675
  },
676
  {
677
+ "epoch": 3.09,
678
+ "grad_norm": 0.021990863606333733,
679
+ "learning_rate": 4.5659163987138265e-05,
680
+ "loss": 0.0042,
681
  "step": 960
682
  },
683
  {
684
+ "epoch": 3.12,
685
+ "grad_norm": 0.022443994879722595,
686
+ "learning_rate": 4.405144694533762e-05,
687
+ "loss": 0.0054,
688
  "step": 970
689
  },
690
  {
691
+ "epoch": 3.15,
692
+ "grad_norm": 0.009742784313857555,
693
+ "learning_rate": 4.244372990353698e-05,
694
+ "loss": 0.0041,
695
  "step": 980
696
  },
697
  {
698
+ "epoch": 3.18,
699
+ "grad_norm": 0.037747763097286224,
700
+ "learning_rate": 4.083601286173634e-05,
701
+ "loss": 0.0242,
702
  "step": 990
703
  },
704
  {
705
+ "epoch": 3.22,
706
+ "grad_norm": 0.010466611944139004,
707
+ "learning_rate": 3.92282958199357e-05,
708
+ "loss": 0.0035,
709
  "step": 1000
710
  },
711
  {
712
+ "epoch": 3.22,
713
+ "eval_accuracy": 0.9485530546623794,
714
+ "eval_loss": 0.21430718898773193,
715
+ "eval_runtime": 14.2198,
716
+ "eval_samples_per_second": 43.742,
717
+ "eval_steps_per_second": 5.485,
718
  "step": 1000
719
  },
720
  {
721
+ "epoch": 3.25,
722
+ "grad_norm": 0.00991890113800764,
723
+ "learning_rate": 3.7620578778135054e-05,
724
+ "loss": 0.0039,
725
  "step": 1010
726
  },
727
  {
728
+ "epoch": 3.28,
729
+ "grad_norm": 0.016740955412387848,
730
+ "learning_rate": 3.601286173633441e-05,
731
+ "loss": 0.0065,
732
  "step": 1020
733
  },
734
  {
735
+ "epoch": 3.31,
736
+ "grad_norm": 0.03466745838522911,
737
+ "learning_rate": 3.4405144694533766e-05,
738
+ "loss": 0.099,
739
  "step": 1030
740
  },
741
  {
742
+ "epoch": 3.34,
743
+ "grad_norm": 0.008615425787866116,
744
+ "learning_rate": 3.279742765273312e-05,
745
+ "loss": 0.0179,
746
  "step": 1040
747
  },
748
  {
749
+ "epoch": 3.38,
750
+ "grad_norm": 0.05827281251549721,
751
+ "learning_rate": 3.118971061093248e-05,
752
+ "loss": 0.0041,
753
  "step": 1050
754
  },
755
  {
756
+ "epoch": 3.41,
757
+ "grad_norm": 0.0276072658598423,
758
+ "learning_rate": 2.9581993569131832e-05,
759
+ "loss": 0.0036,
760
  "step": 1060
761
  },
762
  {
763
+ "epoch": 3.44,
764
+ "grad_norm": 0.011412302032113075,
765
+ "learning_rate": 2.7974276527331188e-05,
766
+ "loss": 0.1013,
767
  "step": 1070
768
  },
769
  {
770
+ "epoch": 3.47,
771
+ "grad_norm": 0.013982011005282402,
772
+ "learning_rate": 2.6366559485530545e-05,
773
+ "loss": 0.0058,
774
  "step": 1080
775
  },
776
  {
777
+ "epoch": 3.5,
778
+ "grad_norm": 0.026057597249746323,
779
+ "learning_rate": 2.4758842443729904e-05,
780
+ "loss": 0.0077,
781
  "step": 1090
782
  },
783
  {
784
+ "epoch": 3.54,
785
+ "grad_norm": 0.04853319376707077,
786
+ "learning_rate": 2.315112540192926e-05,
787
+ "loss": 0.0035,
788
  "step": 1100
789
  },
790
  {
791
+ "epoch": 3.57,
792
+ "grad_norm": 0.013841088861227036,
793
+ "learning_rate": 2.154340836012862e-05,
794
+ "loss": 0.0208,
795
  "step": 1110
796
  },
797
  {
798
+ "epoch": 3.6,
799
+ "grad_norm": 0.03845496475696564,
800
+ "learning_rate": 1.9935691318327977e-05,
801
+ "loss": 0.0038,
802
  "step": 1120
803
  },
804
  {
805
+ "epoch": 3.63,
806
+ "grad_norm": 0.023922910913825035,
807
+ "learning_rate": 1.8327974276527333e-05,
808
+ "loss": 0.0032,
809
  "step": 1130
810
  },
811
  {
812
+ "epoch": 3.67,
813
+ "grad_norm": 0.014864934608340263,
814
+ "learning_rate": 1.672025723472669e-05,
815
+ "loss": 0.0028,
816
  "step": 1140
817
  },
818
  {
819
+ "epoch": 3.7,
820
+ "grad_norm": 0.05655550956726074,
821
+ "learning_rate": 1.5112540192926044e-05,
822
+ "loss": 0.0039,
823
  "step": 1150
824
  },
825
  {
826
+ "epoch": 3.73,
827
+ "grad_norm": 0.012573642656207085,
828
+ "learning_rate": 1.3504823151125404e-05,
829
+ "loss": 0.0028,
830
  "step": 1160
831
  },
832
  {
833
+ "epoch": 3.76,
834
+ "grad_norm": 0.022632773965597153,
835
+ "learning_rate": 1.189710610932476e-05,
836
+ "loss": 0.0033,
837
  "step": 1170
838
  },
839
  {
840
+ "epoch": 3.79,
841
+ "grad_norm": 0.01279931515455246,
842
+ "learning_rate": 1.0289389067524116e-05,
843
+ "loss": 0.0238,
844
  "step": 1180
845
  },
846
  {
847
+ "epoch": 3.83,
848
+ "grad_norm": 0.023662865161895752,
849
+ "learning_rate": 8.681672025723474e-06,
850
+ "loss": 0.0253,
851
  "step": 1190
852
  },
853
  {
854
+ "epoch": 3.86,
855
+ "grad_norm": 0.017510054633021355,
856
+ "learning_rate": 7.07395498392283e-06,
857
+ "loss": 0.0047,
858
  "step": 1200
859
  },
860
  {
861
+ "epoch": 3.89,
862
+ "grad_norm": 0.0257584135979414,
863
+ "learning_rate": 5.466237942122187e-06,
864
+ "loss": 0.004,
865
  "step": 1210
866
  },
867
  {
868
+ "epoch": 3.92,
869
+ "grad_norm": 0.3079407513141632,
870
+ "learning_rate": 3.858520900321544e-06,
871
+ "loss": 0.085,
872
  "step": 1220
873
  },
874
  {
875
+ "epoch": 3.95,
876
+ "grad_norm": 0.00990583747625351,
877
+ "learning_rate": 2.2508038585209006e-06,
878
+ "loss": 0.0029,
879
  "step": 1230
880
  },
881
  {
882
+ "epoch": 3.99,
883
+ "grad_norm": 0.011738813482224941,
884
+ "learning_rate": 6.430868167202573e-07,
885
+ "loss": 0.034,
886
  "step": 1240
887
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
888
  {
889
  "epoch": 4.0,
890
+ "step": 1244,
891
+ "total_flos": 7.703325099767808e+17,
892
+ "train_loss": 0.1579143282675882,
893
+ "train_runtime": 491.3754,
894
+ "train_samples_per_second": 20.229,
895
+ "train_steps_per_second": 2.532
896
  }
897
  ],
898
  "logging_steps": 10,
899
+ "max_steps": 1244,
900
  "num_input_tokens_seen": 0,
901
  "num_train_epochs": 4,
902
  "save_steps": 1000,
903
+ "total_flos": 7.703325099767808e+17,
904
  "train_batch_size": 8,
905
  "trial_name": null,
906
  "trial_params": null