d071696 commited on
Commit
0f2d1dc
1 Parent(s): 27590db

🍻 cheers

Browse files
README.md CHANGED
@@ -1,6 +1,9 @@
1
  ---
2
  base_model: d071696/vit-finetune-scrap
3
  tags:
 
 
 
4
  - generated_from_trainer
5
  datasets:
6
  - arrow
@@ -13,7 +16,7 @@ model-index:
13
  name: Image Classification
14
  type: image-classification
15
  dataset:
16
- name: arrow
17
  type: arrow
18
  config: default
19
  split: train
@@ -29,7 +32,7 @@ should probably proofread and complete it, then remove this comment. -->
29
 
30
  # vit-finetune-scrap
31
 
32
- This model is a fine-tuned version of [d071696/vit-finetune-scrap](https://huggingface.co/d071696/vit-finetune-scrap) on the arrow dataset.
33
  It achieves the following results on the evaluation set:
34
  - Loss: 0.3599
35
  - Accuracy: 0.9260
 
1
  ---
2
  base_model: d071696/vit-finetune-scrap
3
  tags:
4
+ - image-classification
5
+ - image-feature-extraction
6
+ - image-to-text
7
  - generated_from_trainer
8
  datasets:
9
  - arrow
 
16
  name: Image Classification
17
  type: image-classification
18
  dataset:
19
+ name: d071696/scraps1
20
  type: arrow
21
  config: default
22
  split: train
 
32
 
33
  # vit-finetune-scrap
34
 
35
+ This model is a fine-tuned version of [d071696/vit-finetune-scrap](https://huggingface.co/d071696/vit-finetune-scrap) on the d071696/scraps1 dataset.
36
  It achieves the following results on the evaluation set:
37
  - Loss: 0.3599
38
  - Accuracy: 0.9260
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 4.0,
3
- "eval_accuracy": 0.954983922829582,
4
- "eval_loss": 0.15877817571163177,
5
- "eval_runtime": 9.6813,
6
- "eval_samples_per_second": 64.248,
7
- "eval_steps_per_second": 8.057,
8
  "total_flos": 7.703325099767808e+17,
9
- "train_loss": 0.15572628828410345,
10
- "train_runtime": 552.98,
11
- "train_samples_per_second": 17.975,
12
- "train_steps_per_second": 1.128
13
  }
 
1
  {
2
  "epoch": 4.0,
3
+ "eval_accuracy": 0.9260450160771704,
4
+ "eval_loss": 0.3599020838737488,
5
+ "eval_runtime": 9.9383,
6
+ "eval_samples_per_second": 62.586,
7
+ "eval_steps_per_second": 7.848,
8
  "total_flos": 7.703325099767808e+17,
9
+ "train_loss": 0.11289434264701495,
10
+ "train_runtime": 3405.5271,
11
+ "train_samples_per_second": 2.919,
12
+ "train_steps_per_second": 0.365
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 4.0,
3
- "eval_accuracy": 0.954983922829582,
4
- "eval_loss": 0.15877817571163177,
5
- "eval_runtime": 9.6813,
6
- "eval_samples_per_second": 64.248,
7
- "eval_steps_per_second": 8.057
8
  }
 
1
  {
2
  "epoch": 4.0,
3
+ "eval_accuracy": 0.9260450160771704,
4
+ "eval_loss": 0.3599020838737488,
5
+ "eval_runtime": 9.9383,
6
+ "eval_samples_per_second": 62.586,
7
+ "eval_steps_per_second": 7.848
8
  }
runs/Apr03_18-00-45_X5C922065N/events.out.tfevents.1712217271.X5C922065N.13113.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76e34509a2c262075a85ede1cd4f01009894a490b5cef5fecbb8a7a82387e238
3
+ size 734
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 4.0,
3
  "total_flos": 7.703325099767808e+17,
4
- "train_loss": 0.15572628828410345,
5
- "train_runtime": 552.98,
6
- "train_samples_per_second": 17.975,
7
- "train_steps_per_second": 1.128
8
  }
 
1
  {
2
  "epoch": 4.0,
3
  "total_flos": 7.703325099767808e+17,
4
+ "train_loss": 0.11289434264701495,
5
+ "train_runtime": 3405.5271,
6
+ "train_samples_per_second": 2.919,
7
+ "train_steps_per_second": 0.365
8
  }
trainer_state.json CHANGED
@@ -1,518 +1,907 @@
1
  {
2
- "best_metric": 0.15877817571163177,
3
- "best_model_checkpoint": "./vit-finetune-scrap/checkpoint-300",
4
  "epoch": 4.0,
5
- "eval_steps": 100,
6
- "global_step": 624,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.06,
13
- "grad_norm": 2.869983673095703,
14
- "learning_rate": 0.00019679487179487178,
15
- "loss": 1.9747,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.13,
20
- "grad_norm": 2.9758315086364746,
21
- "learning_rate": 0.0001935897435897436,
22
- "loss": 1.209,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.19,
27
- "grad_norm": 3.3387157917022705,
28
- "learning_rate": 0.00019038461538461538,
29
- "loss": 0.7205,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.26,
34
- "grad_norm": 2.921093463897705,
35
- "learning_rate": 0.0001871794871794872,
36
- "loss": 0.4159,
37
  "step": 40
38
  },
39
  {
40
- "epoch": 0.32,
41
- "grad_norm": 2.4197134971618652,
42
- "learning_rate": 0.00018397435897435897,
43
- "loss": 0.3879,
44
  "step": 50
45
  },
46
  {
47
- "epoch": 0.38,
48
- "grad_norm": 2.498006582260132,
49
- "learning_rate": 0.00018076923076923077,
50
- "loss": 0.2695,
51
  "step": 60
52
  },
53
  {
54
- "epoch": 0.45,
55
- "grad_norm": 0.9913655519485474,
56
- "learning_rate": 0.00017756410256410257,
57
- "loss": 0.33,
58
  "step": 70
59
  },
60
  {
61
- "epoch": 0.51,
62
- "grad_norm": 5.167428493499756,
63
- "learning_rate": 0.00017435897435897436,
64
- "loss": 0.2374,
65
  "step": 80
66
  },
67
  {
68
- "epoch": 0.58,
69
- "grad_norm": 5.158258438110352,
70
- "learning_rate": 0.00017115384615384616,
71
- "loss": 0.2531,
72
  "step": 90
73
  },
74
  {
75
- "epoch": 0.64,
76
- "grad_norm": 2.1700403690338135,
77
- "learning_rate": 0.00016794871794871796,
78
- "loss": 0.1672,
79
- "step": 100
80
- },
81
- {
82
- "epoch": 0.64,
83
- "eval_accuracy": 0.9485530546623794,
84
- "eval_loss": 0.22496841847896576,
85
- "eval_runtime": 11.2945,
86
- "eval_samples_per_second": 55.071,
87
- "eval_steps_per_second": 6.906,
88
  "step": 100
89
  },
90
  {
91
- "epoch": 0.71,
92
- "grad_norm": 4.994268894195557,
93
- "learning_rate": 0.00016474358974358976,
94
- "loss": 0.1319,
95
  "step": 110
96
  },
97
  {
98
- "epoch": 0.77,
99
- "grad_norm": 2.2666022777557373,
100
- "learning_rate": 0.00016153846153846155,
101
- "loss": 0.283,
102
  "step": 120
103
  },
104
  {
105
- "epoch": 0.83,
106
- "grad_norm": 3.1912319660186768,
107
- "learning_rate": 0.00015833333333333332,
108
- "loss": 0.1666,
109
  "step": 130
110
  },
111
  {
112
- "epoch": 0.9,
113
- "grad_norm": 3.1358578205108643,
114
- "learning_rate": 0.00015512820512820515,
115
- "loss": 0.2819,
116
  "step": 140
117
  },
118
  {
119
- "epoch": 0.96,
120
- "grad_norm": 2.050241470336914,
121
- "learning_rate": 0.00015192307692307692,
122
- "loss": 0.2874,
123
  "step": 150
124
  },
125
  {
126
- "epoch": 1.03,
127
- "grad_norm": 5.193918704986572,
128
- "learning_rate": 0.00014871794871794872,
129
- "loss": 0.2103,
130
  "step": 160
131
  },
132
  {
133
- "epoch": 1.09,
134
- "grad_norm": 0.405056893825531,
135
- "learning_rate": 0.00014551282051282051,
136
- "loss": 0.1818,
137
  "step": 170
138
  },
139
  {
140
- "epoch": 1.15,
141
- "grad_norm": 11.979884147644043,
142
- "learning_rate": 0.0001423076923076923,
143
- "loss": 0.1626,
144
  "step": 180
145
  },
146
  {
147
- "epoch": 1.22,
148
- "grad_norm": 0.3927968442440033,
149
- "learning_rate": 0.0001391025641025641,
150
- "loss": 0.0824,
151
  "step": 190
152
  },
153
  {
154
- "epoch": 1.28,
155
- "grad_norm": 0.11850783228874207,
156
- "learning_rate": 0.0001358974358974359,
157
- "loss": 0.1277,
158
- "step": 200
159
- },
160
- {
161
- "epoch": 1.28,
162
- "eval_accuracy": 0.9372990353697749,
163
- "eval_loss": 0.24667073786258698,
164
- "eval_runtime": 76.2101,
165
- "eval_samples_per_second": 8.162,
166
- "eval_steps_per_second": 1.023,
167
  "step": 200
168
  },
169
  {
170
- "epoch": 1.35,
171
- "grad_norm": 0.8870755434036255,
172
- "learning_rate": 0.0001326923076923077,
173
- "loss": 0.1308,
174
  "step": 210
175
  },
176
  {
177
- "epoch": 1.41,
178
- "grad_norm": 0.12857934832572937,
179
- "learning_rate": 0.0001294871794871795,
180
- "loss": 0.1047,
181
  "step": 220
182
  },
183
  {
184
- "epoch": 1.47,
185
- "grad_norm": 0.11967829614877701,
186
- "learning_rate": 0.00012628205128205127,
187
- "loss": 0.0523,
188
  "step": 230
189
  },
190
  {
191
- "epoch": 1.54,
192
- "grad_norm": 1.8435248136520386,
193
- "learning_rate": 0.0001230769230769231,
194
- "loss": 0.089,
195
  "step": 240
196
  },
197
  {
198
- "epoch": 1.6,
199
- "grad_norm": 0.07049544900655746,
200
- "learning_rate": 0.00011987179487179487,
201
- "loss": 0.0651,
202
  "step": 250
203
  },
204
  {
205
- "epoch": 1.67,
206
- "grad_norm": 7.795147895812988,
207
- "learning_rate": 0.00011666666666666668,
208
- "loss": 0.0827,
209
  "step": 260
210
  },
211
  {
212
- "epoch": 1.73,
213
- "grad_norm": 0.06726662069559097,
214
- "learning_rate": 0.00011346153846153846,
215
- "loss": 0.1727,
216
  "step": 270
217
  },
218
  {
219
- "epoch": 1.79,
220
- "grad_norm": 4.7732672691345215,
221
- "learning_rate": 0.00011025641025641027,
222
- "loss": 0.0867,
223
  "step": 280
224
  },
225
  {
226
- "epoch": 1.86,
227
- "grad_norm": 0.08257576823234558,
228
- "learning_rate": 0.00010705128205128206,
229
- "loss": 0.0349,
230
  "step": 290
231
  },
232
  {
233
- "epoch": 1.92,
234
- "grad_norm": 0.15855157375335693,
235
- "learning_rate": 0.00010384615384615386,
236
- "loss": 0.0253,
237
- "step": 300
238
- },
239
- {
240
- "epoch": 1.92,
241
- "eval_accuracy": 0.954983922829582,
242
- "eval_loss": 0.15877817571163177,
243
- "eval_runtime": 9.849,
244
- "eval_samples_per_second": 63.153,
245
- "eval_steps_per_second": 7.92,
246
  "step": 300
247
  },
248
  {
249
- "epoch": 1.99,
250
- "grad_norm": 11.21109676361084,
251
- "learning_rate": 0.00010064102564102564,
252
- "loss": 0.0988,
253
  "step": 310
254
  },
255
  {
256
- "epoch": 2.05,
257
- "grad_norm": 0.06449388712644577,
258
- "learning_rate": 9.743589743589744e-05,
259
- "loss": 0.0666,
260
  "step": 320
261
  },
262
  {
263
- "epoch": 2.12,
264
- "grad_norm": 0.988405168056488,
265
- "learning_rate": 9.423076923076924e-05,
266
- "loss": 0.0295,
267
  "step": 330
268
  },
269
  {
270
- "epoch": 2.18,
271
- "grad_norm": 0.06675518304109573,
272
- "learning_rate": 9.102564102564103e-05,
273
- "loss": 0.018,
274
  "step": 340
275
  },
276
  {
277
- "epoch": 2.24,
278
- "grad_norm": 0.08486536890268326,
279
- "learning_rate": 8.782051282051283e-05,
280
- "loss": 0.0714,
281
  "step": 350
282
  },
283
  {
284
- "epoch": 2.31,
285
- "grad_norm": 0.05260853096842766,
286
- "learning_rate": 8.461538461538461e-05,
287
- "loss": 0.0354,
288
  "step": 360
289
  },
290
  {
291
- "epoch": 2.37,
292
- "grad_norm": 0.053938668221235275,
293
- "learning_rate": 8.141025641025641e-05,
294
- "loss": 0.0548,
295
  "step": 370
296
  },
297
  {
298
- "epoch": 2.44,
299
- "grad_norm": 0.06470278650522232,
300
- "learning_rate": 7.820512820512821e-05,
301
- "loss": 0.0162,
302
  "step": 380
303
  },
304
  {
305
- "epoch": 2.5,
306
- "grad_norm": 0.0850602388381958,
307
- "learning_rate": 7.500000000000001e-05,
308
- "loss": 0.033,
309
  "step": 390
310
  },
311
  {
312
- "epoch": 2.56,
313
- "grad_norm": 0.04342366382479668,
314
- "learning_rate": 7.17948717948718e-05,
315
- "loss": 0.0224,
316
- "step": 400
317
- },
318
- {
319
- "epoch": 2.56,
320
- "eval_accuracy": 0.9533762057877814,
321
- "eval_loss": 0.16905710101127625,
322
- "eval_runtime": 9.8491,
323
- "eval_samples_per_second": 63.153,
324
- "eval_steps_per_second": 7.92,
325
  "step": 400
326
  },
327
  {
328
- "epoch": 2.63,
329
- "grad_norm": 0.05912560597062111,
330
- "learning_rate": 6.858974358974359e-05,
331
- "loss": 0.0503,
332
  "step": 410
333
  },
334
  {
335
- "epoch": 2.69,
336
- "grad_norm": 0.04359501227736473,
337
- "learning_rate": 6.538461538461539e-05,
338
- "loss": 0.0537,
339
  "step": 420
340
  },
341
  {
342
- "epoch": 2.76,
343
- "grad_norm": 0.0935799852013588,
344
- "learning_rate": 6.217948717948718e-05,
345
- "loss": 0.0145,
346
  "step": 430
347
  },
348
  {
349
- "epoch": 2.82,
350
- "grad_norm": 0.05057013779878616,
351
- "learning_rate": 5.897435897435898e-05,
352
- "loss": 0.0132,
353
  "step": 440
354
  },
355
  {
356
- "epoch": 2.88,
357
- "grad_norm": 0.4015754461288452,
358
- "learning_rate": 5.576923076923077e-05,
359
- "loss": 0.0382,
360
  "step": 450
361
  },
362
  {
363
- "epoch": 2.95,
364
- "grad_norm": 0.03421848267316818,
365
- "learning_rate": 5.256410256410257e-05,
366
- "loss": 0.0123,
367
  "step": 460
368
  },
369
  {
370
- "epoch": 3.01,
371
- "grad_norm": 0.043205052614212036,
372
- "learning_rate": 4.935897435897436e-05,
373
- "loss": 0.0417,
374
  "step": 470
375
  },
376
  {
377
- "epoch": 3.08,
378
- "grad_norm": 0.03611929342150688,
379
- "learning_rate": 4.615384615384616e-05,
380
- "loss": 0.0125,
381
  "step": 480
382
  },
383
  {
384
- "epoch": 3.14,
385
- "grad_norm": 0.04078483581542969,
386
- "learning_rate": 4.294871794871795e-05,
387
- "loss": 0.0193,
388
  "step": 490
389
  },
390
  {
391
- "epoch": 3.21,
392
- "grad_norm": 0.043482307344675064,
393
- "learning_rate": 3.974358974358974e-05,
394
- "loss": 0.0321,
395
  "step": 500
396
  },
397
  {
398
- "epoch": 3.21,
399
- "eval_accuracy": 0.9565916398713826,
400
- "eval_loss": 0.17511475086212158,
401
- "eval_runtime": 9.8922,
402
- "eval_samples_per_second": 62.878,
403
- "eval_steps_per_second": 7.885,
404
- "step": 500
405
- },
406
- {
407
- "epoch": 3.27,
408
- "grad_norm": 0.03763527050614357,
409
- "learning_rate": 3.653846153846154e-05,
410
- "loss": 0.0118,
411
  "step": 510
412
  },
413
  {
414
- "epoch": 3.33,
415
- "grad_norm": 0.05929262936115265,
416
- "learning_rate": 3.3333333333333335e-05,
417
- "loss": 0.0636,
418
  "step": 520
419
  },
420
  {
421
- "epoch": 3.4,
422
- "grad_norm": 0.039751436561346054,
423
- "learning_rate": 3.012820512820513e-05,
424
- "loss": 0.0105,
425
  "step": 530
426
  },
427
  {
428
- "epoch": 3.46,
429
- "grad_norm": 0.03735564276576042,
430
- "learning_rate": 2.6923076923076923e-05,
431
- "loss": 0.0495,
432
  "step": 540
433
  },
434
  {
435
- "epoch": 3.53,
436
- "grad_norm": 0.051983293145895004,
437
- "learning_rate": 2.3717948717948718e-05,
438
- "loss": 0.011,
439
  "step": 550
440
  },
441
  {
442
- "epoch": 3.59,
443
- "grad_norm": 0.034572675824165344,
444
- "learning_rate": 2.0512820512820512e-05,
445
- "loss": 0.0109,
446
  "step": 560
447
  },
448
  {
449
- "epoch": 3.65,
450
- "grad_norm": 0.04169879108667374,
451
- "learning_rate": 1.730769230769231e-05,
452
- "loss": 0.0108,
453
  "step": 570
454
  },
455
  {
456
- "epoch": 3.72,
457
- "grad_norm": 0.032876156270504,
458
- "learning_rate": 1.4102564102564104e-05,
459
- "loss": 0.0104,
460
  "step": 580
461
  },
462
  {
463
- "epoch": 3.78,
464
- "grad_norm": 0.03522384166717529,
465
- "learning_rate": 1.0897435897435898e-05,
466
- "loss": 0.0109,
467
  "step": 590
468
  },
469
  {
470
- "epoch": 3.85,
471
- "grad_norm": 0.11409874260425568,
472
- "learning_rate": 7.692307692307694e-06,
473
- "loss": 0.0112,
474
  "step": 600
475
  },
476
  {
477
- "epoch": 3.85,
478
- "eval_accuracy": 0.954983922829582,
479
- "eval_loss": 0.18050101399421692,
480
- "eval_runtime": 9.8888,
481
- "eval_samples_per_second": 62.899,
482
- "eval_steps_per_second": 7.888,
483
- "step": 600
484
- },
485
- {
486
- "epoch": 3.91,
487
- "grad_norm": 0.04359051212668419,
488
- "learning_rate": 4.487179487179488e-06,
489
- "loss": 0.0109,
490
  "step": 610
491
  },
492
  {
493
- "epoch": 3.97,
494
- "grad_norm": 0.03071708232164383,
495
- "learning_rate": 1.282051282051282e-06,
496
- "loss": 0.0429,
497
  "step": 620
498
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
499
  {
500
  "epoch": 4.0,
501
- "step": 624,
502
  "total_flos": 7.703325099767808e+17,
503
- "train_loss": 0.15572628828410345,
504
- "train_runtime": 552.98,
505
- "train_samples_per_second": 17.975,
506
- "train_steps_per_second": 1.128
507
  }
508
  ],
509
  "logging_steps": 10,
510
- "max_steps": 624,
511
  "num_input_tokens_seen": 0,
512
  "num_train_epochs": 4,
513
- "save_steps": 100,
514
  "total_flos": 7.703325099767808e+17,
515
- "train_batch_size": 16,
516
  "trial_name": null,
517
  "trial_params": null
518
  }
 
1
  {
2
+ "best_metric": 0.3599020838737488,
3
+ "best_model_checkpoint": "./vit-finetune-scrap/checkpoint-1000",
4
  "epoch": 4.0,
5
+ "eval_steps": 1000,
6
+ "global_step": 1244,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.03,
13
+ "grad_norm": 0.7885215282440186,
14
+ "learning_rate": 0.00019839228295819936,
15
+ "loss": 0.3445,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.06,
20
+ "grad_norm": 0.6006595492362976,
21
+ "learning_rate": 0.00019678456591639874,
22
+ "loss": 0.0458,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.1,
27
+ "grad_norm": 0.0847816988825798,
28
+ "learning_rate": 0.00019517684887459809,
29
+ "loss": 0.1535,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.13,
34
+ "grad_norm": 10.684067726135254,
35
+ "learning_rate": 0.00019356913183279743,
36
+ "loss": 0.1208,
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 0.16,
41
+ "grad_norm": 0.08578234165906906,
42
+ "learning_rate": 0.00019196141479099678,
43
+ "loss": 0.0208,
44
  "step": 50
45
  },
46
  {
47
+ "epoch": 0.19,
48
+ "grad_norm": 0.06457830220460892,
49
+ "learning_rate": 0.00019035369774919616,
50
+ "loss": 0.0838,
51
  "step": 60
52
  },
53
  {
54
+ "epoch": 0.23,
55
+ "grad_norm": 12.919880867004395,
56
+ "learning_rate": 0.0001887459807073955,
57
+ "loss": 0.0491,
58
  "step": 70
59
  },
60
  {
61
+ "epoch": 0.26,
62
+ "grad_norm": 10.229437828063965,
63
+ "learning_rate": 0.00018713826366559486,
64
+ "loss": 0.1614,
65
  "step": 80
66
  },
67
  {
68
+ "epoch": 0.29,
69
+ "grad_norm": 0.03680131584405899,
70
+ "learning_rate": 0.0001855305466237942,
71
+ "loss": 0.2226,
72
  "step": 90
73
  },
74
  {
75
+ "epoch": 0.32,
76
+ "grad_norm": 1.575692892074585,
77
+ "learning_rate": 0.0001839228295819936,
78
+ "loss": 0.3464,
 
 
 
 
 
 
 
 
 
79
  "step": 100
80
  },
81
  {
82
+ "epoch": 0.35,
83
+ "grad_norm": 11.554421424865723,
84
+ "learning_rate": 0.00018231511254019294,
85
+ "loss": 0.267,
86
  "step": 110
87
  },
88
  {
89
+ "epoch": 0.39,
90
+ "grad_norm": 11.113668441772461,
91
+ "learning_rate": 0.00018070739549839229,
92
+ "loss": 0.3107,
93
  "step": 120
94
  },
95
  {
96
+ "epoch": 0.42,
97
+ "grad_norm": 5.346070766448975,
98
+ "learning_rate": 0.00017909967845659166,
99
+ "loss": 0.6457,
100
  "step": 130
101
  },
102
  {
103
+ "epoch": 0.45,
104
+ "grad_norm": 31.744802474975586,
105
+ "learning_rate": 0.000177491961414791,
106
+ "loss": 0.4913,
107
  "step": 140
108
  },
109
  {
110
+ "epoch": 0.48,
111
+ "grad_norm": 6.202216148376465,
112
+ "learning_rate": 0.00017588424437299036,
113
+ "loss": 0.6627,
114
  "step": 150
115
  },
116
  {
117
+ "epoch": 0.51,
118
+ "grad_norm": 12.808435440063477,
119
+ "learning_rate": 0.0001742765273311897,
120
+ "loss": 0.4071,
121
  "step": 160
122
  },
123
  {
124
+ "epoch": 0.55,
125
+ "grad_norm": 0.17637468874454498,
126
+ "learning_rate": 0.0001726688102893891,
127
+ "loss": 0.2312,
128
  "step": 170
129
  },
130
  {
131
+ "epoch": 0.58,
132
+ "grad_norm": 9.431242942810059,
133
+ "learning_rate": 0.00017106109324758844,
134
+ "loss": 0.1206,
135
  "step": 180
136
  },
137
  {
138
+ "epoch": 0.61,
139
+ "grad_norm": 0.0686856061220169,
140
+ "learning_rate": 0.0001694533762057878,
141
+ "loss": 0.2657,
142
  "step": 190
143
  },
144
  {
145
+ "epoch": 0.64,
146
+ "grad_norm": 0.035174936056137085,
147
+ "learning_rate": 0.00016784565916398716,
148
+ "loss": 0.2447,
 
 
 
 
 
 
 
 
 
149
  "step": 200
150
  },
151
  {
152
+ "epoch": 0.68,
153
+ "grad_norm": 0.20221814513206482,
154
+ "learning_rate": 0.0001662379421221865,
155
+ "loss": 0.0932,
156
  "step": 210
157
  },
158
  {
159
+ "epoch": 0.71,
160
+ "grad_norm": 22.348093032836914,
161
+ "learning_rate": 0.00016463022508038586,
162
+ "loss": 0.5622,
163
  "step": 220
164
  },
165
  {
166
+ "epoch": 0.74,
167
+ "grad_norm": 0.09363432973623276,
168
+ "learning_rate": 0.0001630225080385852,
169
+ "loss": 0.1989,
170
  "step": 230
171
  },
172
  {
173
+ "epoch": 0.77,
174
+ "grad_norm": 0.10725115239620209,
175
+ "learning_rate": 0.0001614147909967846,
176
+ "loss": 0.1174,
177
  "step": 240
178
  },
179
  {
180
+ "epoch": 0.8,
181
+ "grad_norm": 2.2332615852355957,
182
+ "learning_rate": 0.00015980707395498394,
183
+ "loss": 0.3852,
184
  "step": 250
185
  },
186
  {
187
+ "epoch": 0.84,
188
+ "grad_norm": 0.08053239434957504,
189
+ "learning_rate": 0.0001581993569131833,
190
+ "loss": 0.2247,
191
  "step": 260
192
  },
193
  {
194
+ "epoch": 0.87,
195
+ "grad_norm": 0.21441006660461426,
196
+ "learning_rate": 0.00015659163987138264,
197
+ "loss": 0.2879,
198
  "step": 270
199
  },
200
  {
201
+ "epoch": 0.9,
202
+ "grad_norm": 0.0631105974316597,
203
+ "learning_rate": 0.00015498392282958201,
204
+ "loss": 0.2922,
205
  "step": 280
206
  },
207
  {
208
+ "epoch": 0.93,
209
+ "grad_norm": 20.96392822265625,
210
+ "learning_rate": 0.00015337620578778136,
211
+ "loss": 0.0413,
212
  "step": 290
213
  },
214
  {
215
+ "epoch": 0.96,
216
+ "grad_norm": 0.4054552912712097,
217
+ "learning_rate": 0.0001517684887459807,
218
+ "loss": 0.1037,
 
 
 
 
 
 
 
 
 
219
  "step": 300
220
  },
221
  {
222
+ "epoch": 1.0,
223
+ "grad_norm": 1.8209251165390015,
224
+ "learning_rate": 0.0001501607717041801,
225
+ "loss": 0.1802,
226
  "step": 310
227
  },
228
  {
229
+ "epoch": 1.03,
230
+ "grad_norm": 11.979750633239746,
231
+ "learning_rate": 0.00014855305466237944,
232
+ "loss": 0.1788,
233
  "step": 320
234
  },
235
  {
236
+ "epoch": 1.06,
237
+ "grad_norm": 0.10045678168535233,
238
+ "learning_rate": 0.0001469453376205788,
239
+ "loss": 0.4287,
240
  "step": 330
241
  },
242
  {
243
+ "epoch": 1.09,
244
+ "grad_norm": 0.051606420427560806,
245
+ "learning_rate": 0.00014533762057877814,
246
+ "loss": 0.1055,
247
  "step": 340
248
  },
249
  {
250
+ "epoch": 1.13,
251
+ "grad_norm": 0.20983509719371796,
252
+ "learning_rate": 0.00014372990353697752,
253
+ "loss": 0.1667,
254
  "step": 350
255
  },
256
  {
257
+ "epoch": 1.16,
258
+ "grad_norm": 0.15858705341815948,
259
+ "learning_rate": 0.00014212218649517686,
260
+ "loss": 0.1135,
261
  "step": 360
262
  },
263
  {
264
+ "epoch": 1.19,
265
+ "grad_norm": 1.420276165008545,
266
+ "learning_rate": 0.00014051446945337621,
267
+ "loss": 0.2112,
268
  "step": 370
269
  },
270
  {
271
+ "epoch": 1.22,
272
+ "grad_norm": 0.02672198973596096,
273
+ "learning_rate": 0.0001389067524115756,
274
+ "loss": 0.0313,
275
  "step": 380
276
  },
277
  {
278
+ "epoch": 1.25,
279
+ "grad_norm": 0.02562582679092884,
280
+ "learning_rate": 0.00013729903536977494,
281
+ "loss": 0.1069,
282
  "step": 390
283
  },
284
  {
285
+ "epoch": 1.29,
286
+ "grad_norm": 4.197677135467529,
287
+ "learning_rate": 0.0001356913183279743,
288
+ "loss": 0.2342,
 
 
 
 
 
 
 
 
 
289
  "step": 400
290
  },
291
  {
292
+ "epoch": 1.32,
293
+ "grad_norm": 0.04306723549962044,
294
+ "learning_rate": 0.00013408360128617364,
295
+ "loss": 0.1313,
296
  "step": 410
297
  },
298
  {
299
+ "epoch": 1.35,
300
+ "grad_norm": 4.383702278137207,
301
+ "learning_rate": 0.00013247588424437302,
302
+ "loss": 0.2145,
303
  "step": 420
304
  },
305
  {
306
+ "epoch": 1.38,
307
+ "grad_norm": 0.20137301087379456,
308
+ "learning_rate": 0.00013086816720257237,
309
+ "loss": 0.1514,
310
  "step": 430
311
  },
312
  {
313
+ "epoch": 1.41,
314
+ "grad_norm": 0.020689483731985092,
315
+ "learning_rate": 0.00012926045016077172,
316
+ "loss": 0.0712,
317
  "step": 440
318
  },
319
  {
320
+ "epoch": 1.45,
321
+ "grad_norm": 0.018355347216129303,
322
+ "learning_rate": 0.00012765273311897106,
323
+ "loss": 0.0923,
324
  "step": 450
325
  },
326
  {
327
+ "epoch": 1.48,
328
+ "grad_norm": 17.85742950439453,
329
+ "learning_rate": 0.00012604501607717044,
330
+ "loss": 0.0779,
331
  "step": 460
332
  },
333
  {
334
+ "epoch": 1.51,
335
+ "grad_norm": 0.05268234387040138,
336
+ "learning_rate": 0.0001244372990353698,
337
+ "loss": 0.0096,
338
  "step": 470
339
  },
340
  {
341
+ "epoch": 1.54,
342
+ "grad_norm": 0.09329644590616226,
343
+ "learning_rate": 0.00012282958199356914,
344
+ "loss": 0.0392,
345
  "step": 480
346
  },
347
  {
348
+ "epoch": 1.58,
349
+ "grad_norm": 8.043782234191895,
350
+ "learning_rate": 0.0001212218649517685,
351
+ "loss": 0.1322,
352
  "step": 490
353
  },
354
  {
355
+ "epoch": 1.61,
356
+ "grad_norm": 0.016368461772799492,
357
+ "learning_rate": 0.00011961414790996785,
358
+ "loss": 0.0843,
359
  "step": 500
360
  },
361
  {
362
+ "epoch": 1.64,
363
+ "grad_norm": 13.989496231079102,
364
+ "learning_rate": 0.0001180064308681672,
365
+ "loss": 0.2665,
 
 
 
 
 
 
 
 
 
366
  "step": 510
367
  },
368
  {
369
+ "epoch": 1.67,
370
+ "grad_norm": 14.703727722167969,
371
+ "learning_rate": 0.00011639871382636655,
372
+ "loss": 0.1551,
373
  "step": 520
374
  },
375
  {
376
+ "epoch": 1.7,
377
+ "grad_norm": 0.13277527689933777,
378
+ "learning_rate": 0.00011479099678456593,
379
+ "loss": 0.1346,
380
  "step": 530
381
  },
382
  {
383
+ "epoch": 1.74,
384
+ "grad_norm": 0.04265744984149933,
385
+ "learning_rate": 0.00011318327974276528,
386
+ "loss": 0.1725,
387
  "step": 540
388
  },
389
  {
390
+ "epoch": 1.77,
391
+ "grad_norm": 0.04861054942011833,
392
+ "learning_rate": 0.00011157556270096463,
393
+ "loss": 0.0696,
394
  "step": 550
395
  },
396
  {
397
+ "epoch": 1.8,
398
+ "grad_norm": 0.015978263691067696,
399
+ "learning_rate": 0.00010996784565916398,
400
+ "loss": 0.0583,
401
  "step": 560
402
  },
403
  {
404
+ "epoch": 1.83,
405
+ "grad_norm": 0.21797218918800354,
406
+ "learning_rate": 0.00010836012861736335,
407
+ "loss": 0.1746,
408
  "step": 570
409
  },
410
  {
411
+ "epoch": 1.86,
412
+ "grad_norm": 16.11418342590332,
413
+ "learning_rate": 0.0001067524115755627,
414
+ "loss": 0.2571,
415
  "step": 580
416
  },
417
  {
418
+ "epoch": 1.9,
419
+ "grad_norm": 0.025191914290189743,
420
+ "learning_rate": 0.00010514469453376205,
421
+ "loss": 0.1326,
422
  "step": 590
423
  },
424
  {
425
+ "epoch": 1.93,
426
+ "grad_norm": 0.03328488767147064,
427
+ "learning_rate": 0.00010353697749196143,
428
+ "loss": 0.1601,
429
  "step": 600
430
  },
431
  {
432
+ "epoch": 1.96,
433
+ "grad_norm": 0.017355144023895264,
434
+ "learning_rate": 0.00010192926045016078,
435
+ "loss": 0.1607,
 
 
 
 
 
 
 
 
 
436
  "step": 610
437
  },
438
  {
439
+ "epoch": 1.99,
440
+ "grad_norm": 0.15774357318878174,
441
+ "learning_rate": 0.00010032154340836013,
442
+ "loss": 0.0182,
443
  "step": 620
444
  },
445
+ {
446
+ "epoch": 2.03,
447
+ "grad_norm": 0.039228443056344986,
448
+ "learning_rate": 9.871382636655949e-05,
449
+ "loss": 0.0898,
450
+ "step": 630
451
+ },
452
+ {
453
+ "epoch": 2.06,
454
+ "grad_norm": 27.718074798583984,
455
+ "learning_rate": 9.710610932475884e-05,
456
+ "loss": 0.1895,
457
+ "step": 640
458
+ },
459
+ {
460
+ "epoch": 2.09,
461
+ "grad_norm": 0.10635074228048325,
462
+ "learning_rate": 9.54983922829582e-05,
463
+ "loss": 0.0805,
464
+ "step": 650
465
+ },
466
+ {
467
+ "epoch": 2.12,
468
+ "grad_norm": 0.01466443482786417,
469
+ "learning_rate": 9.389067524115757e-05,
470
+ "loss": 0.0697,
471
+ "step": 660
472
+ },
473
+ {
474
+ "epoch": 2.15,
475
+ "grad_norm": 0.037685755640268326,
476
+ "learning_rate": 9.228295819935692e-05,
477
+ "loss": 0.0031,
478
+ "step": 670
479
+ },
480
+ {
481
+ "epoch": 2.19,
482
+ "grad_norm": 0.009445942007005215,
483
+ "learning_rate": 9.067524115755628e-05,
484
+ "loss": 0.0636,
485
+ "step": 680
486
+ },
487
+ {
488
+ "epoch": 2.22,
489
+ "grad_norm": 0.011714440770447254,
490
+ "learning_rate": 8.906752411575563e-05,
491
+ "loss": 0.0554,
492
+ "step": 690
493
+ },
494
+ {
495
+ "epoch": 2.25,
496
+ "grad_norm": 0.018049364909529686,
497
+ "learning_rate": 8.7459807073955e-05,
498
+ "loss": 0.0073,
499
+ "step": 700
500
+ },
501
+ {
502
+ "epoch": 2.28,
503
+ "grad_norm": 0.013684015721082687,
504
+ "learning_rate": 8.585209003215434e-05,
505
+ "loss": 0.004,
506
+ "step": 710
507
+ },
508
+ {
509
+ "epoch": 2.32,
510
+ "grad_norm": 0.0740511417388916,
511
+ "learning_rate": 8.42443729903537e-05,
512
+ "loss": 0.0724,
513
+ "step": 720
514
+ },
515
+ {
516
+ "epoch": 2.35,
517
+ "grad_norm": 0.0066555822268128395,
518
+ "learning_rate": 8.263665594855306e-05,
519
+ "loss": 0.0029,
520
+ "step": 730
521
+ },
522
+ {
523
+ "epoch": 2.38,
524
+ "grad_norm": 0.00826901663094759,
525
+ "learning_rate": 8.102893890675242e-05,
526
+ "loss": 0.0745,
527
+ "step": 740
528
+ },
529
+ {
530
+ "epoch": 2.41,
531
+ "grad_norm": 0.009980367496609688,
532
+ "learning_rate": 7.942122186495177e-05,
533
+ "loss": 0.0026,
534
+ "step": 750
535
+ },
536
+ {
537
+ "epoch": 2.44,
538
+ "grad_norm": 7.373602390289307,
539
+ "learning_rate": 7.781350482315113e-05,
540
+ "loss": 0.1221,
541
+ "step": 760
542
+ },
543
+ {
544
+ "epoch": 2.48,
545
+ "grad_norm": 0.011884603649377823,
546
+ "learning_rate": 7.62057877813505e-05,
547
+ "loss": 0.0404,
548
+ "step": 770
549
+ },
550
+ {
551
+ "epoch": 2.51,
552
+ "grad_norm": 28.61383628845215,
553
+ "learning_rate": 7.459807073954984e-05,
554
+ "loss": 0.0483,
555
+ "step": 780
556
+ },
557
+ {
558
+ "epoch": 2.54,
559
+ "grad_norm": 0.006681604776531458,
560
+ "learning_rate": 7.299035369774921e-05,
561
+ "loss": 0.0729,
562
+ "step": 790
563
+ },
564
+ {
565
+ "epoch": 2.57,
566
+ "grad_norm": 0.02536676451563835,
567
+ "learning_rate": 7.138263665594856e-05,
568
+ "loss": 0.0023,
569
+ "step": 800
570
+ },
571
+ {
572
+ "epoch": 2.6,
573
+ "grad_norm": 0.016948092728853226,
574
+ "learning_rate": 6.977491961414792e-05,
575
+ "loss": 0.0426,
576
+ "step": 810
577
+ },
578
+ {
579
+ "epoch": 2.64,
580
+ "grad_norm": 0.021912137046456337,
581
+ "learning_rate": 6.816720257234727e-05,
582
+ "loss": 0.064,
583
+ "step": 820
584
+ },
585
+ {
586
+ "epoch": 2.67,
587
+ "grad_norm": 0.014551441185176373,
588
+ "learning_rate": 6.655948553054663e-05,
589
+ "loss": 0.065,
590
+ "step": 830
591
+ },
592
+ {
593
+ "epoch": 2.7,
594
+ "grad_norm": 0.3249826431274414,
595
+ "learning_rate": 6.495176848874598e-05,
596
+ "loss": 0.1171,
597
+ "step": 840
598
+ },
599
+ {
600
+ "epoch": 2.73,
601
+ "grad_norm": 17.302183151245117,
602
+ "learning_rate": 6.334405144694535e-05,
603
+ "loss": 0.066,
604
+ "step": 850
605
+ },
606
+ {
607
+ "epoch": 2.77,
608
+ "grad_norm": 0.03599601984024048,
609
+ "learning_rate": 6.173633440514471e-05,
610
+ "loss": 0.0203,
611
+ "step": 860
612
+ },
613
+ {
614
+ "epoch": 2.8,
615
+ "grad_norm": 0.011334868147969246,
616
+ "learning_rate": 6.012861736334405e-05,
617
+ "loss": 0.0188,
618
+ "step": 870
619
+ },
620
+ {
621
+ "epoch": 2.83,
622
+ "grad_norm": 0.012830100953578949,
623
+ "learning_rate": 5.8520900321543414e-05,
624
+ "loss": 0.0028,
625
+ "step": 880
626
+ },
627
+ {
628
+ "epoch": 2.86,
629
+ "grad_norm": 0.09628697484731674,
630
+ "learning_rate": 5.6913183279742764e-05,
631
+ "loss": 0.0857,
632
+ "step": 890
633
+ },
634
+ {
635
+ "epoch": 2.89,
636
+ "grad_norm": 0.008330035023391247,
637
+ "learning_rate": 5.530546623794213e-05,
638
+ "loss": 0.0018,
639
+ "step": 900
640
+ },
641
+ {
642
+ "epoch": 2.93,
643
+ "grad_norm": 0.007389857899397612,
644
+ "learning_rate": 5.369774919614148e-05,
645
+ "loss": 0.0019,
646
+ "step": 910
647
+ },
648
+ {
649
+ "epoch": 2.96,
650
+ "grad_norm": 0.011146631091833115,
651
+ "learning_rate": 5.209003215434084e-05,
652
+ "loss": 0.104,
653
+ "step": 920
654
+ },
655
+ {
656
+ "epoch": 2.99,
657
+ "grad_norm": 0.021591678261756897,
658
+ "learning_rate": 5.048231511254019e-05,
659
+ "loss": 0.0016,
660
+ "step": 930
661
+ },
662
+ {
663
+ "epoch": 3.02,
664
+ "grad_norm": 0.09255488216876984,
665
+ "learning_rate": 4.887459807073955e-05,
666
+ "loss": 0.1024,
667
+ "step": 940
668
+ },
669
+ {
670
+ "epoch": 3.05,
671
+ "grad_norm": 0.01300421915948391,
672
+ "learning_rate": 4.726688102893891e-05,
673
+ "loss": 0.0019,
674
+ "step": 950
675
+ },
676
+ {
677
+ "epoch": 3.09,
678
+ "grad_norm": 0.009576304815709591,
679
+ "learning_rate": 4.5659163987138265e-05,
680
+ "loss": 0.0036,
681
+ "step": 960
682
+ },
683
+ {
684
+ "epoch": 3.12,
685
+ "grad_norm": 0.006058481056243181,
686
+ "learning_rate": 4.405144694533762e-05,
687
+ "loss": 0.0333,
688
+ "step": 970
689
+ },
690
+ {
691
+ "epoch": 3.15,
692
+ "grad_norm": 0.00888640247285366,
693
+ "learning_rate": 4.244372990353698e-05,
694
+ "loss": 0.0026,
695
+ "step": 980
696
+ },
697
+ {
698
+ "epoch": 3.18,
699
+ "grad_norm": 0.008608637377619743,
700
+ "learning_rate": 4.083601286173634e-05,
701
+ "loss": 0.002,
702
+ "step": 990
703
+ },
704
+ {
705
+ "epoch": 3.22,
706
+ "grad_norm": 0.017658203840255737,
707
+ "learning_rate": 3.92282958199357e-05,
708
+ "loss": 0.0021,
709
+ "step": 1000
710
+ },
711
+ {
712
+ "epoch": 3.22,
713
+ "eval_accuracy": 0.9260450160771704,
714
+ "eval_loss": 0.3599020838737488,
715
+ "eval_runtime": 19.6251,
716
+ "eval_samples_per_second": 31.694,
717
+ "eval_steps_per_second": 3.975,
718
+ "step": 1000
719
+ },
720
+ {
721
+ "epoch": 3.25,
722
+ "grad_norm": 0.015870286151766777,
723
+ "learning_rate": 3.7620578778135054e-05,
724
+ "loss": 0.002,
725
+ "step": 1010
726
+ },
727
+ {
728
+ "epoch": 3.28,
729
+ "grad_norm": 0.006019009742885828,
730
+ "learning_rate": 3.601286173633441e-05,
731
+ "loss": 0.0021,
732
+ "step": 1020
733
+ },
734
+ {
735
+ "epoch": 3.31,
736
+ "grad_norm": 0.02200642041862011,
737
+ "learning_rate": 3.4405144694533766e-05,
738
+ "loss": 0.0484,
739
+ "step": 1030
740
+ },
741
+ {
742
+ "epoch": 3.34,
743
+ "grad_norm": 0.00513369170948863,
744
+ "learning_rate": 3.279742765273312e-05,
745
+ "loss": 0.0021,
746
+ "step": 1040
747
+ },
748
+ {
749
+ "epoch": 3.38,
750
+ "grad_norm": 0.0066593969240784645,
751
+ "learning_rate": 3.118971061093248e-05,
752
+ "loss": 0.0017,
753
+ "step": 1050
754
+ },
755
+ {
756
+ "epoch": 3.41,
757
+ "grad_norm": 0.010374795645475388,
758
+ "learning_rate": 2.9581993569131832e-05,
759
+ "loss": 0.0015,
760
+ "step": 1060
761
+ },
762
+ {
763
+ "epoch": 3.44,
764
+ "grad_norm": 0.006329710595309734,
765
+ "learning_rate": 2.7974276527331188e-05,
766
+ "loss": 0.0727,
767
+ "step": 1070
768
+ },
769
+ {
770
+ "epoch": 3.47,
771
+ "grad_norm": 0.012291769497096539,
772
+ "learning_rate": 2.6366559485530545e-05,
773
+ "loss": 0.0022,
774
+ "step": 1080
775
+ },
776
+ {
777
+ "epoch": 3.5,
778
+ "grad_norm": 0.009208714589476585,
779
+ "learning_rate": 2.4758842443729904e-05,
780
+ "loss": 0.0017,
781
+ "step": 1090
782
+ },
783
+ {
784
+ "epoch": 3.54,
785
+ "grad_norm": 0.008606062270700932,
786
+ "learning_rate": 2.315112540192926e-05,
787
+ "loss": 0.0355,
788
+ "step": 1100
789
+ },
790
+ {
791
+ "epoch": 3.57,
792
+ "grad_norm": 0.005816610064357519,
793
+ "learning_rate": 2.154340836012862e-05,
794
+ "loss": 0.0024,
795
+ "step": 1110
796
+ },
797
+ {
798
+ "epoch": 3.6,
799
+ "grad_norm": 0.016983453184366226,
800
+ "learning_rate": 1.9935691318327977e-05,
801
+ "loss": 0.0022,
802
+ "step": 1120
803
+ },
804
+ {
805
+ "epoch": 3.63,
806
+ "grad_norm": 0.013002891093492508,
807
+ "learning_rate": 1.8327974276527333e-05,
808
+ "loss": 0.0027,
809
+ "step": 1130
810
+ },
811
+ {
812
+ "epoch": 3.67,
813
+ "grad_norm": 0.004919551312923431,
814
+ "learning_rate": 1.672025723472669e-05,
815
+ "loss": 0.0038,
816
+ "step": 1140
817
+ },
818
+ {
819
+ "epoch": 3.7,
820
+ "grad_norm": 0.01140748430043459,
821
+ "learning_rate": 1.5112540192926044e-05,
822
+ "loss": 0.0018,
823
+ "step": 1150
824
+ },
825
+ {
826
+ "epoch": 3.73,
827
+ "grad_norm": 0.0061045498587191105,
828
+ "learning_rate": 1.3504823151125404e-05,
829
+ "loss": 0.0015,
830
+ "step": 1160
831
+ },
832
+ {
833
+ "epoch": 3.76,
834
+ "grad_norm": 0.01204584538936615,
835
+ "learning_rate": 1.189710610932476e-05,
836
+ "loss": 0.0019,
837
+ "step": 1170
838
+ },
839
+ {
840
+ "epoch": 3.79,
841
+ "grad_norm": 0.005698794033378363,
842
+ "learning_rate": 1.0289389067524116e-05,
843
+ "loss": 0.0015,
844
+ "step": 1180
845
+ },
846
+ {
847
+ "epoch": 3.83,
848
+ "grad_norm": 0.008717156946659088,
849
+ "learning_rate": 8.681672025723474e-06,
850
+ "loss": 0.0018,
851
+ "step": 1190
852
+ },
853
+ {
854
+ "epoch": 3.86,
855
+ "grad_norm": 0.03146166726946831,
856
+ "learning_rate": 7.07395498392283e-06,
857
+ "loss": 0.0305,
858
+ "step": 1200
859
+ },
860
+ {
861
+ "epoch": 3.89,
862
+ "grad_norm": 0.0084818284958601,
863
+ "learning_rate": 5.466237942122187e-06,
864
+ "loss": 0.0046,
865
+ "step": 1210
866
+ },
867
+ {
868
+ "epoch": 3.92,
869
+ "grad_norm": 0.026195811107754707,
870
+ "learning_rate": 3.858520900321544e-06,
871
+ "loss": 0.0018,
872
+ "step": 1220
873
+ },
874
+ {
875
+ "epoch": 3.95,
876
+ "grad_norm": 0.01475218404084444,
877
+ "learning_rate": 2.2508038585209006e-06,
878
+ "loss": 0.0019,
879
+ "step": 1230
880
+ },
881
+ {
882
+ "epoch": 3.99,
883
+ "grad_norm": 0.006672242656350136,
884
+ "learning_rate": 6.430868167202573e-07,
885
+ "loss": 0.0516,
886
+ "step": 1240
887
+ },
888
  {
889
  "epoch": 4.0,
890
+ "step": 1244,
891
  "total_flos": 7.703325099767808e+17,
892
+ "train_loss": 0.11289434264701495,
893
+ "train_runtime": 3405.5271,
894
+ "train_samples_per_second": 2.919,
895
+ "train_steps_per_second": 0.365
896
  }
897
  ],
898
  "logging_steps": 10,
899
+ "max_steps": 1244,
900
  "num_input_tokens_seen": 0,
901
  "num_train_epochs": 4,
902
+ "save_steps": 1000,
903
  "total_flos": 7.703325099767808e+17,
904
+ "train_batch_size": 8,
905
  "trial_name": null,
906
  "trial_params": null
907
  }