d071696 commited on
Commit
7b74f5e
1 Parent(s): d80061b

Model save

Browse files
README.md CHANGED
@@ -34,7 +34,7 @@ More information needed
34
 
35
  The following hyperparameters were used during training:
36
  - learning_rate: 0.0002
37
- - train_batch_size: 8
38
  - eval_batch_size: 8
39
  - seed: 42
40
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 
34
 
35
  The following hyperparameters were used during training:
36
  - learning_rate: 0.0002
37
+ - train_batch_size: 16
38
  - eval_batch_size: 8
39
  - seed: 42
40
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
all_results.json CHANGED
@@ -5,9 +5,9 @@
5
  "eval_runtime": 48.4901,
6
  "eval_samples_per_second": 51.248,
7
  "eval_steps_per_second": 6.414,
8
- "total_flos": 7.703325099767808e+17,
9
- "train_loss": 0.1579143282675882,
10
- "train_runtime": 491.3754,
11
- "train_samples_per_second": 20.229,
12
- "train_steps_per_second": 2.532
13
  }
 
5
  "eval_runtime": 48.4901,
6
  "eval_samples_per_second": 51.248,
7
  "eval_steps_per_second": 6.414,
8
+ "total_flos": 7439897757745152.0,
9
+ "train_loss": 2.0212895274162292,
10
+ "train_runtime": 7.7329,
11
+ "train_samples_per_second": 12.414,
12
+ "train_steps_per_second": 1.552
13
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9b61cb4e03c9bfcb4d59e04874ddb0628ad12043bcb2d77be6e778249886b256
3
  size 343254736
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17ae2382b9eca4f52fc4d17a9cb546b8971f310d461d0e085613297c710f7ff9
3
  size 343254736
runs/Mar29_18-45-45_X5C922065N/events.out.tfevents.1711734345.X5C922065N.53009.7 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e2c19225f69428b191c5ad332d0bce93b54ac73221e2934e3ffbf686d0a57b1
3
+ size 6106
runs/Mar29_18-51-27_X5C922065N/events.out.tfevents.1711734707.X5C922065N.77198.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90a0ddc9125c1c57385b0ee044d4b2110eacca12fd4d08dea91f57269f606419
3
+ size 5898
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 4.0,
3
- "total_flos": 7.703325099767808e+17,
4
- "train_loss": 0.1579143282675882,
5
- "train_runtime": 491.3754,
6
- "train_samples_per_second": 20.229,
7
- "train_steps_per_second": 2.532
8
  }
 
1
  {
2
  "epoch": 4.0,
3
+ "total_flos": 7439897757745152.0,
4
+ "train_loss": 2.0212895274162292,
5
+ "train_runtime": 7.7329,
6
+ "train_samples_per_second": 12.414,
7
+ "train_steps_per_second": 1.552
8
  }
trainer_state.json CHANGED
@@ -1,906 +1,36 @@
1
  {
2
- "best_metric": 0.21430718898773193,
3
- "best_model_checkpoint": "./vit-finetune-scrap/checkpoint-1000",
4
  "epoch": 4.0,
5
  "eval_steps": 1000,
6
- "global_step": 1244,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.03,
13
- "grad_norm": 10.849635124206543,
14
- "learning_rate": 0.00019839228295819936,
15
- "loss": 0.1258,
16
  "step": 10
17
  },
18
- {
19
- "epoch": 0.06,
20
- "grad_norm": 29.82047462463379,
21
- "learning_rate": 0.00019678456591639874,
22
- "loss": 0.2394,
23
- "step": 20
24
- },
25
- {
26
- "epoch": 0.1,
27
- "grad_norm": 4.707005500793457,
28
- "learning_rate": 0.00019517684887459809,
29
- "loss": 0.234,
30
- "step": 30
31
- },
32
- {
33
- "epoch": 0.13,
34
- "grad_norm": 13.66462516784668,
35
- "learning_rate": 0.00019356913183279743,
36
- "loss": 0.705,
37
- "step": 40
38
- },
39
- {
40
- "epoch": 0.16,
41
- "grad_norm": 4.419419765472412,
42
- "learning_rate": 0.00019196141479099678,
43
- "loss": 0.6657,
44
- "step": 50
45
- },
46
- {
47
- "epoch": 0.19,
48
- "grad_norm": 0.14946621656417847,
49
- "learning_rate": 0.00019035369774919616,
50
- "loss": 0.2407,
51
- "step": 60
52
- },
53
- {
54
- "epoch": 0.23,
55
- "grad_norm": 2.8290460109710693,
56
- "learning_rate": 0.0001887459807073955,
57
- "loss": 0.3973,
58
- "step": 70
59
- },
60
- {
61
- "epoch": 0.26,
62
- "grad_norm": 15.848897933959961,
63
- "learning_rate": 0.00018713826366559486,
64
- "loss": 0.2432,
65
- "step": 80
66
- },
67
- {
68
- "epoch": 0.29,
69
- "grad_norm": 0.30860471725463867,
70
- "learning_rate": 0.0001855305466237942,
71
- "loss": 0.2732,
72
- "step": 90
73
- },
74
- {
75
- "epoch": 0.32,
76
- "grad_norm": 14.233210563659668,
77
- "learning_rate": 0.0001839228295819936,
78
- "loss": 0.261,
79
- "step": 100
80
- },
81
- {
82
- "epoch": 0.35,
83
- "grad_norm": 9.140750885009766,
84
- "learning_rate": 0.00018231511254019294,
85
- "loss": 0.1776,
86
- "step": 110
87
- },
88
- {
89
- "epoch": 0.39,
90
- "grad_norm": 0.9528696537017822,
91
- "learning_rate": 0.00018070739549839229,
92
- "loss": 0.3849,
93
- "step": 120
94
- },
95
- {
96
- "epoch": 0.42,
97
- "grad_norm": 21.716726303100586,
98
- "learning_rate": 0.00017909967845659166,
99
- "loss": 0.3328,
100
- "step": 130
101
- },
102
- {
103
- "epoch": 0.45,
104
- "grad_norm": 7.960571765899658,
105
- "learning_rate": 0.000177491961414791,
106
- "loss": 0.5052,
107
- "step": 140
108
- },
109
- {
110
- "epoch": 0.48,
111
- "grad_norm": 3.9136505126953125,
112
- "learning_rate": 0.00017588424437299036,
113
- "loss": 0.5026,
114
- "step": 150
115
- },
116
- {
117
- "epoch": 0.51,
118
- "grad_norm": 14.131813049316406,
119
- "learning_rate": 0.0001742765273311897,
120
- "loss": 0.3997,
121
- "step": 160
122
- },
123
- {
124
- "epoch": 0.55,
125
- "grad_norm": 13.529720306396484,
126
- "learning_rate": 0.0001726688102893891,
127
- "loss": 0.2877,
128
- "step": 170
129
- },
130
- {
131
- "epoch": 0.58,
132
- "grad_norm": 6.182504653930664,
133
- "learning_rate": 0.00017106109324758844,
134
- "loss": 0.3408,
135
- "step": 180
136
- },
137
- {
138
- "epoch": 0.61,
139
- "grad_norm": 0.17119653522968292,
140
- "learning_rate": 0.0001694533762057878,
141
- "loss": 0.2916,
142
- "step": 190
143
- },
144
- {
145
- "epoch": 0.64,
146
- "grad_norm": 13.307029724121094,
147
- "learning_rate": 0.00016784565916398716,
148
- "loss": 0.3485,
149
- "step": 200
150
- },
151
- {
152
- "epoch": 0.68,
153
- "grad_norm": 4.883426666259766,
154
- "learning_rate": 0.0001662379421221865,
155
- "loss": 0.3939,
156
- "step": 210
157
- },
158
- {
159
- "epoch": 0.71,
160
- "grad_norm": 5.17271614074707,
161
- "learning_rate": 0.00016463022508038586,
162
- "loss": 0.4001,
163
- "step": 220
164
- },
165
- {
166
- "epoch": 0.74,
167
- "grad_norm": 0.18887023627758026,
168
- "learning_rate": 0.0001630225080385852,
169
- "loss": 0.2459,
170
- "step": 230
171
- },
172
- {
173
- "epoch": 0.77,
174
- "grad_norm": 0.3397394120693207,
175
- "learning_rate": 0.0001614147909967846,
176
- "loss": 0.3813,
177
- "step": 240
178
- },
179
- {
180
- "epoch": 0.8,
181
- "grad_norm": 7.221404075622559,
182
- "learning_rate": 0.00015980707395498394,
183
- "loss": 0.2913,
184
- "step": 250
185
- },
186
- {
187
- "epoch": 0.84,
188
- "grad_norm": 3.0032007694244385,
189
- "learning_rate": 0.0001581993569131833,
190
- "loss": 0.273,
191
- "step": 260
192
- },
193
- {
194
- "epoch": 0.87,
195
- "grad_norm": 3.486640691757202,
196
- "learning_rate": 0.00015659163987138264,
197
- "loss": 0.5797,
198
- "step": 270
199
- },
200
- {
201
- "epoch": 0.9,
202
- "grad_norm": 0.3199945092201233,
203
- "learning_rate": 0.00015498392282958201,
204
- "loss": 0.4904,
205
- "step": 280
206
- },
207
- {
208
- "epoch": 0.93,
209
- "grad_norm": 38.1386833190918,
210
- "learning_rate": 0.00015337620578778136,
211
- "loss": 0.2789,
212
- "step": 290
213
- },
214
- {
215
- "epoch": 0.96,
216
- "grad_norm": 3.608177661895752,
217
- "learning_rate": 0.0001517684887459807,
218
- "loss": 0.5587,
219
- "step": 300
220
- },
221
- {
222
- "epoch": 1.0,
223
- "grad_norm": 0.1488448977470398,
224
- "learning_rate": 0.0001501607717041801,
225
- "loss": 0.3405,
226
- "step": 310
227
- },
228
- {
229
- "epoch": 1.03,
230
- "grad_norm": 1.542035460472107,
231
- "learning_rate": 0.00014855305466237944,
232
- "loss": 0.2688,
233
- "step": 320
234
- },
235
- {
236
- "epoch": 1.06,
237
- "grad_norm": 2.1089909076690674,
238
- "learning_rate": 0.0001469453376205788,
239
- "loss": 0.2085,
240
- "step": 330
241
- },
242
- {
243
- "epoch": 1.09,
244
- "grad_norm": 11.16602897644043,
245
- "learning_rate": 0.00014533762057877814,
246
- "loss": 0.3437,
247
- "step": 340
248
- },
249
- {
250
- "epoch": 1.13,
251
- "grad_norm": 2.2596559524536133,
252
- "learning_rate": 0.00014372990353697752,
253
- "loss": 0.337,
254
- "step": 350
255
- },
256
- {
257
- "epoch": 1.16,
258
- "grad_norm": 2.616323947906494,
259
- "learning_rate": 0.00014212218649517686,
260
- "loss": 0.2074,
261
- "step": 360
262
- },
263
- {
264
- "epoch": 1.19,
265
- "grad_norm": 0.5269195437431335,
266
- "learning_rate": 0.00014051446945337621,
267
- "loss": 0.0913,
268
- "step": 370
269
- },
270
- {
271
- "epoch": 1.22,
272
- "grad_norm": 7.34785270690918,
273
- "learning_rate": 0.0001389067524115756,
274
- "loss": 0.1462,
275
- "step": 380
276
- },
277
- {
278
- "epoch": 1.25,
279
- "grad_norm": 0.13304546475410461,
280
- "learning_rate": 0.00013729903536977494,
281
- "loss": 0.0303,
282
- "step": 390
283
- },
284
- {
285
- "epoch": 1.29,
286
- "grad_norm": 0.42307400703430176,
287
- "learning_rate": 0.0001356913183279743,
288
- "loss": 0.2195,
289
- "step": 400
290
- },
291
- {
292
- "epoch": 1.32,
293
- "grad_norm": 0.16662320494651794,
294
- "learning_rate": 0.00013408360128617364,
295
- "loss": 0.2999,
296
- "step": 410
297
- },
298
- {
299
- "epoch": 1.35,
300
- "grad_norm": 35.579891204833984,
301
- "learning_rate": 0.00013247588424437302,
302
- "loss": 0.1194,
303
- "step": 420
304
- },
305
- {
306
- "epoch": 1.38,
307
- "grad_norm": 3.4818050861358643,
308
- "learning_rate": 0.00013086816720257237,
309
- "loss": 0.1469,
310
- "step": 430
311
- },
312
- {
313
- "epoch": 1.41,
314
- "grad_norm": 6.36860466003418,
315
- "learning_rate": 0.00012926045016077172,
316
- "loss": 0.2234,
317
- "step": 440
318
- },
319
- {
320
- "epoch": 1.45,
321
- "grad_norm": 7.359828948974609,
322
- "learning_rate": 0.00012765273311897106,
323
- "loss": 0.2114,
324
- "step": 450
325
- },
326
- {
327
- "epoch": 1.48,
328
- "grad_norm": 0.11759760975837708,
329
- "learning_rate": 0.00012604501607717044,
330
- "loss": 0.1059,
331
- "step": 460
332
- },
333
- {
334
- "epoch": 1.51,
335
- "grad_norm": 0.049188051372766495,
336
- "learning_rate": 0.0001244372990353698,
337
- "loss": 0.207,
338
- "step": 470
339
- },
340
- {
341
- "epoch": 1.54,
342
- "grad_norm": 0.06988845020532608,
343
- "learning_rate": 0.00012282958199356914,
344
- "loss": 0.1319,
345
- "step": 480
346
- },
347
- {
348
- "epoch": 1.58,
349
- "grad_norm": 10.857504844665527,
350
- "learning_rate": 0.0001212218649517685,
351
- "loss": 0.3497,
352
- "step": 490
353
- },
354
- {
355
- "epoch": 1.61,
356
- "grad_norm": 0.04112955555319786,
357
- "learning_rate": 0.00011961414790996785,
358
- "loss": 0.0711,
359
- "step": 500
360
- },
361
- {
362
- "epoch": 1.64,
363
- "grad_norm": 20.134990692138672,
364
- "learning_rate": 0.0001180064308681672,
365
- "loss": 0.2654,
366
- "step": 510
367
- },
368
- {
369
- "epoch": 1.67,
370
- "grad_norm": 0.03998303785920143,
371
- "learning_rate": 0.00011639871382636655,
372
- "loss": 0.0911,
373
- "step": 520
374
- },
375
- {
376
- "epoch": 1.7,
377
- "grad_norm": 10.199617385864258,
378
- "learning_rate": 0.00011479099678456593,
379
- "loss": 0.1106,
380
- "step": 530
381
- },
382
- {
383
- "epoch": 1.74,
384
- "grad_norm": 2.3347342014312744,
385
- "learning_rate": 0.00011318327974276528,
386
- "loss": 0.1948,
387
- "step": 540
388
- },
389
- {
390
- "epoch": 1.77,
391
- "grad_norm": 15.492130279541016,
392
- "learning_rate": 0.00011157556270096463,
393
- "loss": 0.2999,
394
- "step": 550
395
- },
396
- {
397
- "epoch": 1.8,
398
- "grad_norm": 16.2156982421875,
399
- "learning_rate": 0.00010996784565916398,
400
- "loss": 0.1792,
401
- "step": 560
402
- },
403
- {
404
- "epoch": 1.83,
405
- "grad_norm": 3.9076225757598877,
406
- "learning_rate": 0.00010836012861736335,
407
- "loss": 0.4599,
408
- "step": 570
409
- },
410
- {
411
- "epoch": 1.86,
412
- "grad_norm": 0.0662955567240715,
413
- "learning_rate": 0.0001067524115755627,
414
- "loss": 0.0834,
415
- "step": 580
416
- },
417
- {
418
- "epoch": 1.9,
419
- "grad_norm": 0.43734121322631836,
420
- "learning_rate": 0.00010514469453376205,
421
- "loss": 0.1804,
422
- "step": 590
423
- },
424
- {
425
- "epoch": 1.93,
426
- "grad_norm": 0.23478691279888153,
427
- "learning_rate": 0.00010353697749196143,
428
- "loss": 0.0831,
429
- "step": 600
430
- },
431
- {
432
- "epoch": 1.96,
433
- "grad_norm": 8.97579574584961,
434
- "learning_rate": 0.00010192926045016078,
435
- "loss": 0.2141,
436
- "step": 610
437
- },
438
- {
439
- "epoch": 1.99,
440
- "grad_norm": 5.947574615478516,
441
- "learning_rate": 0.00010032154340836013,
442
- "loss": 0.1059,
443
- "step": 620
444
- },
445
- {
446
- "epoch": 2.03,
447
- "grad_norm": 0.3693161904811859,
448
- "learning_rate": 9.871382636655949e-05,
449
- "loss": 0.0478,
450
- "step": 630
451
- },
452
- {
453
- "epoch": 2.06,
454
- "grad_norm": 0.33773139119148254,
455
- "learning_rate": 9.710610932475884e-05,
456
- "loss": 0.1512,
457
- "step": 640
458
- },
459
- {
460
- "epoch": 2.09,
461
- "grad_norm": 0.07303290069103241,
462
- "learning_rate": 9.54983922829582e-05,
463
- "loss": 0.0746,
464
- "step": 650
465
- },
466
- {
467
- "epoch": 2.12,
468
- "grad_norm": 0.021892189979553223,
469
- "learning_rate": 9.389067524115757e-05,
470
- "loss": 0.0071,
471
- "step": 660
472
- },
473
- {
474
- "epoch": 2.15,
475
- "grad_norm": 0.699686586856842,
476
- "learning_rate": 9.228295819935692e-05,
477
- "loss": 0.08,
478
- "step": 670
479
- },
480
- {
481
- "epoch": 2.19,
482
- "grad_norm": 1.7835339307785034,
483
- "learning_rate": 9.067524115755628e-05,
484
- "loss": 0.092,
485
- "step": 680
486
- },
487
- {
488
- "epoch": 2.22,
489
- "grad_norm": 0.025796858593821526,
490
- "learning_rate": 8.906752411575563e-05,
491
- "loss": 0.041,
492
- "step": 690
493
- },
494
- {
495
- "epoch": 2.25,
496
- "grad_norm": 11.788249969482422,
497
- "learning_rate": 8.7459807073955e-05,
498
- "loss": 0.0269,
499
- "step": 700
500
- },
501
- {
502
- "epoch": 2.28,
503
- "grad_norm": 6.836824893951416,
504
- "learning_rate": 8.585209003215434e-05,
505
- "loss": 0.2086,
506
- "step": 710
507
- },
508
- {
509
- "epoch": 2.32,
510
- "grad_norm": 0.02837471477687359,
511
- "learning_rate": 8.42443729903537e-05,
512
- "loss": 0.0828,
513
- "step": 720
514
- },
515
- {
516
- "epoch": 2.35,
517
- "grad_norm": 0.04012266919016838,
518
- "learning_rate": 8.263665594855306e-05,
519
- "loss": 0.0057,
520
- "step": 730
521
- },
522
- {
523
- "epoch": 2.38,
524
- "grad_norm": 0.05077001079916954,
525
- "learning_rate": 8.102893890675242e-05,
526
- "loss": 0.0116,
527
- "step": 740
528
- },
529
- {
530
- "epoch": 2.41,
531
- "grad_norm": 0.08000744879245758,
532
- "learning_rate": 7.942122186495177e-05,
533
- "loss": 0.0305,
534
- "step": 750
535
- },
536
- {
537
- "epoch": 2.44,
538
- "grad_norm": 0.03205496072769165,
539
- "learning_rate": 7.781350482315113e-05,
540
- "loss": 0.0451,
541
- "step": 760
542
- },
543
- {
544
- "epoch": 2.48,
545
- "grad_norm": 0.027969710528850555,
546
- "learning_rate": 7.62057877813505e-05,
547
- "loss": 0.0779,
548
- "step": 770
549
- },
550
- {
551
- "epoch": 2.51,
552
- "grad_norm": 3.297053098678589,
553
- "learning_rate": 7.459807073954984e-05,
554
- "loss": 0.028,
555
- "step": 780
556
- },
557
- {
558
- "epoch": 2.54,
559
- "grad_norm": 1.8469219207763672,
560
- "learning_rate": 7.299035369774921e-05,
561
- "loss": 0.1222,
562
- "step": 790
563
- },
564
- {
565
- "epoch": 2.57,
566
- "grad_norm": 0.5228595733642578,
567
- "learning_rate": 7.138263665594856e-05,
568
- "loss": 0.0087,
569
- "step": 800
570
- },
571
- {
572
- "epoch": 2.6,
573
- "grad_norm": 0.028694279491901398,
574
- "learning_rate": 6.977491961414792e-05,
575
- "loss": 0.0053,
576
- "step": 810
577
- },
578
- {
579
- "epoch": 2.64,
580
- "grad_norm": 0.026992863044142723,
581
- "learning_rate": 6.816720257234727e-05,
582
- "loss": 0.0065,
583
- "step": 820
584
- },
585
- {
586
- "epoch": 2.67,
587
- "grad_norm": 1.996466040611267,
588
- "learning_rate": 6.655948553054663e-05,
589
- "loss": 0.0955,
590
- "step": 830
591
- },
592
- {
593
- "epoch": 2.7,
594
- "grad_norm": 0.01983807235956192,
595
- "learning_rate": 6.495176848874598e-05,
596
- "loss": 0.1021,
597
- "step": 840
598
- },
599
- {
600
- "epoch": 2.73,
601
- "grad_norm": 0.03182640299201012,
602
- "learning_rate": 6.334405144694535e-05,
603
- "loss": 0.1796,
604
- "step": 850
605
- },
606
- {
607
- "epoch": 2.77,
608
- "grad_norm": 0.049088891595602036,
609
- "learning_rate": 6.173633440514471e-05,
610
- "loss": 0.0907,
611
- "step": 860
612
- },
613
- {
614
- "epoch": 2.8,
615
- "grad_norm": 0.11043746769428253,
616
- "learning_rate": 6.012861736334405e-05,
617
- "loss": 0.0627,
618
- "step": 870
619
- },
620
- {
621
- "epoch": 2.83,
622
- "grad_norm": 0.2206079512834549,
623
- "learning_rate": 5.8520900321543414e-05,
624
- "loss": 0.0412,
625
- "step": 880
626
- },
627
- {
628
- "epoch": 2.86,
629
- "grad_norm": 0.02966146729886532,
630
- "learning_rate": 5.6913183279742764e-05,
631
- "loss": 0.1015,
632
- "step": 890
633
- },
634
- {
635
- "epoch": 2.89,
636
- "grad_norm": 0.0345352403819561,
637
- "learning_rate": 5.530546623794213e-05,
638
- "loss": 0.0629,
639
- "step": 900
640
- },
641
- {
642
- "epoch": 2.93,
643
- "grad_norm": 0.06348275393247604,
644
- "learning_rate": 5.369774919614148e-05,
645
- "loss": 0.0064,
646
- "step": 910
647
- },
648
- {
649
- "epoch": 2.96,
650
- "grad_norm": 0.06559421122074127,
651
- "learning_rate": 5.209003215434084e-05,
652
- "loss": 0.0191,
653
- "step": 920
654
- },
655
- {
656
- "epoch": 2.99,
657
- "grad_norm": 1.113765835762024,
658
- "learning_rate": 5.048231511254019e-05,
659
- "loss": 0.0259,
660
- "step": 930
661
- },
662
- {
663
- "epoch": 3.02,
664
- "grad_norm": 0.02486424334347248,
665
- "learning_rate": 4.887459807073955e-05,
666
- "loss": 0.0049,
667
- "step": 940
668
- },
669
- {
670
- "epoch": 3.05,
671
- "grad_norm": 0.7845320701599121,
672
- "learning_rate": 4.726688102893891e-05,
673
- "loss": 0.0043,
674
- "step": 950
675
- },
676
- {
677
- "epoch": 3.09,
678
- "grad_norm": 0.021990863606333733,
679
- "learning_rate": 4.5659163987138265e-05,
680
- "loss": 0.0042,
681
- "step": 960
682
- },
683
- {
684
- "epoch": 3.12,
685
- "grad_norm": 0.022443994879722595,
686
- "learning_rate": 4.405144694533762e-05,
687
- "loss": 0.0054,
688
- "step": 970
689
- },
690
- {
691
- "epoch": 3.15,
692
- "grad_norm": 0.009742784313857555,
693
- "learning_rate": 4.244372990353698e-05,
694
- "loss": 0.0041,
695
- "step": 980
696
- },
697
- {
698
- "epoch": 3.18,
699
- "grad_norm": 0.037747763097286224,
700
- "learning_rate": 4.083601286173634e-05,
701
- "loss": 0.0242,
702
- "step": 990
703
- },
704
- {
705
- "epoch": 3.22,
706
- "grad_norm": 0.010466611944139004,
707
- "learning_rate": 3.92282958199357e-05,
708
- "loss": 0.0035,
709
- "step": 1000
710
- },
711
- {
712
- "epoch": 3.22,
713
- "eval_accuracy": 0.9485530546623794,
714
- "eval_loss": 0.21430718898773193,
715
- "eval_runtime": 14.2198,
716
- "eval_samples_per_second": 43.742,
717
- "eval_steps_per_second": 5.485,
718
- "step": 1000
719
- },
720
- {
721
- "epoch": 3.25,
722
- "grad_norm": 0.00991890113800764,
723
- "learning_rate": 3.7620578778135054e-05,
724
- "loss": 0.0039,
725
- "step": 1010
726
- },
727
- {
728
- "epoch": 3.28,
729
- "grad_norm": 0.016740955412387848,
730
- "learning_rate": 3.601286173633441e-05,
731
- "loss": 0.0065,
732
- "step": 1020
733
- },
734
- {
735
- "epoch": 3.31,
736
- "grad_norm": 0.03466745838522911,
737
- "learning_rate": 3.4405144694533766e-05,
738
- "loss": 0.099,
739
- "step": 1030
740
- },
741
- {
742
- "epoch": 3.34,
743
- "grad_norm": 0.008615425787866116,
744
- "learning_rate": 3.279742765273312e-05,
745
- "loss": 0.0179,
746
- "step": 1040
747
- },
748
- {
749
- "epoch": 3.38,
750
- "grad_norm": 0.05827281251549721,
751
- "learning_rate": 3.118971061093248e-05,
752
- "loss": 0.0041,
753
- "step": 1050
754
- },
755
- {
756
- "epoch": 3.41,
757
- "grad_norm": 0.0276072658598423,
758
- "learning_rate": 2.9581993569131832e-05,
759
- "loss": 0.0036,
760
- "step": 1060
761
- },
762
- {
763
- "epoch": 3.44,
764
- "grad_norm": 0.011412302032113075,
765
- "learning_rate": 2.7974276527331188e-05,
766
- "loss": 0.1013,
767
- "step": 1070
768
- },
769
- {
770
- "epoch": 3.47,
771
- "grad_norm": 0.013982011005282402,
772
- "learning_rate": 2.6366559485530545e-05,
773
- "loss": 0.0058,
774
- "step": 1080
775
- },
776
- {
777
- "epoch": 3.5,
778
- "grad_norm": 0.026057597249746323,
779
- "learning_rate": 2.4758842443729904e-05,
780
- "loss": 0.0077,
781
- "step": 1090
782
- },
783
- {
784
- "epoch": 3.54,
785
- "grad_norm": 0.04853319376707077,
786
- "learning_rate": 2.315112540192926e-05,
787
- "loss": 0.0035,
788
- "step": 1100
789
- },
790
- {
791
- "epoch": 3.57,
792
- "grad_norm": 0.013841088861227036,
793
- "learning_rate": 2.154340836012862e-05,
794
- "loss": 0.0208,
795
- "step": 1110
796
- },
797
- {
798
- "epoch": 3.6,
799
- "grad_norm": 0.03845496475696564,
800
- "learning_rate": 1.9935691318327977e-05,
801
- "loss": 0.0038,
802
- "step": 1120
803
- },
804
- {
805
- "epoch": 3.63,
806
- "grad_norm": 0.023922910913825035,
807
- "learning_rate": 1.8327974276527333e-05,
808
- "loss": 0.0032,
809
- "step": 1130
810
- },
811
- {
812
- "epoch": 3.67,
813
- "grad_norm": 0.014864934608340263,
814
- "learning_rate": 1.672025723472669e-05,
815
- "loss": 0.0028,
816
- "step": 1140
817
- },
818
- {
819
- "epoch": 3.7,
820
- "grad_norm": 0.05655550956726074,
821
- "learning_rate": 1.5112540192926044e-05,
822
- "loss": 0.0039,
823
- "step": 1150
824
- },
825
- {
826
- "epoch": 3.73,
827
- "grad_norm": 0.012573642656207085,
828
- "learning_rate": 1.3504823151125404e-05,
829
- "loss": 0.0028,
830
- "step": 1160
831
- },
832
- {
833
- "epoch": 3.76,
834
- "grad_norm": 0.022632773965597153,
835
- "learning_rate": 1.189710610932476e-05,
836
- "loss": 0.0033,
837
- "step": 1170
838
- },
839
- {
840
- "epoch": 3.79,
841
- "grad_norm": 0.01279931515455246,
842
- "learning_rate": 1.0289389067524116e-05,
843
- "loss": 0.0238,
844
- "step": 1180
845
- },
846
- {
847
- "epoch": 3.83,
848
- "grad_norm": 0.023662865161895752,
849
- "learning_rate": 8.681672025723474e-06,
850
- "loss": 0.0253,
851
- "step": 1190
852
- },
853
- {
854
- "epoch": 3.86,
855
- "grad_norm": 0.017510054633021355,
856
- "learning_rate": 7.07395498392283e-06,
857
- "loss": 0.0047,
858
- "step": 1200
859
- },
860
- {
861
- "epoch": 3.89,
862
- "grad_norm": 0.0257584135979414,
863
- "learning_rate": 5.466237942122187e-06,
864
- "loss": 0.004,
865
- "step": 1210
866
- },
867
- {
868
- "epoch": 3.92,
869
- "grad_norm": 0.3079407513141632,
870
- "learning_rate": 3.858520900321544e-06,
871
- "loss": 0.085,
872
- "step": 1220
873
- },
874
- {
875
- "epoch": 3.95,
876
- "grad_norm": 0.00990583747625351,
877
- "learning_rate": 2.2508038585209006e-06,
878
- "loss": 0.0029,
879
- "step": 1230
880
- },
881
- {
882
- "epoch": 3.99,
883
- "grad_norm": 0.011738813482224941,
884
- "learning_rate": 6.430868167202573e-07,
885
- "loss": 0.034,
886
- "step": 1240
887
- },
888
  {
889
  "epoch": 4.0,
890
- "step": 1244,
891
- "total_flos": 7.703325099767808e+17,
892
- "train_loss": 0.1579143282675882,
893
- "train_runtime": 491.3754,
894
- "train_samples_per_second": 20.229,
895
- "train_steps_per_second": 2.532
896
  }
897
  ],
898
  "logging_steps": 10,
899
- "max_steps": 1244,
900
  "num_input_tokens_seen": 0,
901
  "num_train_epochs": 4,
902
  "save_steps": 1000,
903
- "total_flos": 7.703325099767808e+17,
904
  "train_batch_size": 8,
905
  "trial_name": null,
906
  "trial_params": null
 
1
  {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
  "epoch": 4.0,
5
  "eval_steps": 1000,
6
+ "global_step": 12,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 3.33,
13
+ "grad_norm": 2.8058300018310547,
14
+ "learning_rate": 3.3333333333333335e-05,
15
+ "loss": 2.1093,
16
  "step": 10
17
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  {
19
  "epoch": 4.0,
20
+ "step": 12,
21
+ "total_flos": 7439897757745152.0,
22
+ "train_loss": 2.0212895274162292,
23
+ "train_runtime": 7.7329,
24
+ "train_samples_per_second": 12.414,
25
+ "train_steps_per_second": 1.552
26
  }
27
  ],
28
  "logging_steps": 10,
29
+ "max_steps": 12,
30
  "num_input_tokens_seen": 0,
31
  "num_train_epochs": 4,
32
  "save_steps": 1000,
33
+ "total_flos": 7439897757745152.0,
34
  "train_batch_size": 8,
35
  "trial_name": null,
36
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:72eb25d1bd046d95141d050a7046c0b70c3fb9f17a91b9b38a15dbfa48b5b07c
3
  size 4920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a661c95712059d0902cb2770fd16a90f1cd9488c7804b9523f02c0e91000074b
3
  size 4920