flatala-research committed
Commit 3a0dc97
1 Parent(s): e184710

End of training

Files changed (3)
  1. all_results.json +8 -0
  2. test_results.json +8 -0
  3. trainer_state.json +699 -0
all_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "epoch": 6.135531135531136,
+   "eval_accuracy": 0.5756097560975609,
+   "eval_loss": 1.497787356376648,
+   "eval_runtime": 33.1995,
+   "eval_samples_per_second": 6.175,
+   "eval_steps_per_second": 0.783
+ }
test_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "epoch": 6.135531135531136,
+   "eval_accuracy": 0.5756097560975609,
+   "eval_loss": 1.497787356376648,
+   "eval_runtime": 33.1995,
+   "eval_samples_per_second": 6.175,
+   "eval_steps_per_second": 0.783
+ }
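
all_results.json and test_results.json are flat JSON objects holding the final held-out metrics. A minimal sketch of reading them back after cloning the model repo (file locations assumed to be the repo root; adjust as needed):

import json

# Illustrative paths; both files are assumed to sit at the top level of the checkout.
for name in ("all_results.json", "test_results.json"):
    with open(name) as f:
        results = json.load(f)
    print(name, "accuracy:", results["eval_accuracy"], "loss:", results["eval_loss"])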
trainer_state.json ADDED
@@ -0,0 +1,699 @@
+ {
+   "best_metric": 0.6019417475728155,
+   "best_model_checkpoint": "videomae-base-finetuned-kinetics-finetuned-conflab-traj-direction-rh-v10/checkpoint-819",
+   "epoch": 6.135531135531136,
+   "eval_steps": 500,
+   "global_step": 819,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.01221001221001221,
+       "grad_norm": 8.122791290283203,
+       "learning_rate": 6.0975609756097564e-06,
+       "loss": 2.1095,
+       "step": 10
+     },
+     {
+       "epoch": 0.02442002442002442,
+       "grad_norm": 6.661695957183838,
+       "learning_rate": 1.2195121951219513e-05,
+       "loss": 2.0298,
+       "step": 20
+     },
+     {
+       "epoch": 0.03663003663003663,
+       "grad_norm": 7.086618900299072,
+       "learning_rate": 1.8292682926829268e-05,
+       "loss": 2.0189,
+       "step": 30
+     },
+     {
+       "epoch": 0.04884004884004884,
+       "grad_norm": 8.178045272827148,
+       "learning_rate": 2.4390243902439026e-05,
+       "loss": 2.0099,
+       "step": 40
+     },
+     {
+       "epoch": 0.06105006105006105,
+       "grad_norm": 7.585304260253906,
+       "learning_rate": 3.048780487804878e-05,
+       "loss": 1.9196,
+       "step": 50
+     },
+     {
+       "epoch": 0.07326007326007326,
+       "grad_norm": 7.4838409423828125,
+       "learning_rate": 3.6585365853658535e-05,
+       "loss": 1.9165,
+       "step": 60
+     },
+     {
+       "epoch": 0.08547008547008547,
+       "grad_norm": 6.174314498901367,
+       "learning_rate": 4.26829268292683e-05,
+       "loss": 1.8358,
+       "step": 70
+     },
+     {
+       "epoch": 0.09768009768009768,
+       "grad_norm": 5.699541091918945,
+       "learning_rate": 4.878048780487805e-05,
+       "loss": 1.8794,
+       "step": 80
+     },
+     {
+       "epoch": 0.10989010989010989,
+       "grad_norm": 6.571558475494385,
+       "learning_rate": 4.94572591587517e-05,
+       "loss": 1.8931,
+       "step": 90
+     },
+     {
+       "epoch": 0.1221001221001221,
+       "grad_norm": 7.583756446838379,
+       "learning_rate": 4.877883310719132e-05,
+       "loss": 1.8477,
+       "step": 100
+     },
+     {
+       "epoch": 0.1343101343101343,
+       "grad_norm": 6.113456726074219,
+       "learning_rate": 4.810040705563094e-05,
+       "loss": 1.989,
+       "step": 110
+     },
+     {
+       "epoch": 0.14407814407814407,
+       "eval_accuracy": 0.2669902912621359,
+       "eval_loss": 1.8786594867706299,
+       "eval_runtime": 28.8858,
+       "eval_samples_per_second": 7.132,
+       "eval_steps_per_second": 0.9,
+       "step": 118
+     },
+     {
+       "epoch": 1.0024420024420024,
+       "grad_norm": 6.775885105133057,
+       "learning_rate": 4.742198100407056e-05,
+       "loss": 1.7872,
+       "step": 120
+     },
+     {
+       "epoch": 1.0146520146520146,
+       "grad_norm": 5.531322956085205,
+       "learning_rate": 4.674355495251018e-05,
+       "loss": 1.8331,
+       "step": 130
+     },
+     {
+       "epoch": 1.0268620268620268,
+       "grad_norm": 7.1956353187561035,
+       "learning_rate": 4.60651289009498e-05,
+       "loss": 1.7703,
+       "step": 140
+     },
+     {
+       "epoch": 1.0390720390720392,
+       "grad_norm": 8.779479026794434,
+       "learning_rate": 4.5386702849389416e-05,
+       "loss": 1.6198,
+       "step": 150
+     },
+     {
+       "epoch": 1.0512820512820513,
+       "grad_norm": 10.621439933776855,
+       "learning_rate": 4.470827679782904e-05,
+       "loss": 1.6431,
+       "step": 160
+     },
+     {
+       "epoch": 1.0634920634920635,
+       "grad_norm": 7.896602630615234,
+       "learning_rate": 4.402985074626866e-05,
+       "loss": 1.6686,
+       "step": 170
+     },
+     {
+       "epoch": 1.0757020757020757,
+       "grad_norm": 6.063083171844482,
+       "learning_rate": 4.335142469470828e-05,
+       "loss": 1.4762,
+       "step": 180
+     },
+     {
+       "epoch": 1.0879120879120878,
+       "grad_norm": 10.578692436218262,
+       "learning_rate": 4.26729986431479e-05,
+       "loss": 1.6948,
+       "step": 190
+     },
+     {
+       "epoch": 1.1001221001221002,
+       "grad_norm": 8.612629890441895,
+       "learning_rate": 4.199457259158752e-05,
+       "loss": 1.5687,
+       "step": 200
+     },
+     {
+       "epoch": 1.1123321123321124,
+       "grad_norm": 8.634360313415527,
+       "learning_rate": 4.131614654002714e-05,
+       "loss": 1.4138,
+       "step": 210
+     },
+     {
+       "epoch": 1.1245421245421245,
+       "grad_norm": 8.842486381530762,
+       "learning_rate": 4.063772048846676e-05,
+       "loss": 1.4202,
+       "step": 220
+     },
+     {
+       "epoch": 1.1367521367521367,
+       "grad_norm": 8.472060203552246,
+       "learning_rate": 3.995929443690638e-05,
+       "loss": 1.3092,
+       "step": 230
+     },
+     {
+       "epoch": 1.144078144078144,
+       "eval_accuracy": 0.42718446601941745,
+       "eval_loss": 1.6427617073059082,
+       "eval_runtime": 21.3576,
+       "eval_samples_per_second": 9.645,
+       "eval_steps_per_second": 1.217,
+       "step": 236
+     },
+     {
+       "epoch": 2.004884004884005,
+       "grad_norm": 9.463373184204102,
+       "learning_rate": 3.9280868385345995e-05,
+       "loss": 1.5065,
+       "step": 240
+     },
+     {
+       "epoch": 2.017094017094017,
+       "grad_norm": 8.447351455688477,
+       "learning_rate": 3.860244233378562e-05,
+       "loss": 1.3116,
+       "step": 250
+     },
+     {
+       "epoch": 2.029304029304029,
+       "grad_norm": 9.61158561706543,
+       "learning_rate": 3.792401628222524e-05,
+       "loss": 1.0903,
+       "step": 260
+     },
+     {
+       "epoch": 2.0415140415140414,
+       "grad_norm": 10.502912521362305,
+       "learning_rate": 3.724559023066486e-05,
+       "loss": 1.1713,
+       "step": 270
+     },
+     {
+       "epoch": 2.0537240537240535,
+       "grad_norm": 11.798907279968262,
+       "learning_rate": 3.656716417910448e-05,
+       "loss": 1.1561,
+       "step": 280
+     },
+     {
+       "epoch": 2.065934065934066,
+       "grad_norm": 16.395263671875,
+       "learning_rate": 3.58887381275441e-05,
+       "loss": 1.2809,
+       "step": 290
+     },
+     {
+       "epoch": 2.0781440781440783,
+       "grad_norm": 11.183971405029297,
+       "learning_rate": 3.521031207598372e-05,
+       "loss": 1.1276,
+       "step": 300
+     },
+     {
+       "epoch": 2.0903540903540905,
+       "grad_norm": 10.743521690368652,
+       "learning_rate": 3.453188602442334e-05,
+       "loss": 1.3756,
+       "step": 310
+     },
+     {
+       "epoch": 2.1025641025641026,
+       "grad_norm": 8.513188362121582,
+       "learning_rate": 3.385345997286296e-05,
+       "loss": 0.9218,
+       "step": 320
+     },
+     {
+       "epoch": 2.114774114774115,
+       "grad_norm": 12.773633003234863,
+       "learning_rate": 3.3175033921302575e-05,
+       "loss": 1.2335,
+       "step": 330
+     },
+     {
+       "epoch": 2.126984126984127,
+       "grad_norm": 11.963506698608398,
+       "learning_rate": 3.24966078697422e-05,
+       "loss": 1.1391,
+       "step": 340
+     },
+     {
+       "epoch": 2.139194139194139,
+       "grad_norm": 10.355384826660156,
+       "learning_rate": 3.181818181818182e-05,
+       "loss": 1.0096,
+       "step": 350
+     },
+     {
+       "epoch": 2.144078144078144,
+       "eval_accuracy": 0.47572815533980584,
+       "eval_loss": 1.4351158142089844,
+       "eval_runtime": 27.857,
+       "eval_samples_per_second": 7.395,
+       "eval_steps_per_second": 0.933,
+       "step": 354
+     },
+     {
+       "epoch": 3.0073260073260073,
+       "grad_norm": 10.925836563110352,
+       "learning_rate": 3.113975576662144e-05,
+       "loss": 0.9783,
+       "step": 360
+     },
+     {
+       "epoch": 3.0195360195360195,
+       "grad_norm": 6.559803009033203,
+       "learning_rate": 3.046132971506106e-05,
+       "loss": 0.8608,
+       "step": 370
+     },
+     {
+       "epoch": 3.0317460317460316,
+       "grad_norm": 10.63405990600586,
+       "learning_rate": 2.9782903663500678e-05,
+       "loss": 0.7684,
+       "step": 380
+     },
+     {
+       "epoch": 3.043956043956044,
+       "grad_norm": 11.322102546691895,
+       "learning_rate": 2.91044776119403e-05,
+       "loss": 0.8308,
+       "step": 390
+     },
+     {
+       "epoch": 3.056166056166056,
+       "grad_norm": 5.719593524932861,
+       "learning_rate": 2.842605156037992e-05,
+       "loss": 0.7868,
+       "step": 400
+     },
+     {
+       "epoch": 3.0683760683760686,
+       "grad_norm": 13.967921257019043,
+       "learning_rate": 2.7747625508819542e-05,
+       "loss": 0.7707,
+       "step": 410
+     },
+     {
+       "epoch": 3.0805860805860807,
+       "grad_norm": 9.134116172790527,
+       "learning_rate": 2.7069199457259158e-05,
+       "loss": 0.5804,
+       "step": 420
+     },
+     {
+       "epoch": 3.092796092796093,
+       "grad_norm": 14.580177307128906,
+       "learning_rate": 2.639077340569878e-05,
+       "loss": 0.7846,
+       "step": 430
+     },
+     {
+       "epoch": 3.105006105006105,
+       "grad_norm": 12.265727043151855,
+       "learning_rate": 2.57123473541384e-05,
+       "loss": 0.6807,
+       "step": 440
+     },
+     {
+       "epoch": 3.1172161172161172,
+       "grad_norm": 15.816527366638184,
+       "learning_rate": 2.5033921302578023e-05,
+       "loss": 0.6878,
+       "step": 450
+     },
+     {
+       "epoch": 3.1294261294261294,
+       "grad_norm": 9.39810848236084,
+       "learning_rate": 2.4355495251017642e-05,
+       "loss": 0.5625,
+       "step": 460
+     },
+     {
+       "epoch": 3.1416361416361416,
+       "grad_norm": 7.950680732727051,
+       "learning_rate": 2.367706919945726e-05,
+       "loss": 0.604,
+       "step": 470
+     },
+     {
+       "epoch": 3.144078144078144,
+       "eval_accuracy": 0.5,
+       "eval_loss": 1.3919281959533691,
+       "eval_runtime": 21.6391,
+       "eval_samples_per_second": 9.52,
+       "eval_steps_per_second": 1.202,
+       "step": 472
+     },
+     {
+       "epoch": 4.00976800976801,
+       "grad_norm": 3.859422206878662,
+       "learning_rate": 2.299864314789688e-05,
+       "loss": 0.3818,
+       "step": 480
+     },
+     {
+       "epoch": 4.021978021978022,
+       "grad_norm": 4.586574077606201,
+       "learning_rate": 2.2320217096336503e-05,
+       "loss": 0.3743,
+       "step": 490
+     },
+     {
+       "epoch": 4.034188034188034,
+       "grad_norm": 11.923030853271484,
+       "learning_rate": 2.164179104477612e-05,
+       "loss": 0.4857,
+       "step": 500
+     },
+     {
+       "epoch": 4.046398046398046,
+       "grad_norm": 8.866025924682617,
+       "learning_rate": 2.0963364993215738e-05,
+       "loss": 0.5601,
+       "step": 510
+     },
+     {
+       "epoch": 4.058608058608058,
+       "grad_norm": 8.028688430786133,
+       "learning_rate": 2.028493894165536e-05,
+       "loss": 0.4649,
+       "step": 520
+     },
+     {
+       "epoch": 4.070818070818071,
+       "grad_norm": 8.852441787719727,
+       "learning_rate": 1.960651289009498e-05,
+       "loss": 0.3592,
+       "step": 530
+     },
+     {
+       "epoch": 4.083028083028083,
+       "grad_norm": 22.12917137145996,
+       "learning_rate": 1.89280868385346e-05,
+       "loss": 0.4787,
+       "step": 540
+     },
+     {
+       "epoch": 4.095238095238095,
+       "grad_norm": 9.262681007385254,
+       "learning_rate": 1.824966078697422e-05,
+       "loss": 0.4364,
+       "step": 550
+     },
+     {
+       "epoch": 4.107448107448107,
+       "grad_norm": 5.102321624755859,
+       "learning_rate": 1.757123473541384e-05,
+       "loss": 0.3868,
+       "step": 560
+     },
+     {
+       "epoch": 4.119658119658119,
+       "grad_norm": 13.144558906555176,
+       "learning_rate": 1.689280868385346e-05,
+       "loss": 0.4023,
+       "step": 570
+     },
+     {
+       "epoch": 4.131868131868132,
+       "grad_norm": 16.35342788696289,
+       "learning_rate": 1.6214382632293083e-05,
+       "loss": 0.4355,
+       "step": 580
+     },
+     {
+       "epoch": 4.1440781440781445,
+       "grad_norm": 12.760746955871582,
+       "learning_rate": 1.55359565807327e-05,
+       "loss": 0.2381,
+       "step": 590
+     },
+     {
+       "epoch": 4.1440781440781445,
+       "eval_accuracy": 0.5436893203883495,
+       "eval_loss": 1.355545163154602,
+       "eval_runtime": 21.2157,
+       "eval_samples_per_second": 9.71,
+       "eval_steps_per_second": 1.226,
+       "step": 590
+     },
+     {
+       "epoch": 5.012210012210012,
+       "grad_norm": 7.593497276306152,
+       "learning_rate": 1.485753052917232e-05,
+       "loss": 0.2075,
+       "step": 600
+     },
+     {
+       "epoch": 5.024420024420024,
+       "grad_norm": 4.697848320007324,
+       "learning_rate": 1.417910447761194e-05,
+       "loss": 0.2499,
+       "step": 610
+     },
+     {
+       "epoch": 5.0366300366300365,
+       "grad_norm": 5.646294116973877,
+       "learning_rate": 1.3500678426051561e-05,
+       "loss": 0.1955,
+       "step": 620
+     },
+     {
+       "epoch": 5.048840048840049,
+       "grad_norm": 1.646572232246399,
+       "learning_rate": 1.282225237449118e-05,
+       "loss": 0.2132,
+       "step": 630
+     },
+     {
+       "epoch": 5.061050061050061,
+       "grad_norm": 13.153250694274902,
+       "learning_rate": 1.2143826322930801e-05,
+       "loss": 0.2291,
+       "step": 640
+     },
+     {
+       "epoch": 5.073260073260073,
+       "grad_norm": 3.749263286590576,
+       "learning_rate": 1.1465400271370422e-05,
+       "loss": 0.2316,
+       "step": 650
+     },
+     {
+       "epoch": 5.085470085470085,
+       "grad_norm": 1.2367647886276245,
+       "learning_rate": 1.0786974219810041e-05,
+       "loss": 0.1593,
+       "step": 660
+     },
+     {
+       "epoch": 5.097680097680097,
+       "grad_norm": 14.28999137878418,
+       "learning_rate": 1.010854816824966e-05,
+       "loss": 0.1864,
+       "step": 670
+     },
+     {
+       "epoch": 5.1098901098901095,
+       "grad_norm": 4.065025329589844,
+       "learning_rate": 9.430122116689281e-06,
+       "loss": 0.261,
+       "step": 680
+     },
+     {
+       "epoch": 5.122100122100122,
+       "grad_norm": 5.471700668334961,
+       "learning_rate": 8.751696065128902e-06,
+       "loss": 0.1759,
+       "step": 690
+     },
+     {
+       "epoch": 5.134310134310135,
+       "grad_norm": 1.2977887392044067,
+       "learning_rate": 8.073270013568522e-06,
+       "loss": 0.2201,
+       "step": 700
+     },
+     {
+       "epoch": 5.1440781440781445,
+       "eval_accuracy": 0.5776699029126213,
+       "eval_loss": 1.3875343799591064,
+       "eval_runtime": 33.2439,
+       "eval_samples_per_second": 6.197,
+       "eval_steps_per_second": 0.782,
+       "step": 708
+     },
+     {
+       "epoch": 6.002442002442002,
+       "grad_norm": 5.552870750427246,
+       "learning_rate": 7.394843962008141e-06,
+       "loss": 0.2692,
+       "step": 710
+     },
+     {
+       "epoch": 6.014652014652015,
+       "grad_norm": 20.285839080810547,
+       "learning_rate": 6.716417910447762e-06,
+       "loss": 0.1318,
+       "step": 720
+     },
+     {
+       "epoch": 6.026862026862027,
+       "grad_norm": 1.199399471282959,
+       "learning_rate": 6.037991858887382e-06,
+       "loss": 0.0786,
+       "step": 730
+     },
+     {
+       "epoch": 6.039072039072039,
+       "grad_norm": 0.7117233872413635,
+       "learning_rate": 5.359565807327002e-06,
+       "loss": 0.0873,
+       "step": 740
+     },
+     {
+       "epoch": 6.051282051282051,
+       "grad_norm": 1.9136446714401245,
+       "learning_rate": 4.681139755766622e-06,
+       "loss": 0.1112,
+       "step": 750
+     },
+     {
+       "epoch": 6.063492063492063,
+       "grad_norm": 3.076906204223633,
+       "learning_rate": 4.002713704206242e-06,
+       "loss": 0.1026,
+       "step": 760
+     },
+     {
+       "epoch": 6.075702075702075,
+       "grad_norm": 26.244754791259766,
+       "learning_rate": 3.324287652645862e-06,
+       "loss": 0.1529,
+       "step": 770
+     },
+     {
+       "epoch": 6.087912087912088,
+       "grad_norm": 0.7138678431510925,
+       "learning_rate": 2.645861601085482e-06,
+       "loss": 0.1352,
+       "step": 780
+     },
+     {
+       "epoch": 6.1001221001221,
+       "grad_norm": 10.830814361572266,
+       "learning_rate": 1.967435549525102e-06,
+       "loss": 0.0969,
+       "step": 790
+     },
+     {
+       "epoch": 6.112332112332112,
+       "grad_norm": 5.247445106506348,
+       "learning_rate": 1.289009497964722e-06,
+       "loss": 0.1702,
+       "step": 800
+     },
+     {
+       "epoch": 6.124542124542124,
+       "grad_norm": 1.4733346700668335,
+       "learning_rate": 6.10583446404342e-07,
+       "loss": 0.1171,
+       "step": 810
+     },
+     {
+       "epoch": 6.135531135531136,
+       "eval_accuracy": 0.6019417475728155,
+       "eval_loss": 1.3527742624282837,
+       "eval_runtime": 23.7516,
+       "eval_samples_per_second": 8.673,
+       "eval_steps_per_second": 1.095,
+       "step": 819
+     },
+     {
+       "epoch": 6.135531135531136,
+       "step": 819,
+       "total_flos": 8.149698472747991e+18,
+       "train_loss": 0.8887154050216861,
+       "train_runtime": 1543.5432,
+       "train_samples_per_second": 4.245,
+       "train_steps_per_second": 0.531
+     },
+     {
+       "epoch": 6.135531135531136,
+       "eval_accuracy": 0.5756097560975609,
+       "eval_loss": 1.497787594795227,
+       "eval_runtime": 33.2888,
+       "eval_samples_per_second": 6.158,
+       "eval_steps_per_second": 0.781,
+       "step": 819
+     },
+     {
+       "epoch": 6.135531135531136,
+       "eval_accuracy": 0.5756097560975609,
+       "eval_loss": 1.497787356376648,
+       "eval_runtime": 33.1995,
+       "eval_samples_per_second": 6.175,
+       "eval_steps_per_second": 0.783,
+       "step": 819
+     }
+   ],
+   "logging_steps": 10,
+   "max_steps": 819,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 9223372036854775807,
+   "save_steps": 500,
+   "stateful_callbacks": {
+     "EarlyStoppingCallback": {
+       "args": {
+         "early_stopping_patience": 3,
+         "early_stopping_threshold": 0.0
+       },
+       "attributes": {
+         "early_stopping_patience_counter": 0
+       }
+     },
+     "TrainerControl": {
+       "args": {
+         "should_epoch_stop": false,
+         "should_evaluate": false,
+         "should_log": false,
+         "should_save": true,
+         "should_training_stop": true
+       },
+       "attributes": {}
+     }
+   },
+   "total_flos": 8.149698472747991e+18,
+   "train_batch_size": 8,
+   "trial_name": null,
+   "trial_params": null
+ }
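
trainer_state.json interleaves training-loss records and evaluation records in log_history; the evaluation records can be picked out by the presence of eval_accuracy. A minimal sketch of recovering the per-evaluation accuracy curve and the best checkpoint, assuming the file has been downloaded locally (path is illustrative):

import json

# Illustrative path; adjust to wherever the repo is checked out.
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the log entries that carry evaluation metrics.
evals = [entry for entry in state["log_history"] if "eval_accuracy" in entry]
for entry in evals:
    print(f"step {entry['step']:>4}  epoch {entry['epoch']:.2f}  "
          f"eval_accuracy {entry['eval_accuracy']:.4f}  eval_loss {entry['eval_loss']:.4f}")

print("best metric:", state["best_metric"], "at", state["best_model_checkpoint"])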