NiharGupte commited on
Commit
e833e43
1 Parent(s): 760618c

Training in progress, epoch 1

Browse files
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 20.0,
3
+ "total_flos": 6.302667737382912e+17,
4
+ "train_loss": 0.13929352825309368,
5
+ "train_runtime": 703.6937,
6
+ "train_samples_per_second": 42.177,
7
+ "train_steps_per_second": 1.336
8
+ }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:306ecec9a84b992d1f4c8c5d230e2425aef41c22041fe256b8c6cf55d2f03b68
3
  size 94302952
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:780eabfb1ede8e9e88f021a6b3eb3e36687167bee7cd7ec380ba2499c1df5c17
3
  size 94302952
runs/May04_07-50-27_4f22111e1b44/events.out.tfevents.1714809034.4f22111e1b44.9006.3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d895d001f660732d94a8edaf7b514109fafece53dd472db6f7e499142d751d62
3
+ size 5991
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 20.0,
3
+ "total_flos": 6.302667737382912e+17,
4
+ "train_loss": 0.13929352825309368,
5
+ "train_runtime": 703.6937,
6
+ "train_samples_per_second": 42.177,
7
+ "train_steps_per_second": 1.336
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,868 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 1.0,
3
+ "best_model_checkpoint": "resnet-50-finetuned-student_kaggle/checkpoint-423",
4
+ "epoch": 20.0,
5
+ "eval_steps": 500,
6
+ "global_step": 940,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.2127659574468085,
13
+ "grad_norm": 54.35947799682617,
14
+ "learning_rate": 5.319148936170213e-06,
15
+ "loss": 0.9341,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.425531914893617,
20
+ "grad_norm": 34.59556579589844,
21
+ "learning_rate": 1.0638297872340426e-05,
22
+ "loss": 0.9157,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.6382978723404256,
27
+ "grad_norm": 42.179847717285156,
28
+ "learning_rate": 1.595744680851064e-05,
29
+ "loss": 0.7801,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.851063829787234,
34
+ "grad_norm": 30.099029541015625,
35
+ "learning_rate": 2.1276595744680852e-05,
36
+ "loss": 0.7142,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 1.0,
41
+ "eval_accuracy": 0.610062893081761,
42
+ "eval_loss": 0.6418222188949585,
43
+ "eval_runtime": 7.6299,
44
+ "eval_samples_per_second": 83.356,
45
+ "eval_steps_per_second": 2.621,
46
+ "step": 47
47
+ },
48
+ {
49
+ "epoch": 1.0638297872340425,
50
+ "grad_norm": 48.046546936035156,
51
+ "learning_rate": 2.6595744680851064e-05,
52
+ "loss": 0.7114,
53
+ "step": 50
54
+ },
55
+ {
56
+ "epoch": 1.2765957446808511,
57
+ "grad_norm": 45.94879913330078,
58
+ "learning_rate": 3.191489361702128e-05,
59
+ "loss": 0.6014,
60
+ "step": 60
61
+ },
62
+ {
63
+ "epoch": 1.4893617021276595,
64
+ "grad_norm": 17.69209861755371,
65
+ "learning_rate": 3.723404255319149e-05,
66
+ "loss": 0.4815,
67
+ "step": 70
68
+ },
69
+ {
70
+ "epoch": 1.702127659574468,
71
+ "grad_norm": 18.821670532226562,
72
+ "learning_rate": 4.2553191489361704e-05,
73
+ "loss": 0.463,
74
+ "step": 80
75
+ },
76
+ {
77
+ "epoch": 1.9148936170212765,
78
+ "grad_norm": 23.751588821411133,
79
+ "learning_rate": 4.787234042553192e-05,
80
+ "loss": 0.3351,
81
+ "step": 90
82
+ },
83
+ {
84
+ "epoch": 2.0,
85
+ "eval_accuracy": 0.8946540880503144,
86
+ "eval_loss": 0.25965991616249084,
87
+ "eval_runtime": 7.6659,
88
+ "eval_samples_per_second": 82.964,
89
+ "eval_steps_per_second": 2.609,
90
+ "step": 94
91
+ },
92
+ {
93
+ "epoch": 2.127659574468085,
94
+ "grad_norm": 18.11069679260254,
95
+ "learning_rate": 4.964539007092199e-05,
96
+ "loss": 0.3193,
97
+ "step": 100
98
+ },
99
+ {
100
+ "epoch": 2.3404255319148937,
101
+ "grad_norm": 12.397391319274902,
102
+ "learning_rate": 4.905437352245863e-05,
103
+ "loss": 0.2768,
104
+ "step": 110
105
+ },
106
+ {
107
+ "epoch": 2.5531914893617023,
108
+ "grad_norm": 16.857635498046875,
109
+ "learning_rate": 4.846335697399527e-05,
110
+ "loss": 0.2594,
111
+ "step": 120
112
+ },
113
+ {
114
+ "epoch": 2.7659574468085104,
115
+ "grad_norm": 12.635449409484863,
116
+ "learning_rate": 4.787234042553192e-05,
117
+ "loss": 0.2063,
118
+ "step": 130
119
+ },
120
+ {
121
+ "epoch": 2.978723404255319,
122
+ "grad_norm": 15.277303695678711,
123
+ "learning_rate": 4.728132387706856e-05,
124
+ "loss": 0.2574,
125
+ "step": 140
126
+ },
127
+ {
128
+ "epoch": 3.0,
129
+ "eval_accuracy": 0.9779874213836478,
130
+ "eval_loss": 0.10460298508405685,
131
+ "eval_runtime": 8.3391,
132
+ "eval_samples_per_second": 76.267,
133
+ "eval_steps_per_second": 2.398,
134
+ "step": 141
135
+ },
136
+ {
137
+ "epoch": 3.1914893617021276,
138
+ "grad_norm": 14.497098922729492,
139
+ "learning_rate": 4.669030732860521e-05,
140
+ "loss": 0.2349,
141
+ "step": 150
142
+ },
143
+ {
144
+ "epoch": 3.404255319148936,
145
+ "grad_norm": 17.647092819213867,
146
+ "learning_rate": 4.609929078014185e-05,
147
+ "loss": 0.1631,
148
+ "step": 160
149
+ },
150
+ {
151
+ "epoch": 3.617021276595745,
152
+ "grad_norm": 12.856146812438965,
153
+ "learning_rate": 4.550827423167849e-05,
154
+ "loss": 0.1675,
155
+ "step": 170
156
+ },
157
+ {
158
+ "epoch": 3.829787234042553,
159
+ "grad_norm": 7.248583793640137,
160
+ "learning_rate": 4.491725768321513e-05,
161
+ "loss": 0.1479,
162
+ "step": 180
163
+ },
164
+ {
165
+ "epoch": 4.0,
166
+ "eval_accuracy": 0.9874213836477987,
167
+ "eval_loss": 0.061614990234375,
168
+ "eval_runtime": 8.4097,
169
+ "eval_samples_per_second": 75.627,
170
+ "eval_steps_per_second": 2.378,
171
+ "step": 188
172
+ },
173
+ {
174
+ "epoch": 4.042553191489362,
175
+ "grad_norm": 25.721847534179688,
176
+ "learning_rate": 4.432624113475177e-05,
177
+ "loss": 0.1528,
178
+ "step": 190
179
+ },
180
+ {
181
+ "epoch": 4.25531914893617,
182
+ "grad_norm": 6.252942085266113,
183
+ "learning_rate": 4.373522458628842e-05,
184
+ "loss": 0.145,
185
+ "step": 200
186
+ },
187
+ {
188
+ "epoch": 4.468085106382979,
189
+ "grad_norm": 6.672601222991943,
190
+ "learning_rate": 4.3144208037825064e-05,
191
+ "loss": 0.1247,
192
+ "step": 210
193
+ },
194
+ {
195
+ "epoch": 4.680851063829787,
196
+ "grad_norm": 20.4866886138916,
197
+ "learning_rate": 4.2553191489361704e-05,
198
+ "loss": 0.1405,
199
+ "step": 220
200
+ },
201
+ {
202
+ "epoch": 4.8936170212765955,
203
+ "grad_norm": 19.644893646240234,
204
+ "learning_rate": 4.1962174940898345e-05,
205
+ "loss": 0.1284,
206
+ "step": 230
207
+ },
208
+ {
209
+ "epoch": 5.0,
210
+ "eval_accuracy": 0.9952830188679245,
211
+ "eval_loss": 0.02317511849105358,
212
+ "eval_runtime": 10.3441,
213
+ "eval_samples_per_second": 61.485,
214
+ "eval_steps_per_second": 1.933,
215
+ "step": 235
216
+ },
217
+ {
218
+ "epoch": 5.1063829787234045,
219
+ "grad_norm": 10.905556678771973,
220
+ "learning_rate": 4.1371158392434986e-05,
221
+ "loss": 0.1178,
222
+ "step": 240
223
+ },
224
+ {
225
+ "epoch": 5.319148936170213,
226
+ "grad_norm": 11.02078628540039,
227
+ "learning_rate": 4.078014184397163e-05,
228
+ "loss": 0.1176,
229
+ "step": 250
230
+ },
231
+ {
232
+ "epoch": 5.531914893617021,
233
+ "grad_norm": 7.510810375213623,
234
+ "learning_rate": 4.018912529550828e-05,
235
+ "loss": 0.0881,
236
+ "step": 260
237
+ },
238
+ {
239
+ "epoch": 5.74468085106383,
240
+ "grad_norm": 2.0541610717773438,
241
+ "learning_rate": 3.959810874704492e-05,
242
+ "loss": 0.1274,
243
+ "step": 270
244
+ },
245
+ {
246
+ "epoch": 5.957446808510638,
247
+ "grad_norm": 7.680713653564453,
248
+ "learning_rate": 3.900709219858156e-05,
249
+ "loss": 0.077,
250
+ "step": 280
251
+ },
252
+ {
253
+ "epoch": 6.0,
254
+ "eval_accuracy": 0.9952830188679245,
255
+ "eval_loss": 0.015012426301836967,
256
+ "eval_runtime": 8.4118,
257
+ "eval_samples_per_second": 75.608,
258
+ "eval_steps_per_second": 2.378,
259
+ "step": 282
260
+ },
261
+ {
262
+ "epoch": 6.170212765957447,
263
+ "grad_norm": 8.117420196533203,
264
+ "learning_rate": 3.84160756501182e-05,
265
+ "loss": 0.172,
266
+ "step": 290
267
+ },
268
+ {
269
+ "epoch": 6.382978723404255,
270
+ "grad_norm": 23.871868133544922,
271
+ "learning_rate": 3.782505910165485e-05,
272
+ "loss": 0.0613,
273
+ "step": 300
274
+ },
275
+ {
276
+ "epoch": 6.595744680851064,
277
+ "grad_norm": 15.407998085021973,
278
+ "learning_rate": 3.723404255319149e-05,
279
+ "loss": 0.1287,
280
+ "step": 310
281
+ },
282
+ {
283
+ "epoch": 6.808510638297872,
284
+ "grad_norm": 6.940992832183838,
285
+ "learning_rate": 3.664302600472813e-05,
286
+ "loss": 0.103,
287
+ "step": 320
288
+ },
289
+ {
290
+ "epoch": 7.0,
291
+ "eval_accuracy": 0.9984276729559748,
292
+ "eval_loss": 0.010532047599554062,
293
+ "eval_runtime": 7.9689,
294
+ "eval_samples_per_second": 79.81,
295
+ "eval_steps_per_second": 2.51,
296
+ "step": 329
297
+ },
298
+ {
299
+ "epoch": 7.0212765957446805,
300
+ "grad_norm": 4.598968029022217,
301
+ "learning_rate": 3.605200945626478e-05,
302
+ "loss": 0.0906,
303
+ "step": 330
304
+ },
305
+ {
306
+ "epoch": 7.23404255319149,
307
+ "grad_norm": 7.53684663772583,
308
+ "learning_rate": 3.546099290780142e-05,
309
+ "loss": 0.0792,
310
+ "step": 340
311
+ },
312
+ {
313
+ "epoch": 7.446808510638298,
314
+ "grad_norm": 2.750072479248047,
315
+ "learning_rate": 3.4869976359338065e-05,
316
+ "loss": 0.091,
317
+ "step": 350
318
+ },
319
+ {
320
+ "epoch": 7.659574468085106,
321
+ "grad_norm": 4.067008018493652,
322
+ "learning_rate": 3.4278959810874706e-05,
323
+ "loss": 0.0777,
324
+ "step": 360
325
+ },
326
+ {
327
+ "epoch": 7.872340425531915,
328
+ "grad_norm": 15.093037605285645,
329
+ "learning_rate": 3.3687943262411347e-05,
330
+ "loss": 0.0922,
331
+ "step": 370
332
+ },
333
+ {
334
+ "epoch": 8.0,
335
+ "eval_accuracy": 0.9984276729559748,
336
+ "eval_loss": 0.009353628382086754,
337
+ "eval_runtime": 7.7264,
338
+ "eval_samples_per_second": 82.315,
339
+ "eval_steps_per_second": 2.589,
340
+ "step": 376
341
+ },
342
+ {
343
+ "epoch": 8.085106382978724,
344
+ "grad_norm": 8.675619125366211,
345
+ "learning_rate": 3.309692671394799e-05,
346
+ "loss": 0.1211,
347
+ "step": 380
348
+ },
349
+ {
350
+ "epoch": 8.297872340425531,
351
+ "grad_norm": 5.723608493804932,
352
+ "learning_rate": 3.2505910165484634e-05,
353
+ "loss": 0.0645,
354
+ "step": 390
355
+ },
356
+ {
357
+ "epoch": 8.51063829787234,
358
+ "grad_norm": 8.031245231628418,
359
+ "learning_rate": 3.191489361702128e-05,
360
+ "loss": 0.0787,
361
+ "step": 400
362
+ },
363
+ {
364
+ "epoch": 8.72340425531915,
365
+ "grad_norm": 2.483238935470581,
366
+ "learning_rate": 3.132387706855792e-05,
367
+ "loss": 0.0683,
368
+ "step": 410
369
+ },
370
+ {
371
+ "epoch": 8.936170212765958,
372
+ "grad_norm": 7.612273216247559,
373
+ "learning_rate": 3.073286052009456e-05,
374
+ "loss": 0.08,
375
+ "step": 420
376
+ },
377
+ {
378
+ "epoch": 9.0,
379
+ "eval_accuracy": 1.0,
380
+ "eval_loss": 0.00555912172421813,
381
+ "eval_runtime": 8.4427,
382
+ "eval_samples_per_second": 75.331,
383
+ "eval_steps_per_second": 2.369,
384
+ "step": 423
385
+ },
386
+ {
387
+ "epoch": 9.148936170212766,
388
+ "grad_norm": 2.0842654705047607,
389
+ "learning_rate": 3.0141843971631207e-05,
390
+ "loss": 0.0812,
391
+ "step": 430
392
+ },
393
+ {
394
+ "epoch": 9.361702127659575,
395
+ "grad_norm": 17.500883102416992,
396
+ "learning_rate": 2.9550827423167847e-05,
397
+ "loss": 0.0688,
398
+ "step": 440
399
+ },
400
+ {
401
+ "epoch": 9.574468085106384,
402
+ "grad_norm": 9.671988487243652,
403
+ "learning_rate": 2.895981087470449e-05,
404
+ "loss": 0.0837,
405
+ "step": 450
406
+ },
407
+ {
408
+ "epoch": 9.787234042553191,
409
+ "grad_norm": 13.061412811279297,
410
+ "learning_rate": 2.836879432624114e-05,
411
+ "loss": 0.1024,
412
+ "step": 460
413
+ },
414
+ {
415
+ "epoch": 10.0,
416
+ "grad_norm": 11.679757118225098,
417
+ "learning_rate": 2.777777777777778e-05,
418
+ "loss": 0.0492,
419
+ "step": 470
420
+ },
421
+ {
422
+ "epoch": 10.0,
423
+ "eval_accuracy": 1.0,
424
+ "eval_loss": 0.004496446345001459,
425
+ "eval_runtime": 9.0183,
426
+ "eval_samples_per_second": 70.523,
427
+ "eval_steps_per_second": 2.218,
428
+ "step": 470
429
+ },
430
+ {
431
+ "epoch": 10.212765957446809,
432
+ "grad_norm": 3.253716230392456,
433
+ "learning_rate": 2.7186761229314423e-05,
434
+ "loss": 0.0495,
435
+ "step": 480
436
+ },
437
+ {
438
+ "epoch": 10.425531914893616,
439
+ "grad_norm": 5.826026916503906,
440
+ "learning_rate": 2.6595744680851064e-05,
441
+ "loss": 0.0266,
442
+ "step": 490
443
+ },
444
+ {
445
+ "epoch": 10.638297872340425,
446
+ "grad_norm": 9.738965034484863,
447
+ "learning_rate": 2.6004728132387708e-05,
448
+ "loss": 0.0739,
449
+ "step": 500
450
+ },
451
+ {
452
+ "epoch": 10.851063829787234,
453
+ "grad_norm": 9.04916763305664,
454
+ "learning_rate": 2.5413711583924348e-05,
455
+ "loss": 0.0574,
456
+ "step": 510
457
+ },
458
+ {
459
+ "epoch": 11.0,
460
+ "eval_accuracy": 1.0,
461
+ "eval_loss": 0.004308629781007767,
462
+ "eval_runtime": 8.4135,
463
+ "eval_samples_per_second": 75.593,
464
+ "eval_steps_per_second": 2.377,
465
+ "step": 517
466
+ },
467
+ {
468
+ "epoch": 11.063829787234043,
469
+ "grad_norm": 11.041916847229004,
470
+ "learning_rate": 2.4822695035460995e-05,
471
+ "loss": 0.0977,
472
+ "step": 520
473
+ },
474
+ {
475
+ "epoch": 11.27659574468085,
476
+ "grad_norm": 31.96723175048828,
477
+ "learning_rate": 2.4231678486997636e-05,
478
+ "loss": 0.0541,
479
+ "step": 530
480
+ },
481
+ {
482
+ "epoch": 11.48936170212766,
483
+ "grad_norm": 2.877957344055176,
484
+ "learning_rate": 2.364066193853428e-05,
485
+ "loss": 0.0401,
486
+ "step": 540
487
+ },
488
+ {
489
+ "epoch": 11.702127659574469,
490
+ "grad_norm": 3.020596742630005,
491
+ "learning_rate": 2.3049645390070924e-05,
492
+ "loss": 0.0284,
493
+ "step": 550
494
+ },
495
+ {
496
+ "epoch": 11.914893617021276,
497
+ "grad_norm": 7.811126232147217,
498
+ "learning_rate": 2.2458628841607564e-05,
499
+ "loss": 0.0382,
500
+ "step": 560
501
+ },
502
+ {
503
+ "epoch": 12.0,
504
+ "eval_accuracy": 1.0,
505
+ "eval_loss": 0.0022806336637586355,
506
+ "eval_runtime": 7.7232,
507
+ "eval_samples_per_second": 82.35,
508
+ "eval_steps_per_second": 2.59,
509
+ "step": 564
510
+ },
511
+ {
512
+ "epoch": 12.127659574468085,
513
+ "grad_norm": 11.633199691772461,
514
+ "learning_rate": 2.186761229314421e-05,
515
+ "loss": 0.0589,
516
+ "step": 570
517
+ },
518
+ {
519
+ "epoch": 12.340425531914894,
520
+ "grad_norm": 0.3253759443759918,
521
+ "learning_rate": 2.1276595744680852e-05,
522
+ "loss": 0.041,
523
+ "step": 580
524
+ },
525
+ {
526
+ "epoch": 12.553191489361701,
527
+ "grad_norm": 2.2859044075012207,
528
+ "learning_rate": 2.0685579196217493e-05,
529
+ "loss": 0.0578,
530
+ "step": 590
531
+ },
532
+ {
533
+ "epoch": 12.76595744680851,
534
+ "grad_norm": 1.5324259996414185,
535
+ "learning_rate": 2.009456264775414e-05,
536
+ "loss": 0.0312,
537
+ "step": 600
538
+ },
539
+ {
540
+ "epoch": 12.97872340425532,
541
+ "grad_norm": 6.985143661499023,
542
+ "learning_rate": 1.950354609929078e-05,
543
+ "loss": 0.0666,
544
+ "step": 610
545
+ },
546
+ {
547
+ "epoch": 13.0,
548
+ "eval_accuracy": 1.0,
549
+ "eval_loss": 0.0022491966374218464,
550
+ "eval_runtime": 8.1081,
551
+ "eval_samples_per_second": 78.44,
552
+ "eval_steps_per_second": 2.467,
553
+ "step": 611
554
+ },
555
+ {
556
+ "epoch": 13.191489361702128,
557
+ "grad_norm": 8.847366333007812,
558
+ "learning_rate": 1.8912529550827425e-05,
559
+ "loss": 0.0539,
560
+ "step": 620
561
+ },
562
+ {
563
+ "epoch": 13.404255319148936,
564
+ "grad_norm": 10.476814270019531,
565
+ "learning_rate": 1.8321513002364065e-05,
566
+ "loss": 0.0369,
567
+ "step": 630
568
+ },
569
+ {
570
+ "epoch": 13.617021276595745,
571
+ "grad_norm": 5.339621067047119,
572
+ "learning_rate": 1.773049645390071e-05,
573
+ "loss": 0.0308,
574
+ "step": 640
575
+ },
576
+ {
577
+ "epoch": 13.829787234042554,
578
+ "grad_norm": 14.648975372314453,
579
+ "learning_rate": 1.7139479905437353e-05,
580
+ "loss": 0.0477,
581
+ "step": 650
582
+ },
583
+ {
584
+ "epoch": 14.0,
585
+ "eval_accuracy": 1.0,
586
+ "eval_loss": 0.0021932125091552734,
587
+ "eval_runtime": 8.4493,
588
+ "eval_samples_per_second": 75.272,
589
+ "eval_steps_per_second": 2.367,
590
+ "step": 658
591
+ },
592
+ {
593
+ "epoch": 14.042553191489361,
594
+ "grad_norm": 5.510159969329834,
595
+ "learning_rate": 1.6548463356973994e-05,
596
+ "loss": 0.028,
597
+ "step": 660
598
+ },
599
+ {
600
+ "epoch": 14.25531914893617,
601
+ "grad_norm": 5.803068161010742,
602
+ "learning_rate": 1.595744680851064e-05,
603
+ "loss": 0.0522,
604
+ "step": 670
605
+ },
606
+ {
607
+ "epoch": 14.46808510638298,
608
+ "grad_norm": 1.1623107194900513,
609
+ "learning_rate": 1.536643026004728e-05,
610
+ "loss": 0.0481,
611
+ "step": 680
612
+ },
613
+ {
614
+ "epoch": 14.680851063829786,
615
+ "grad_norm": 12.495600700378418,
616
+ "learning_rate": 1.4775413711583924e-05,
617
+ "loss": 0.0588,
618
+ "step": 690
619
+ },
620
+ {
621
+ "epoch": 14.893617021276595,
622
+ "grad_norm": 3.4236888885498047,
623
+ "learning_rate": 1.418439716312057e-05,
624
+ "loss": 0.0614,
625
+ "step": 700
626
+ },
627
+ {
628
+ "epoch": 15.0,
629
+ "eval_accuracy": 1.0,
630
+ "eval_loss": 0.002270177938044071,
631
+ "eval_runtime": 8.5563,
632
+ "eval_samples_per_second": 74.331,
633
+ "eval_steps_per_second": 2.337,
634
+ "step": 705
635
+ },
636
+ {
637
+ "epoch": 15.106382978723405,
638
+ "grad_norm": 11.681058883666992,
639
+ "learning_rate": 1.3593380614657212e-05,
640
+ "loss": 0.0674,
641
+ "step": 710
642
+ },
643
+ {
644
+ "epoch": 15.319148936170214,
645
+ "grad_norm": 1.846946120262146,
646
+ "learning_rate": 1.3002364066193854e-05,
647
+ "loss": 0.0415,
648
+ "step": 720
649
+ },
650
+ {
651
+ "epoch": 15.53191489361702,
652
+ "grad_norm": 8.939858436584473,
653
+ "learning_rate": 1.2411347517730498e-05,
654
+ "loss": 0.0189,
655
+ "step": 730
656
+ },
657
+ {
658
+ "epoch": 15.74468085106383,
659
+ "grad_norm": 3.521784782409668,
660
+ "learning_rate": 1.182033096926714e-05,
661
+ "loss": 0.0585,
662
+ "step": 740
663
+ },
664
+ {
665
+ "epoch": 15.957446808510639,
666
+ "grad_norm": 1.9891993999481201,
667
+ "learning_rate": 1.1229314420803782e-05,
668
+ "loss": 0.0282,
669
+ "step": 750
670
+ },
671
+ {
672
+ "epoch": 16.0,
673
+ "eval_accuracy": 1.0,
674
+ "eval_loss": 0.0013930280692875385,
675
+ "eval_runtime": 8.1789,
676
+ "eval_samples_per_second": 77.761,
677
+ "eval_steps_per_second": 2.445,
678
+ "step": 752
679
+ },
680
+ {
681
+ "epoch": 16.170212765957448,
682
+ "grad_norm": 7.0705246925354,
683
+ "learning_rate": 1.0638297872340426e-05,
684
+ "loss": 0.0508,
685
+ "step": 760
686
+ },
687
+ {
688
+ "epoch": 16.382978723404257,
689
+ "grad_norm": 11.365514755249023,
690
+ "learning_rate": 1.004728132387707e-05,
691
+ "loss": 0.0393,
692
+ "step": 770
693
+ },
694
+ {
695
+ "epoch": 16.595744680851062,
696
+ "grad_norm": 8.82397747039795,
697
+ "learning_rate": 9.456264775413712e-06,
698
+ "loss": 0.0509,
699
+ "step": 780
700
+ },
701
+ {
702
+ "epoch": 16.80851063829787,
703
+ "grad_norm": 5.013731002807617,
704
+ "learning_rate": 8.865248226950355e-06,
705
+ "loss": 0.0659,
706
+ "step": 790
707
+ },
708
+ {
709
+ "epoch": 17.0,
710
+ "eval_accuracy": 1.0,
711
+ "eval_loss": 0.0016287014586851,
712
+ "eval_runtime": 7.6703,
713
+ "eval_samples_per_second": 82.917,
714
+ "eval_steps_per_second": 2.607,
715
+ "step": 799
716
+ },
717
+ {
718
+ "epoch": 17.02127659574468,
719
+ "grad_norm": 2.8596644401550293,
720
+ "learning_rate": 8.274231678486997e-06,
721
+ "loss": 0.0285,
722
+ "step": 800
723
+ },
724
+ {
725
+ "epoch": 17.23404255319149,
726
+ "grad_norm": 10.184608459472656,
727
+ "learning_rate": 7.68321513002364e-06,
728
+ "loss": 0.062,
729
+ "step": 810
730
+ },
731
+ {
732
+ "epoch": 17.4468085106383,
733
+ "grad_norm": 6.029819011688232,
734
+ "learning_rate": 7.092198581560285e-06,
735
+ "loss": 0.0672,
736
+ "step": 820
737
+ },
738
+ {
739
+ "epoch": 17.659574468085108,
740
+ "grad_norm": 0.9212875366210938,
741
+ "learning_rate": 6.501182033096927e-06,
742
+ "loss": 0.0404,
743
+ "step": 830
744
+ },
745
+ {
746
+ "epoch": 17.872340425531917,
747
+ "grad_norm": 0.5147794485092163,
748
+ "learning_rate": 5.91016548463357e-06,
749
+ "loss": 0.0586,
750
+ "step": 840
751
+ },
752
+ {
753
+ "epoch": 18.0,
754
+ "eval_accuracy": 1.0,
755
+ "eval_loss": 0.0009691208251751959,
756
+ "eval_runtime": 8.4381,
757
+ "eval_samples_per_second": 75.373,
758
+ "eval_steps_per_second": 2.37,
759
+ "step": 846
760
+ },
761
+ {
762
+ "epoch": 18.085106382978722,
763
+ "grad_norm": 3.351142406463623,
764
+ "learning_rate": 5.319148936170213e-06,
765
+ "loss": 0.0333,
766
+ "step": 850
767
+ },
768
+ {
769
+ "epoch": 18.29787234042553,
770
+ "grad_norm": 7.570976257324219,
771
+ "learning_rate": 4.728132387706856e-06,
772
+ "loss": 0.0401,
773
+ "step": 860
774
+ },
775
+ {
776
+ "epoch": 18.51063829787234,
777
+ "grad_norm": 6.660007953643799,
778
+ "learning_rate": 4.137115839243498e-06,
779
+ "loss": 0.0329,
780
+ "step": 870
781
+ },
782
+ {
783
+ "epoch": 18.72340425531915,
784
+ "grad_norm": 11.373592376708984,
785
+ "learning_rate": 3.5460992907801423e-06,
786
+ "loss": 0.0523,
787
+ "step": 880
788
+ },
789
+ {
790
+ "epoch": 18.93617021276596,
791
+ "grad_norm": 4.553757667541504,
792
+ "learning_rate": 2.955082742316785e-06,
793
+ "loss": 0.0557,
794
+ "step": 890
795
+ },
796
+ {
797
+ "epoch": 19.0,
798
+ "eval_accuracy": 1.0,
799
+ "eval_loss": 0.0012750416062772274,
800
+ "eval_runtime": 8.3957,
801
+ "eval_samples_per_second": 75.753,
802
+ "eval_steps_per_second": 2.382,
803
+ "step": 893
804
+ },
805
+ {
806
+ "epoch": 19.148936170212767,
807
+ "grad_norm": 1.8293001651763916,
808
+ "learning_rate": 2.364066193853428e-06,
809
+ "loss": 0.0248,
810
+ "step": 900
811
+ },
812
+ {
813
+ "epoch": 19.361702127659573,
814
+ "grad_norm": 3.5974695682525635,
815
+ "learning_rate": 1.7730496453900712e-06,
816
+ "loss": 0.0298,
817
+ "step": 910
818
+ },
819
+ {
820
+ "epoch": 19.574468085106382,
821
+ "grad_norm": 4.631837844848633,
822
+ "learning_rate": 1.182033096926714e-06,
823
+ "loss": 0.0272,
824
+ "step": 920
825
+ },
826
+ {
827
+ "epoch": 19.78723404255319,
828
+ "grad_norm": 1.5552431344985962,
829
+ "learning_rate": 5.91016548463357e-07,
830
+ "loss": 0.0281,
831
+ "step": 930
832
+ },
833
+ {
834
+ "epoch": 20.0,
835
+ "grad_norm": 5.388515949249268,
836
+ "learning_rate": 0.0,
837
+ "loss": 0.07,
838
+ "step": 940
839
+ },
840
+ {
841
+ "epoch": 20.0,
842
+ "eval_accuracy": 1.0,
843
+ "eval_loss": 0.001178326434455812,
844
+ "eval_runtime": 8.4555,
845
+ "eval_samples_per_second": 75.217,
846
+ "eval_steps_per_second": 2.365,
847
+ "step": 940
848
+ },
849
+ {
850
+ "epoch": 20.0,
851
+ "step": 940,
852
+ "total_flos": 6.302667737382912e+17,
853
+ "train_loss": 0.13929352825309368,
854
+ "train_runtime": 703.6937,
855
+ "train_samples_per_second": 42.177,
856
+ "train_steps_per_second": 1.336
857
+ }
858
+ ],
859
+ "logging_steps": 10,
860
+ "max_steps": 940,
861
+ "num_input_tokens_seen": 0,
862
+ "num_train_epochs": 20,
863
+ "save_steps": 500,
864
+ "total_flos": 6.302667737382912e+17,
865
+ "train_batch_size": 32,
866
+ "trial_name": null,
867
+ "trial_params": null
868
+ }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:45165d80675de7e1625e0819e7f711729565fe46225805a51d6b0cb725a4d244
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e86333774fdc2cb3ed0a9195f9238c2f5423847f1665b9f8b4b6f56508f38d3c
3
  size 5048