t1msan committed on
Commit
dfa127e
1 Parent(s): a370735

End of training

Browse files
README.md CHANGED
@@ -17,7 +17,7 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  This model is a fine-tuned version of [microsoft/swin-tiny-patch4-window7-224](https://huggingface.co/microsoft/swin-tiny-patch4-window7-224) on the imagefolder dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 0.0711
21
 
22
  ## Model description
23
 
 
17
 
18
  This model is a fine-tuned version of [microsoft/swin-tiny-patch4-window7-224](https://huggingface.co/microsoft/swin-tiny-patch4-window7-224) on the imagefolder dataset.
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 0.0200
21
 
22
  ## Model description
23
 
all_results.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 5.0,
3
+ "eval_loss": 0.019963504746556282,
4
+ "eval_runtime": 89.035,
5
+ "eval_samples_per_second": 59.392,
6
+ "eval_steps_per_second": 1.247,
7
+ "total_flos": 5.914482579184435e+18,
8
+ "train_loss": 0.04452576920570385,
9
+ "train_runtime": 5366.5614,
10
+ "train_samples_per_second": 44.339,
11
+ "train_steps_per_second": 0.231
12
+ }
eval_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 5.0,
3
+ "eval_loss": 0.019963504746556282,
4
+ "eval_runtime": 89.035,
5
+ "eval_samples_per_second": 59.392,
6
+ "eval_steps_per_second": 1.247
7
+ }
runs/Apr17_20-11-24_cf8be2e8a73d/events.out.tfevents.1713395353.cf8be2e8a73d.34.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:05a65b76b5905816338c010536c233166e4a388ffe512b4b8428e5906a4cd01c
3
+ size 359
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 5.0,
3
+ "total_flos": 5.914482579184435e+18,
4
+ "train_loss": 0.04452576920570385,
5
+ "train_runtime": 5366.5614,
6
+ "train_samples_per_second": 44.339,
7
+ "train_steps_per_second": 0.231
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,938 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.019963504746556282,
3
+ "best_model_checkpoint": "swin-tiny-patch4-window7-224-Kontur-competition-52K/checkpoint-496",
4
+ "epoch": 5.0,
5
+ "eval_steps": 500,
6
+ "global_step": 1240,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.04,
13
+ "grad_norm": 3.4303202629089355,
14
+ "learning_rate": 4.032258064516129e-06,
15
+ "loss": 0.7196,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.08,
20
+ "grad_norm": 2.8987796306610107,
21
+ "learning_rate": 8.064516129032258e-06,
22
+ "loss": 0.6457,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.12,
27
+ "grad_norm": 2.8838603496551514,
28
+ "learning_rate": 1.2096774193548388e-05,
29
+ "loss": 0.5219,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.16,
34
+ "grad_norm": 2.1233789920806885,
35
+ "learning_rate": 1.6129032258064517e-05,
36
+ "loss": 0.4099,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.2,
41
+ "grad_norm": 2.453838348388672,
42
+ "learning_rate": 2.0161290322580645e-05,
43
+ "loss": 0.2775,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.24,
48
+ "grad_norm": 5.15359354019165,
49
+ "learning_rate": 2.4193548387096777e-05,
50
+ "loss": 0.1655,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.28,
55
+ "grad_norm": 5.333171367645264,
56
+ "learning_rate": 2.822580645161291e-05,
57
+ "loss": 0.112,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.32,
62
+ "grad_norm": 3.0966763496398926,
63
+ "learning_rate": 3.2258064516129034e-05,
64
+ "loss": 0.0818,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.36,
69
+ "grad_norm": 13.956466674804688,
70
+ "learning_rate": 3.6290322580645165e-05,
71
+ "loss": 0.0884,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.4,
76
+ "grad_norm": 3.109863758087158,
77
+ "learning_rate": 4.032258064516129e-05,
78
+ "loss": 0.0755,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.44,
83
+ "grad_norm": 2.274998664855957,
84
+ "learning_rate": 4.435483870967742e-05,
85
+ "loss": 0.0562,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 0.48,
90
+ "grad_norm": 5.711202621459961,
91
+ "learning_rate": 4.8387096774193554e-05,
92
+ "loss": 0.0622,
93
+ "step": 120
94
+ },
95
+ {
96
+ "epoch": 0.52,
97
+ "grad_norm": 9.838350296020508,
98
+ "learning_rate": 4.973118279569893e-05,
99
+ "loss": 0.0646,
100
+ "step": 130
101
+ },
102
+ {
103
+ "epoch": 0.56,
104
+ "grad_norm": 8.744378089904785,
105
+ "learning_rate": 4.92831541218638e-05,
106
+ "loss": 0.0631,
107
+ "step": 140
108
+ },
109
+ {
110
+ "epoch": 0.6,
111
+ "grad_norm": 13.287856101989746,
112
+ "learning_rate": 4.8835125448028677e-05,
113
+ "loss": 0.0732,
114
+ "step": 150
115
+ },
116
+ {
117
+ "epoch": 0.65,
118
+ "grad_norm": 4.635748863220215,
119
+ "learning_rate": 4.8387096774193554e-05,
120
+ "loss": 0.049,
121
+ "step": 160
122
+ },
123
+ {
124
+ "epoch": 0.69,
125
+ "grad_norm": 5.990776062011719,
126
+ "learning_rate": 4.7939068100358424e-05,
127
+ "loss": 0.0709,
128
+ "step": 170
129
+ },
130
+ {
131
+ "epoch": 0.73,
132
+ "grad_norm": 6.079687595367432,
133
+ "learning_rate": 4.74910394265233e-05,
134
+ "loss": 0.0492,
135
+ "step": 180
136
+ },
137
+ {
138
+ "epoch": 0.77,
139
+ "grad_norm": 1.6072505712509155,
140
+ "learning_rate": 4.704301075268818e-05,
141
+ "loss": 0.0527,
142
+ "step": 190
143
+ },
144
+ {
145
+ "epoch": 0.81,
146
+ "grad_norm": 1.9706329107284546,
147
+ "learning_rate": 4.659498207885305e-05,
148
+ "loss": 0.0609,
149
+ "step": 200
150
+ },
151
+ {
152
+ "epoch": 0.85,
153
+ "grad_norm": 5.284497261047363,
154
+ "learning_rate": 4.614695340501792e-05,
155
+ "loss": 0.0431,
156
+ "step": 210
157
+ },
158
+ {
159
+ "epoch": 0.89,
160
+ "grad_norm": 3.325084924697876,
161
+ "learning_rate": 4.56989247311828e-05,
162
+ "loss": 0.0321,
163
+ "step": 220
164
+ },
165
+ {
166
+ "epoch": 0.93,
167
+ "grad_norm": 9.077991485595703,
168
+ "learning_rate": 4.5250896057347674e-05,
169
+ "loss": 0.0375,
170
+ "step": 230
171
+ },
172
+ {
173
+ "epoch": 0.97,
174
+ "grad_norm": 4.708994388580322,
175
+ "learning_rate": 4.4802867383512545e-05,
176
+ "loss": 0.0376,
177
+ "step": 240
178
+ },
179
+ {
180
+ "epoch": 1.0,
181
+ "eval_loss": 0.06623569875955582,
182
+ "eval_runtime": 113.2602,
183
+ "eval_samples_per_second": 46.689,
184
+ "eval_steps_per_second": 0.98,
185
+ "step": 248
186
+ },
187
+ {
188
+ "epoch": 1.01,
189
+ "grad_norm": 4.394409656524658,
190
+ "learning_rate": 4.435483870967742e-05,
191
+ "loss": 0.0348,
192
+ "step": 250
193
+ },
194
+ {
195
+ "epoch": 1.05,
196
+ "grad_norm": 20.027359008789062,
197
+ "learning_rate": 4.390681003584229e-05,
198
+ "loss": 0.0685,
199
+ "step": 260
200
+ },
201
+ {
202
+ "epoch": 1.09,
203
+ "grad_norm": 2.6204428672790527,
204
+ "learning_rate": 4.345878136200717e-05,
205
+ "loss": 0.0445,
206
+ "step": 270
207
+ },
208
+ {
209
+ "epoch": 1.13,
210
+ "grad_norm": 6.732935905456543,
211
+ "learning_rate": 4.301075268817205e-05,
212
+ "loss": 0.0357,
213
+ "step": 280
214
+ },
215
+ {
216
+ "epoch": 1.17,
217
+ "grad_norm": 9.537686347961426,
218
+ "learning_rate": 4.256272401433692e-05,
219
+ "loss": 0.0367,
220
+ "step": 290
221
+ },
222
+ {
223
+ "epoch": 1.21,
224
+ "grad_norm": 4.934083938598633,
225
+ "learning_rate": 4.2114695340501795e-05,
226
+ "loss": 0.0249,
227
+ "step": 300
228
+ },
229
+ {
230
+ "epoch": 1.25,
231
+ "grad_norm": 2.7931134700775146,
232
+ "learning_rate": 4.166666666666667e-05,
233
+ "loss": 0.0312,
234
+ "step": 310
235
+ },
236
+ {
237
+ "epoch": 1.29,
238
+ "grad_norm": 1.5536555051803589,
239
+ "learning_rate": 4.121863799283154e-05,
240
+ "loss": 0.0329,
241
+ "step": 320
242
+ },
243
+ {
244
+ "epoch": 1.33,
245
+ "grad_norm": 4.139862060546875,
246
+ "learning_rate": 4.077060931899642e-05,
247
+ "loss": 0.0306,
248
+ "step": 330
249
+ },
250
+ {
251
+ "epoch": 1.37,
252
+ "grad_norm": 2.881141185760498,
253
+ "learning_rate": 4.032258064516129e-05,
254
+ "loss": 0.0272,
255
+ "step": 340
256
+ },
257
+ {
258
+ "epoch": 1.41,
259
+ "grad_norm": 4.688608646392822,
260
+ "learning_rate": 3.987455197132617e-05,
261
+ "loss": 0.0332,
262
+ "step": 350
263
+ },
264
+ {
265
+ "epoch": 1.45,
266
+ "grad_norm": 3.6917848587036133,
267
+ "learning_rate": 3.9426523297491045e-05,
268
+ "loss": 0.0365,
269
+ "step": 360
270
+ },
271
+ {
272
+ "epoch": 1.49,
273
+ "grad_norm": 3.620156764984131,
274
+ "learning_rate": 3.8978494623655915e-05,
275
+ "loss": 0.0232,
276
+ "step": 370
277
+ },
278
+ {
279
+ "epoch": 1.53,
280
+ "grad_norm": 5.679781913757324,
281
+ "learning_rate": 3.8530465949820786e-05,
282
+ "loss": 0.0259,
283
+ "step": 380
284
+ },
285
+ {
286
+ "epoch": 1.57,
287
+ "grad_norm": 3.670245885848999,
288
+ "learning_rate": 3.808243727598566e-05,
289
+ "loss": 0.0296,
290
+ "step": 390
291
+ },
292
+ {
293
+ "epoch": 1.61,
294
+ "grad_norm": 5.259506702423096,
295
+ "learning_rate": 3.763440860215054e-05,
296
+ "loss": 0.0155,
297
+ "step": 400
298
+ },
299
+ {
300
+ "epoch": 1.65,
301
+ "grad_norm": 2.921619176864624,
302
+ "learning_rate": 3.718637992831541e-05,
303
+ "loss": 0.0228,
304
+ "step": 410
305
+ },
306
+ {
307
+ "epoch": 1.69,
308
+ "grad_norm": 1.523906946182251,
309
+ "learning_rate": 3.673835125448029e-05,
310
+ "loss": 0.0197,
311
+ "step": 420
312
+ },
313
+ {
314
+ "epoch": 1.73,
315
+ "grad_norm": 4.975411891937256,
316
+ "learning_rate": 3.6290322580645165e-05,
317
+ "loss": 0.0296,
318
+ "step": 430
319
+ },
320
+ {
321
+ "epoch": 1.77,
322
+ "grad_norm": 3.5619239807128906,
323
+ "learning_rate": 3.5842293906810036e-05,
324
+ "loss": 0.0265,
325
+ "step": 440
326
+ },
327
+ {
328
+ "epoch": 1.81,
329
+ "grad_norm": 2.4044506549835205,
330
+ "learning_rate": 3.539426523297491e-05,
331
+ "loss": 0.0217,
332
+ "step": 450
333
+ },
334
+ {
335
+ "epoch": 1.85,
336
+ "grad_norm": 8.599495887756348,
337
+ "learning_rate": 3.494623655913979e-05,
338
+ "loss": 0.034,
339
+ "step": 460
340
+ },
341
+ {
342
+ "epoch": 1.9,
343
+ "grad_norm": 4.817391395568848,
344
+ "learning_rate": 3.449820788530466e-05,
345
+ "loss": 0.0244,
346
+ "step": 470
347
+ },
348
+ {
349
+ "epoch": 1.94,
350
+ "grad_norm": 0.9919489622116089,
351
+ "learning_rate": 3.405017921146954e-05,
352
+ "loss": 0.0285,
353
+ "step": 480
354
+ },
355
+ {
356
+ "epoch": 1.98,
357
+ "grad_norm": 1.3462491035461426,
358
+ "learning_rate": 3.360215053763441e-05,
359
+ "loss": 0.0213,
360
+ "step": 490
361
+ },
362
+ {
363
+ "epoch": 2.0,
364
+ "eval_loss": 0.019963504746556282,
365
+ "eval_runtime": 83.0975,
366
+ "eval_samples_per_second": 63.636,
367
+ "eval_steps_per_second": 1.336,
368
+ "step": 496
369
+ },
370
+ {
371
+ "epoch": 2.02,
372
+ "grad_norm": 2.6078598499298096,
373
+ "learning_rate": 3.3154121863799286e-05,
374
+ "loss": 0.029,
375
+ "step": 500
376
+ },
377
+ {
378
+ "epoch": 2.06,
379
+ "grad_norm": 0.9603502750396729,
380
+ "learning_rate": 3.270609318996416e-05,
381
+ "loss": 0.0263,
382
+ "step": 510
383
+ },
384
+ {
385
+ "epoch": 2.1,
386
+ "grad_norm": 11.797562599182129,
387
+ "learning_rate": 3.2258064516129034e-05,
388
+ "loss": 0.0256,
389
+ "step": 520
390
+ },
391
+ {
392
+ "epoch": 2.14,
393
+ "grad_norm": 6.010237693786621,
394
+ "learning_rate": 3.1810035842293904e-05,
395
+ "loss": 0.0259,
396
+ "step": 530
397
+ },
398
+ {
399
+ "epoch": 2.18,
400
+ "grad_norm": 11.067808151245117,
401
+ "learning_rate": 3.136200716845878e-05,
402
+ "loss": 0.0244,
403
+ "step": 540
404
+ },
405
+ {
406
+ "epoch": 2.22,
407
+ "grad_norm": 8.755197525024414,
408
+ "learning_rate": 3.091397849462366e-05,
409
+ "loss": 0.0172,
410
+ "step": 550
411
+ },
412
+ {
413
+ "epoch": 2.26,
414
+ "grad_norm": 5.274145603179932,
415
+ "learning_rate": 3.046594982078853e-05,
416
+ "loss": 0.0219,
417
+ "step": 560
418
+ },
419
+ {
420
+ "epoch": 2.3,
421
+ "grad_norm": 3.9025063514709473,
422
+ "learning_rate": 3.0017921146953403e-05,
423
+ "loss": 0.0121,
424
+ "step": 570
425
+ },
426
+ {
427
+ "epoch": 2.34,
428
+ "grad_norm": 0.16941958665847778,
429
+ "learning_rate": 2.9569892473118284e-05,
430
+ "loss": 0.0151,
431
+ "step": 580
432
+ },
433
+ {
434
+ "epoch": 2.38,
435
+ "grad_norm": 5.620195388793945,
436
+ "learning_rate": 2.9121863799283154e-05,
437
+ "loss": 0.02,
438
+ "step": 590
439
+ },
440
+ {
441
+ "epoch": 2.42,
442
+ "grad_norm": 4.685577869415283,
443
+ "learning_rate": 2.8673835125448028e-05,
444
+ "loss": 0.0212,
445
+ "step": 600
446
+ },
447
+ {
448
+ "epoch": 2.46,
449
+ "grad_norm": 0.9184539914131165,
450
+ "learning_rate": 2.822580645161291e-05,
451
+ "loss": 0.0243,
452
+ "step": 610
453
+ },
454
+ {
455
+ "epoch": 2.5,
456
+ "grad_norm": 4.498030662536621,
457
+ "learning_rate": 2.777777777777778e-05,
458
+ "loss": 0.0199,
459
+ "step": 620
460
+ },
461
+ {
462
+ "epoch": 2.54,
463
+ "grad_norm": 0.5963709354400635,
464
+ "learning_rate": 2.7329749103942653e-05,
465
+ "loss": 0.0189,
466
+ "step": 630
467
+ },
468
+ {
469
+ "epoch": 2.58,
470
+ "grad_norm": 0.7427639961242676,
471
+ "learning_rate": 2.6881720430107527e-05,
472
+ "loss": 0.01,
473
+ "step": 640
474
+ },
475
+ {
476
+ "epoch": 2.62,
477
+ "grad_norm": 1.3474091291427612,
478
+ "learning_rate": 2.6433691756272404e-05,
479
+ "loss": 0.0121,
480
+ "step": 650
481
+ },
482
+ {
483
+ "epoch": 2.66,
484
+ "grad_norm": 2.8091518878936768,
485
+ "learning_rate": 2.5985663082437278e-05,
486
+ "loss": 0.0252,
487
+ "step": 660
488
+ },
489
+ {
490
+ "epoch": 2.7,
491
+ "grad_norm": 1.0539556741714478,
492
+ "learning_rate": 2.5537634408602152e-05,
493
+ "loss": 0.0192,
494
+ "step": 670
495
+ },
496
+ {
497
+ "epoch": 2.74,
498
+ "grad_norm": 0.38558557629585266,
499
+ "learning_rate": 2.5089605734767026e-05,
500
+ "loss": 0.0115,
501
+ "step": 680
502
+ },
503
+ {
504
+ "epoch": 2.78,
505
+ "grad_norm": 2.709695339202881,
506
+ "learning_rate": 2.46415770609319e-05,
507
+ "loss": 0.0178,
508
+ "step": 690
509
+ },
510
+ {
511
+ "epoch": 2.82,
512
+ "grad_norm": 4.6164751052856445,
513
+ "learning_rate": 2.4193548387096777e-05,
514
+ "loss": 0.0118,
515
+ "step": 700
516
+ },
517
+ {
518
+ "epoch": 2.86,
519
+ "grad_norm": 2.706479549407959,
520
+ "learning_rate": 2.374551971326165e-05,
521
+ "loss": 0.0165,
522
+ "step": 710
523
+ },
524
+ {
525
+ "epoch": 2.9,
526
+ "grad_norm": 2.981581211090088,
527
+ "learning_rate": 2.3297491039426525e-05,
528
+ "loss": 0.0206,
529
+ "step": 720
530
+ },
531
+ {
532
+ "epoch": 2.94,
533
+ "grad_norm": 1.4225656986236572,
534
+ "learning_rate": 2.28494623655914e-05,
535
+ "loss": 0.0092,
536
+ "step": 730
537
+ },
538
+ {
539
+ "epoch": 2.98,
540
+ "grad_norm": 1.1643812656402588,
541
+ "learning_rate": 2.2401433691756272e-05,
542
+ "loss": 0.0094,
543
+ "step": 740
544
+ },
545
+ {
546
+ "epoch": 3.0,
547
+ "eval_loss": 0.13013091683387756,
548
+ "eval_runtime": 83.9945,
549
+ "eval_samples_per_second": 62.957,
550
+ "eval_steps_per_second": 1.322,
551
+ "step": 744
552
+ },
553
+ {
554
+ "epoch": 3.02,
555
+ "grad_norm": 2.4861252307891846,
556
+ "learning_rate": 2.1953405017921146e-05,
557
+ "loss": 0.0198,
558
+ "step": 750
559
+ },
560
+ {
561
+ "epoch": 3.06,
562
+ "grad_norm": 2.1413233280181885,
563
+ "learning_rate": 2.1505376344086024e-05,
564
+ "loss": 0.0142,
565
+ "step": 760
566
+ },
567
+ {
568
+ "epoch": 3.1,
569
+ "grad_norm": 2.2505719661712646,
570
+ "learning_rate": 2.1057347670250897e-05,
571
+ "loss": 0.0151,
572
+ "step": 770
573
+ },
574
+ {
575
+ "epoch": 3.15,
576
+ "grad_norm": 2.7683489322662354,
577
+ "learning_rate": 2.060931899641577e-05,
578
+ "loss": 0.0202,
579
+ "step": 780
580
+ },
581
+ {
582
+ "epoch": 3.19,
583
+ "grad_norm": 1.9615308046340942,
584
+ "learning_rate": 2.0161290322580645e-05,
585
+ "loss": 0.0076,
586
+ "step": 790
587
+ },
588
+ {
589
+ "epoch": 3.23,
590
+ "grad_norm": 2.115872621536255,
591
+ "learning_rate": 1.9713261648745522e-05,
592
+ "loss": 0.0133,
593
+ "step": 800
594
+ },
595
+ {
596
+ "epoch": 3.27,
597
+ "grad_norm": 0.3355218768119812,
598
+ "learning_rate": 1.9265232974910393e-05,
599
+ "loss": 0.0149,
600
+ "step": 810
601
+ },
602
+ {
603
+ "epoch": 3.31,
604
+ "grad_norm": 1.101788878440857,
605
+ "learning_rate": 1.881720430107527e-05,
606
+ "loss": 0.0168,
607
+ "step": 820
608
+ },
609
+ {
610
+ "epoch": 3.35,
611
+ "grad_norm": 4.441117763519287,
612
+ "learning_rate": 1.8369175627240144e-05,
613
+ "loss": 0.0101,
614
+ "step": 830
615
+ },
616
+ {
617
+ "epoch": 3.39,
618
+ "grad_norm": 0.4963151216506958,
619
+ "learning_rate": 1.7921146953405018e-05,
620
+ "loss": 0.0079,
621
+ "step": 840
622
+ },
623
+ {
624
+ "epoch": 3.43,
625
+ "grad_norm": 0.14219801127910614,
626
+ "learning_rate": 1.7473118279569895e-05,
627
+ "loss": 0.0117,
628
+ "step": 850
629
+ },
630
+ {
631
+ "epoch": 3.47,
632
+ "grad_norm": 0.04577568918466568,
633
+ "learning_rate": 1.702508960573477e-05,
634
+ "loss": 0.0065,
635
+ "step": 860
636
+ },
637
+ {
638
+ "epoch": 3.51,
639
+ "grad_norm": 3.448432207107544,
640
+ "learning_rate": 1.6577060931899643e-05,
641
+ "loss": 0.0141,
642
+ "step": 870
643
+ },
644
+ {
645
+ "epoch": 3.55,
646
+ "grad_norm": 0.43813198804855347,
647
+ "learning_rate": 1.6129032258064517e-05,
648
+ "loss": 0.0121,
649
+ "step": 880
650
+ },
651
+ {
652
+ "epoch": 3.59,
653
+ "grad_norm": 1.8220750093460083,
654
+ "learning_rate": 1.568100358422939e-05,
655
+ "loss": 0.0057,
656
+ "step": 890
657
+ },
658
+ {
659
+ "epoch": 3.63,
660
+ "grad_norm": 4.196650505065918,
661
+ "learning_rate": 1.5232974910394265e-05,
662
+ "loss": 0.0082,
663
+ "step": 900
664
+ },
665
+ {
666
+ "epoch": 3.67,
667
+ "grad_norm": 1.0396842956542969,
668
+ "learning_rate": 1.4784946236559142e-05,
669
+ "loss": 0.0113,
670
+ "step": 910
671
+ },
672
+ {
673
+ "epoch": 3.71,
674
+ "grad_norm": 3.6332926750183105,
675
+ "learning_rate": 1.4336917562724014e-05,
676
+ "loss": 0.0056,
677
+ "step": 920
678
+ },
679
+ {
680
+ "epoch": 3.75,
681
+ "grad_norm": 6.7421369552612305,
682
+ "learning_rate": 1.388888888888889e-05,
683
+ "loss": 0.0085,
684
+ "step": 930
685
+ },
686
+ {
687
+ "epoch": 3.79,
688
+ "grad_norm": 1.0675561428070068,
689
+ "learning_rate": 1.3440860215053763e-05,
690
+ "loss": 0.0061,
691
+ "step": 940
692
+ },
693
+ {
694
+ "epoch": 3.83,
695
+ "grad_norm": 3.8124799728393555,
696
+ "learning_rate": 1.2992831541218639e-05,
697
+ "loss": 0.0087,
698
+ "step": 950
699
+ },
700
+ {
701
+ "epoch": 3.87,
702
+ "grad_norm": 4.4822540283203125,
703
+ "learning_rate": 1.2544802867383513e-05,
704
+ "loss": 0.0048,
705
+ "step": 960
706
+ },
707
+ {
708
+ "epoch": 3.91,
709
+ "grad_norm": 1.812424898147583,
710
+ "learning_rate": 1.2096774193548388e-05,
711
+ "loss": 0.0074,
712
+ "step": 970
713
+ },
714
+ {
715
+ "epoch": 3.95,
716
+ "grad_norm": 6.007579803466797,
717
+ "learning_rate": 1.1648745519713262e-05,
718
+ "loss": 0.0116,
719
+ "step": 980
720
+ },
721
+ {
722
+ "epoch": 3.99,
723
+ "grad_norm": 0.2030833214521408,
724
+ "learning_rate": 1.1200716845878136e-05,
725
+ "loss": 0.0103,
726
+ "step": 990
727
+ },
728
+ {
729
+ "epoch": 4.0,
730
+ "eval_loss": 0.060427818447351456,
731
+ "eval_runtime": 84.6044,
732
+ "eval_samples_per_second": 62.503,
733
+ "eval_steps_per_second": 1.312,
734
+ "step": 992
735
+ },
736
+ {
737
+ "epoch": 4.03,
738
+ "grad_norm": 0.05641289800405502,
739
+ "learning_rate": 1.0752688172043012e-05,
740
+ "loss": 0.0085,
741
+ "step": 1000
742
+ },
743
+ {
744
+ "epoch": 4.07,
745
+ "grad_norm": 0.805799126625061,
746
+ "learning_rate": 1.0304659498207886e-05,
747
+ "loss": 0.0077,
748
+ "step": 1010
749
+ },
750
+ {
751
+ "epoch": 4.11,
752
+ "grad_norm": 4.972249984741211,
753
+ "learning_rate": 9.856630824372761e-06,
754
+ "loss": 0.0113,
755
+ "step": 1020
756
+ },
757
+ {
758
+ "epoch": 4.15,
759
+ "grad_norm": 0.03549932688474655,
760
+ "learning_rate": 9.408602150537635e-06,
761
+ "loss": 0.0052,
762
+ "step": 1030
763
+ },
764
+ {
765
+ "epoch": 4.19,
766
+ "grad_norm": 1.279232144355774,
767
+ "learning_rate": 8.960573476702509e-06,
768
+ "loss": 0.0077,
769
+ "step": 1040
770
+ },
771
+ {
772
+ "epoch": 4.23,
773
+ "grad_norm": 0.22537721693515778,
774
+ "learning_rate": 8.512544802867385e-06,
775
+ "loss": 0.0082,
776
+ "step": 1050
777
+ },
778
+ {
779
+ "epoch": 4.27,
780
+ "grad_norm": 1.7011988162994385,
781
+ "learning_rate": 8.064516129032258e-06,
782
+ "loss": 0.0036,
783
+ "step": 1060
784
+ },
785
+ {
786
+ "epoch": 4.31,
787
+ "grad_norm": 4.37709379196167,
788
+ "learning_rate": 7.616487455197132e-06,
789
+ "loss": 0.0041,
790
+ "step": 1070
791
+ },
792
+ {
793
+ "epoch": 4.35,
794
+ "grad_norm": 2.2503268718719482,
795
+ "learning_rate": 7.168458781362007e-06,
796
+ "loss": 0.0152,
797
+ "step": 1080
798
+ },
799
+ {
800
+ "epoch": 4.4,
801
+ "grad_norm": 3.610405683517456,
802
+ "learning_rate": 6.720430107526882e-06,
803
+ "loss": 0.0107,
804
+ "step": 1090
805
+ },
806
+ {
807
+ "epoch": 4.44,
808
+ "grad_norm": 0.22053413093090057,
809
+ "learning_rate": 6.2724014336917564e-06,
810
+ "loss": 0.0021,
811
+ "step": 1100
812
+ },
813
+ {
814
+ "epoch": 4.48,
815
+ "grad_norm": 8.507258415222168,
816
+ "learning_rate": 5.824372759856631e-06,
817
+ "loss": 0.0141,
818
+ "step": 1110
819
+ },
820
+ {
821
+ "epoch": 4.52,
822
+ "grad_norm": 5.034768581390381,
823
+ "learning_rate": 5.376344086021506e-06,
824
+ "loss": 0.0142,
825
+ "step": 1120
826
+ },
827
+ {
828
+ "epoch": 4.56,
829
+ "grad_norm": 0.22189994156360626,
830
+ "learning_rate": 4.928315412186381e-06,
831
+ "loss": 0.0094,
832
+ "step": 1130
833
+ },
834
+ {
835
+ "epoch": 4.6,
836
+ "grad_norm": 2.6591525077819824,
837
+ "learning_rate": 4.4802867383512545e-06,
838
+ "loss": 0.0039,
839
+ "step": 1140
840
+ },
841
+ {
842
+ "epoch": 4.64,
843
+ "grad_norm": 4.526949405670166,
844
+ "learning_rate": 4.032258064516129e-06,
845
+ "loss": 0.0032,
846
+ "step": 1150
847
+ },
848
+ {
849
+ "epoch": 4.68,
850
+ "grad_norm": 3.290435791015625,
851
+ "learning_rate": 3.5842293906810035e-06,
852
+ "loss": 0.0031,
853
+ "step": 1160
854
+ },
855
+ {
856
+ "epoch": 4.72,
857
+ "grad_norm": 0.6715773344039917,
858
+ "learning_rate": 3.1362007168458782e-06,
859
+ "loss": 0.0028,
860
+ "step": 1170
861
+ },
862
+ {
863
+ "epoch": 4.76,
864
+ "grad_norm": 0.7959145903587341,
865
+ "learning_rate": 2.688172043010753e-06,
866
+ "loss": 0.0063,
867
+ "step": 1180
868
+ },
869
+ {
870
+ "epoch": 4.8,
871
+ "grad_norm": 0.10818018019199371,
872
+ "learning_rate": 2.2401433691756272e-06,
873
+ "loss": 0.0045,
874
+ "step": 1190
875
+ },
876
+ {
877
+ "epoch": 4.84,
878
+ "grad_norm": 0.03141075000166893,
879
+ "learning_rate": 1.7921146953405017e-06,
880
+ "loss": 0.0068,
881
+ "step": 1200
882
+ },
883
+ {
884
+ "epoch": 4.88,
885
+ "grad_norm": 5.118274688720703,
886
+ "learning_rate": 1.3440860215053765e-06,
887
+ "loss": 0.0071,
888
+ "step": 1210
889
+ },
890
+ {
891
+ "epoch": 4.92,
892
+ "grad_norm": 1.3246444463729858,
893
+ "learning_rate": 8.960573476702509e-07,
894
+ "loss": 0.0026,
895
+ "step": 1220
896
+ },
897
+ {
898
+ "epoch": 4.96,
899
+ "grad_norm": 2.687753915786743,
900
+ "learning_rate": 4.4802867383512544e-07,
901
+ "loss": 0.0046,
902
+ "step": 1230
903
+ },
904
+ {
905
+ "epoch": 5.0,
906
+ "grad_norm": 0.166173055768013,
907
+ "learning_rate": 0.0,
908
+ "loss": 0.0073,
909
+ "step": 1240
910
+ },
911
+ {
912
+ "epoch": 5.0,
913
+ "eval_loss": 0.07113554328680038,
914
+ "eval_runtime": 85.5998,
915
+ "eval_samples_per_second": 61.776,
916
+ "eval_steps_per_second": 1.297,
917
+ "step": 1240
918
+ },
919
+ {
920
+ "epoch": 5.0,
921
+ "step": 1240,
922
+ "total_flos": 5.914482579184435e+18,
923
+ "train_loss": 0.04452576920570385,
924
+ "train_runtime": 5366.5614,
925
+ "train_samples_per_second": 44.339,
926
+ "train_steps_per_second": 0.231
927
+ }
928
+ ],
929
+ "logging_steps": 10,
930
+ "max_steps": 1240,
931
+ "num_input_tokens_seen": 0,
932
+ "num_train_epochs": 5,
933
+ "save_steps": 500,
934
+ "total_flos": 5.914482579184435e+18,
935
+ "train_batch_size": 48,
936
+ "trial_name": null,
937
+ "trial_params": null
938
+ }