rshrott committed on
Commit
fbc1625
1 Parent(s): 0a897bb

🍻 cheers

Browse files
README.md CHANGED
@@ -2,6 +2,7 @@
2
  license: apache-2.0
3
  base_model: google/vit-base-patch16-224-in21k
4
  tags:
 
5
  - generated_from_trainer
6
  datasets:
7
  - renovation
@@ -14,7 +15,7 @@ model-index:
14
  name: Image Classification
15
  type: image-classification
16
  dataset:
17
- name: renovation
18
  type: renovation
19
  config: default
20
  split: validation
@@ -22,7 +23,7 @@ model-index:
22
  metrics:
23
  - name: Accuracy
24
  type: accuracy
25
- value: 0.6950596252129472
26
  ---
27
 
28
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -30,10 +31,10 @@ should probably proofread and complete it, then remove this comment. -->
30
 
31
  # vit-base-beans-demo-v5
32
 
33
- This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the renovation dataset.
34
  It achieves the following results on the evaluation set:
35
- - Loss: 1.2470
36
- - Accuracy: 0.6951
37
 
38
  ## Model description
39
 
 
2
  license: apache-2.0
3
  base_model: google/vit-base-patch16-224-in21k
4
  tags:
5
+ - image-classification
6
  - generated_from_trainer
7
  datasets:
8
  - renovation
 
15
  name: Image Classification
16
  type: image-classification
17
  dataset:
18
+ name: beans
19
  type: renovation
20
  config: default
21
  split: validation
 
23
  metrics:
24
  - name: Accuracy
25
  type: accuracy
26
+ value: 0.6695059625212947
27
  ---
28
 
29
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
31
 
32
  # vit-base-beans-demo-v5
33
 
34
+ This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the beans dataset.
35
  It achieves the following results on the evaluation set:
36
+ - Loss: 0.8460
37
+ - Accuracy: 0.6695
38
 
39
  ## Model description
40
 
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 4.0,
3
- "eval_accuracy": 0.634703196347032,
4
- "eval_loss": 0.929534375667572,
5
- "eval_runtime": 8.1809,
6
- "eval_samples_per_second": 26.77,
7
- "eval_steps_per_second": 3.423,
8
- "total_flos": 6.10974224738132e+17,
9
- "train_loss": 0.25425288126233125,
10
- "train_runtime": 387.3536,
11
- "train_samples_per_second": 20.353,
12
- "train_steps_per_second": 1.28
13
  }
 
1
  {
2
  "epoch": 4.0,
3
+ "eval_accuracy": 0.6695059625212947,
4
+ "eval_loss": 0.8459659218788147,
5
+ "eval_runtime": 36.9315,
6
+ "eval_samples_per_second": 31.789,
7
+ "eval_steps_per_second": 3.98,
8
+ "total_flos": 2.910419581971751e+18,
9
+ "train_loss": 0.4888155373286145,
10
+ "train_runtime": 2894.9609,
11
+ "train_samples_per_second": 12.973,
12
+ "train_steps_per_second": 0.811
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 4.0,
3
- "eval_accuracy": 0.634703196347032,
4
- "eval_loss": 0.929534375667572,
5
- "eval_runtime": 8.1809,
6
- "eval_samples_per_second": 26.77,
7
- "eval_steps_per_second": 3.423
8
  }
 
1
  {
2
  "epoch": 4.0,
3
+ "eval_accuracy": 0.6695059625212947,
4
+ "eval_loss": 0.8459659218788147,
5
+ "eval_runtime": 36.9315,
6
+ "eval_samples_per_second": 31.789,
7
+ "eval_steps_per_second": 3.98
8
  }
runs/Mar23_12-27-13_65433f580760/events.out.tfevents.1711199781.65433f580760.3628.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:302b161eab291b644cbc90f82eeed5595548682cc98dcc111575d766a1cc0332
3
+ size 411
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 4.0,
3
- "total_flos": 6.10974224738132e+17,
4
- "train_loss": 0.25425288126233125,
5
- "train_runtime": 387.3536,
6
- "train_samples_per_second": 20.353,
7
- "train_steps_per_second": 1.28
8
  }
 
1
  {
2
  "epoch": 4.0,
3
+ "total_flos": 2.910419581971751e+18,
4
+ "train_loss": 0.4888155373286145,
5
+ "train_runtime": 2894.9609,
6
+ "train_samples_per_second": 12.973,
7
+ "train_steps_per_second": 0.811
8
  }
trainer_state.json CHANGED
@@ -1,408 +1,1874 @@
1
  {
2
- "best_metric": 0.929534375667572,
3
- "best_model_checkpoint": "./vit-base-beans-demo-v5/checkpoint-100",
4
  "epoch": 4.0,
5
  "eval_steps": 100,
6
- "global_step": 496,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.08,
13
- "grad_norm": 1.9071108102798462,
14
- "learning_rate": 0.00019596774193548388,
15
- "loss": 0.7806,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.16,
20
- "grad_norm": 2.2380499839782715,
21
- "learning_rate": 0.00019193548387096775,
22
- "loss": 0.7214,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.24,
27
- "grad_norm": 1.4890930652618408,
28
- "learning_rate": 0.00018790322580645164,
29
- "loss": 0.6215,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.32,
34
- "grad_norm": 3.2323720455169678,
35
- "learning_rate": 0.00018387096774193548,
36
- "loss": 0.6378,
37
  "step": 40
38
  },
39
  {
40
- "epoch": 0.4,
41
- "grad_norm": 2.838930606842041,
42
- "learning_rate": 0.00017983870967741935,
43
- "loss": 0.7502,
44
  "step": 50
45
  },
46
  {
47
- "epoch": 0.48,
48
- "grad_norm": 3.2034356594085693,
49
- "learning_rate": 0.00017580645161290325,
50
- "loss": 0.5904,
51
  "step": 60
52
  },
53
  {
54
- "epoch": 0.56,
55
- "grad_norm": 3.1891825199127197,
56
- "learning_rate": 0.00017177419354838711,
57
- "loss": 0.5718,
58
  "step": 70
59
  },
60
  {
61
- "epoch": 0.65,
62
- "grad_norm": 2.0921356678009033,
63
- "learning_rate": 0.00016774193548387098,
64
- "loss": 0.3783,
65
  "step": 80
66
  },
67
  {
68
- "epoch": 0.73,
69
- "grad_norm": 2.864804983139038,
70
- "learning_rate": 0.00016370967741935485,
71
- "loss": 0.6002,
72
  "step": 90
73
  },
74
  {
75
- "epoch": 0.81,
76
- "grad_norm": 3.1752126216888428,
77
- "learning_rate": 0.00015967741935483872,
78
- "loss": 0.6438,
79
  "step": 100
80
  },
81
  {
82
- "epoch": 0.81,
83
- "eval_accuracy": 0.634703196347032,
84
- "eval_loss": 0.929534375667572,
85
- "eval_runtime": 7.2962,
86
- "eval_samples_per_second": 30.016,
87
- "eval_steps_per_second": 3.838,
88
  "step": 100
89
  },
90
  {
91
- "epoch": 0.89,
92
- "grad_norm": 2.728193521499634,
93
- "learning_rate": 0.0001556451612903226,
94
- "loss": 0.5441,
95
  "step": 110
96
  },
97
  {
98
- "epoch": 0.97,
99
- "grad_norm": 2.140393018722534,
100
- "learning_rate": 0.00015161290322580646,
101
- "loss": 0.4403,
102
  "step": 120
103
  },
104
  {
105
- "epoch": 1.05,
106
- "grad_norm": 0.6765386462211609,
107
- "learning_rate": 0.00014758064516129032,
108
- "loss": 0.3251,
109
  "step": 130
110
  },
111
  {
112
- "epoch": 1.13,
113
- "grad_norm": 0.9497590661048889,
114
- "learning_rate": 0.00014354838709677422,
115
- "loss": 0.2046,
116
  "step": 140
117
  },
118
  {
119
- "epoch": 1.21,
120
- "grad_norm": 4.010074615478516,
121
- "learning_rate": 0.00013991935483870967,
122
- "loss": 0.3276,
123
  "step": 150
124
  },
125
  {
126
- "epoch": 1.29,
127
- "grad_norm": 3.7631189823150635,
128
- "learning_rate": 0.00013588709677419357,
129
- "loss": 0.2937,
130
  "step": 160
131
  },
132
  {
133
- "epoch": 1.37,
134
- "grad_norm": 0.5803029537200928,
135
- "learning_rate": 0.00013185483870967743,
136
- "loss": 0.1906,
137
  "step": 170
138
  },
139
  {
140
- "epoch": 1.45,
141
- "grad_norm": 5.088043212890625,
142
- "learning_rate": 0.0001278225806451613,
143
- "loss": 0.2207,
144
  "step": 180
145
  },
146
  {
147
- "epoch": 1.53,
148
- "grad_norm": 2.3816022872924805,
149
- "learning_rate": 0.00012379032258064514,
150
- "loss": 0.1919,
151
  "step": 190
152
  },
153
  {
154
- "epoch": 1.61,
155
- "grad_norm": 5.558553218841553,
156
- "learning_rate": 0.00011975806451612903,
157
- "loss": 0.3105,
158
  "step": 200
159
  },
160
  {
161
- "epoch": 1.61,
162
- "eval_accuracy": 0.6575342465753424,
163
- "eval_loss": 0.9350173473358154,
164
- "eval_runtime": 7.7793,
165
- "eval_samples_per_second": 28.152,
166
- "eval_steps_per_second": 3.599,
167
  "step": 200
168
  },
169
  {
170
- "epoch": 1.69,
171
- "grad_norm": 3.439823865890503,
172
- "learning_rate": 0.00011572580645161291,
173
- "loss": 0.3714,
174
  "step": 210
175
  },
176
  {
177
- "epoch": 1.77,
178
- "grad_norm": 2.6023850440979004,
179
- "learning_rate": 0.00011169354838709678,
180
- "loss": 0.2869,
181
  "step": 220
182
  },
183
  {
184
- "epoch": 1.85,
185
- "grad_norm": 3.2238519191741943,
186
- "learning_rate": 0.00010766129032258066,
187
- "loss": 0.4462,
188
  "step": 230
189
  },
190
  {
191
- "epoch": 1.94,
192
- "grad_norm": 1.0531260967254639,
193
- "learning_rate": 0.00010362903225806453,
194
- "loss": 0.3634,
195
  "step": 240
196
  },
197
  {
198
- "epoch": 2.02,
199
- "grad_norm": 0.5729889869689941,
200
- "learning_rate": 9.95967741935484e-05,
201
- "loss": 0.2624,
202
  "step": 250
203
  },
204
  {
205
- "epoch": 2.1,
206
- "grad_norm": 0.1924820989370346,
207
- "learning_rate": 9.556451612903226e-05,
208
- "loss": 0.0999,
209
  "step": 260
210
  },
211
  {
212
- "epoch": 2.18,
213
- "grad_norm": 0.39775505661964417,
214
- "learning_rate": 9.153225806451613e-05,
215
- "loss": 0.0938,
216
  "step": 270
217
  },
218
  {
219
- "epoch": 2.26,
220
- "grad_norm": 0.22179947793483734,
221
- "learning_rate": 8.75e-05,
222
- "loss": 0.1017,
223
  "step": 280
224
  },
225
  {
226
- "epoch": 2.34,
227
- "grad_norm": 1.6249357461929321,
228
- "learning_rate": 8.346774193548388e-05,
229
- "loss": 0.1745,
230
  "step": 290
231
  },
232
  {
233
- "epoch": 2.42,
234
- "grad_norm": 0.34801536798477173,
235
- "learning_rate": 7.943548387096774e-05,
236
- "loss": 0.0634,
237
  "step": 300
238
  },
239
  {
240
- "epoch": 2.42,
241
- "eval_accuracy": 0.6894977168949772,
242
- "eval_loss": 1.0781886577606201,
243
- "eval_runtime": 7.6715,
244
- "eval_samples_per_second": 28.547,
245
- "eval_steps_per_second": 3.65,
246
  "step": 300
247
  },
248
  {
249
- "epoch": 2.5,
250
- "grad_norm": 2.6541597843170166,
251
- "learning_rate": 7.540322580645162e-05,
252
- "loss": 0.0772,
253
  "step": 310
254
  },
255
  {
256
- "epoch": 2.58,
257
- "grad_norm": 0.1635380983352661,
258
- "learning_rate": 7.137096774193549e-05,
259
- "loss": 0.1042,
260
  "step": 320
261
  },
262
  {
263
- "epoch": 2.66,
264
- "grad_norm": 1.126976490020752,
265
- "learning_rate": 6.733870967741935e-05,
266
- "loss": 0.1643,
267
  "step": 330
268
  },
269
  {
270
- "epoch": 2.74,
271
- "grad_norm": 0.2140628844499588,
272
- "learning_rate": 6.330645161290322e-05,
273
- "loss": 0.0479,
274
  "step": 340
275
  },
276
  {
277
- "epoch": 2.82,
278
- "grad_norm": 0.14856065809726715,
279
- "learning_rate": 5.9274193548387104e-05,
280
- "loss": 0.0606,
281
  "step": 350
282
  },
283
  {
284
- "epoch": 2.9,
285
- "grad_norm": 1.9021470546722412,
286
- "learning_rate": 5.5241935483870966e-05,
287
- "loss": 0.0576,
288
  "step": 360
289
  },
290
  {
291
- "epoch": 2.98,
292
- "grad_norm": 0.488421767950058,
293
- "learning_rate": 5.120967741935484e-05,
294
- "loss": 0.1573,
295
  "step": 370
296
  },
297
  {
298
- "epoch": 3.06,
299
- "grad_norm": 0.27475953102111816,
300
- "learning_rate": 4.7177419354838716e-05,
301
- "loss": 0.0264,
302
  "step": 380
303
  },
304
  {
305
- "epoch": 3.15,
306
- "grad_norm": 0.08814023435115814,
307
- "learning_rate": 4.3145161290322584e-05,
308
- "loss": 0.0197,
309
  "step": 390
310
  },
311
  {
312
- "epoch": 3.23,
313
- "grad_norm": 0.10707065463066101,
314
- "learning_rate": 3.911290322580645e-05,
315
- "loss": 0.0257,
316
  "step": 400
317
  },
318
  {
319
- "epoch": 3.23,
320
- "eval_accuracy": 0.6986301369863014,
321
- "eval_loss": 1.06435227394104,
322
- "eval_runtime": 7.0971,
323
- "eval_samples_per_second": 30.858,
324
- "eval_steps_per_second": 3.945,
325
  "step": 400
326
  },
327
  {
328
- "epoch": 3.31,
329
- "grad_norm": 0.06996390968561172,
330
- "learning_rate": 3.508064516129033e-05,
331
- "loss": 0.0192,
332
  "step": 410
333
  },
334
  {
335
- "epoch": 3.39,
336
- "grad_norm": 1.358115315437317,
337
- "learning_rate": 3.1048387096774195e-05,
338
- "loss": 0.0431,
339
  "step": 420
340
  },
341
  {
342
- "epoch": 3.47,
343
- "grad_norm": 0.4962191581726074,
344
- "learning_rate": 2.7016129032258064e-05,
345
- "loss": 0.0573,
346
  "step": 430
347
  },
348
  {
349
- "epoch": 3.55,
350
- "grad_norm": 0.08283121138811111,
351
- "learning_rate": 2.2983870967741935e-05,
352
- "loss": 0.0216,
353
  "step": 440
354
  },
355
  {
356
- "epoch": 3.63,
357
- "grad_norm": 0.06285007297992706,
358
- "learning_rate": 1.8951612903225807e-05,
359
- "loss": 0.0169,
360
  "step": 450
361
  },
362
  {
363
- "epoch": 3.71,
364
- "grad_norm": 0.10198648273944855,
365
- "learning_rate": 1.4919354838709679e-05,
366
- "loss": 0.0188,
367
  "step": 460
368
  },
369
  {
370
- "epoch": 3.79,
371
- "grad_norm": 1.5539321899414062,
372
- "learning_rate": 1.0887096774193549e-05,
373
- "loss": 0.0227,
374
  "step": 470
375
  },
376
  {
377
- "epoch": 3.87,
378
- "grad_norm": 0.06271003931760788,
379
- "learning_rate": 6.854838709677419e-06,
380
- "loss": 0.0212,
381
  "step": 480
382
  },
383
  {
384
- "epoch": 3.95,
385
- "grad_norm": 0.1244824230670929,
386
- "learning_rate": 2.82258064516129e-06,
387
- "loss": 0.0183,
388
  "step": 490
389
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
390
  {
391
  "epoch": 4.0,
392
- "step": 496,
393
- "total_flos": 6.10974224738132e+17,
394
- "train_loss": 0.25425288126233125,
395
- "train_runtime": 387.3536,
396
- "train_samples_per_second": 20.353,
397
- "train_steps_per_second": 1.28
398
  }
399
  ],
400
  "logging_steps": 10,
401
- "max_steps": 496,
402
  "num_input_tokens_seen": 0,
403
  "num_train_epochs": 4,
404
  "save_steps": 100,
405
- "total_flos": 6.10974224738132e+17,
406
  "train_batch_size": 16,
407
  "trial_name": null,
408
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.8459659218788147,
3
+ "best_model_checkpoint": "./vit-base-beans-demo-v5/checkpoint-900",
4
  "epoch": 4.0,
5
  "eval_steps": 100,
6
+ "global_step": 2348,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.02,
13
+ "grad_norm": 1.6497292518615723,
14
+ "learning_rate": 0.00019914821124361162,
15
+ "loss": 1.6003,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.03,
20
+ "grad_norm": 1.307145357131958,
21
+ "learning_rate": 0.00019829642248722317,
22
+ "loss": 1.2767,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.05,
27
+ "grad_norm": 2.9354941844940186,
28
+ "learning_rate": 0.00019744463373083478,
29
+ "loss": 1.2612,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.07,
34
+ "grad_norm": 1.3261815309524536,
35
+ "learning_rate": 0.00019659284497444633,
36
+ "loss": 1.2354,
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 0.09,
41
+ "grad_norm": 1.5586915016174316,
42
+ "learning_rate": 0.00019574105621805794,
43
+ "loss": 1.0959,
44
  "step": 50
45
  },
46
  {
47
+ "epoch": 0.1,
48
+ "grad_norm": 1.490173578262329,
49
+ "learning_rate": 0.00019488926746166952,
50
+ "loss": 1.0438,
51
  "step": 60
52
  },
53
  {
54
+ "epoch": 0.12,
55
+ "grad_norm": 2.0831446647644043,
56
+ "learning_rate": 0.0001940374787052811,
57
+ "loss": 1.0841,
58
  "step": 70
59
  },
60
  {
61
+ "epoch": 0.14,
62
+ "grad_norm": 2.6207799911499023,
63
+ "learning_rate": 0.00019318568994889268,
64
+ "loss": 1.0983,
65
  "step": 80
66
  },
67
  {
68
+ "epoch": 0.15,
69
+ "grad_norm": 1.7383110523223877,
70
+ "learning_rate": 0.00019233390119250426,
71
+ "loss": 1.1775,
72
  "step": 90
73
  },
74
  {
75
+ "epoch": 0.17,
76
+ "grad_norm": 2.1954941749572754,
77
+ "learning_rate": 0.00019148211243611585,
78
+ "loss": 1.0616,
79
  "step": 100
80
  },
81
  {
82
+ "epoch": 0.17,
83
+ "eval_accuracy": 0.5817717206132879,
84
+ "eval_loss": 1.0267014503479004,
85
+ "eval_runtime": 39.3874,
86
+ "eval_samples_per_second": 29.806,
87
+ "eval_steps_per_second": 3.732,
88
  "step": 100
89
  },
90
  {
91
+ "epoch": 0.19,
92
+ "grad_norm": 1.597124695777893,
93
+ "learning_rate": 0.00019063032367972745,
94
+ "loss": 1.007,
95
  "step": 110
96
  },
97
  {
98
+ "epoch": 0.2,
99
+ "grad_norm": 1.289490818977356,
100
+ "learning_rate": 0.000189778534923339,
101
+ "loss": 1.0065,
102
  "step": 120
103
  },
104
  {
105
+ "epoch": 0.22,
106
+ "grad_norm": 1.7088607549667358,
107
+ "learning_rate": 0.00018892674616695061,
108
+ "loss": 1.0204,
109
  "step": 130
110
  },
111
  {
112
+ "epoch": 0.24,
113
+ "grad_norm": 2.730241537094116,
114
+ "learning_rate": 0.00018807495741056217,
115
+ "loss": 0.8969,
116
  "step": 140
117
  },
118
  {
119
+ "epoch": 0.26,
120
+ "grad_norm": 2.9691402912139893,
121
+ "learning_rate": 0.00018722316865417378,
122
+ "loss": 0.953,
123
  "step": 150
124
  },
125
  {
126
+ "epoch": 0.27,
127
+ "grad_norm": 2.2519712448120117,
128
+ "learning_rate": 0.00018637137989778536,
129
+ "loss": 0.9269,
130
  "step": 160
131
  },
132
  {
133
+ "epoch": 0.29,
134
+ "grad_norm": 1.8000602722167969,
135
+ "learning_rate": 0.00018551959114139694,
136
+ "loss": 1.1314,
137
  "step": 170
138
  },
139
  {
140
+ "epoch": 0.31,
141
+ "grad_norm": 1.5348334312438965,
142
+ "learning_rate": 0.00018466780238500855,
143
+ "loss": 0.9615,
144
  "step": 180
145
  },
146
  {
147
+ "epoch": 0.32,
148
+ "grad_norm": 1.599938988685608,
149
+ "learning_rate": 0.0001838160136286201,
150
+ "loss": 0.8033,
151
  "step": 190
152
  },
153
  {
154
+ "epoch": 0.34,
155
+ "grad_norm": 1.50412917137146,
156
+ "learning_rate": 0.0001829642248722317,
157
+ "loss": 0.9594,
158
  "step": 200
159
  },
160
  {
161
+ "epoch": 0.34,
162
+ "eval_accuracy": 0.6073253833049403,
163
+ "eval_loss": 0.9467767477035522,
164
+ "eval_runtime": 38.8829,
165
+ "eval_samples_per_second": 30.193,
166
+ "eval_steps_per_second": 3.781,
167
  "step": 200
168
  },
169
  {
170
+ "epoch": 0.36,
171
+ "grad_norm": 2.1896722316741943,
172
+ "learning_rate": 0.0001821124361158433,
173
+ "loss": 0.9217,
174
  "step": 210
175
  },
176
  {
177
+ "epoch": 0.37,
178
+ "grad_norm": 1.9687891006469727,
179
+ "learning_rate": 0.00018126064735945487,
180
+ "loss": 1.0296,
181
  "step": 220
182
  },
183
  {
184
+ "epoch": 0.39,
185
+ "grad_norm": 1.9628914594650269,
186
+ "learning_rate": 0.00018040885860306645,
187
+ "loss": 0.8122,
188
  "step": 230
189
  },
190
  {
191
+ "epoch": 0.41,
192
+ "grad_norm": 2.598545789718628,
193
+ "learning_rate": 0.00017955706984667803,
194
+ "loss": 0.8393,
195
  "step": 240
196
  },
197
  {
198
+ "epoch": 0.43,
199
+ "grad_norm": 2.2483532428741455,
200
+ "learning_rate": 0.0001787052810902896,
201
+ "loss": 0.9047,
202
  "step": 250
203
  },
204
  {
205
+ "epoch": 0.44,
206
+ "grad_norm": 2.1274337768554688,
207
+ "learning_rate": 0.0001778534923339012,
208
+ "loss": 0.91,
209
  "step": 260
210
  },
211
  {
212
+ "epoch": 0.46,
213
+ "grad_norm": 2.436018466949463,
214
+ "learning_rate": 0.00017700170357751277,
215
+ "loss": 1.0615,
216
  "step": 270
217
  },
218
  {
219
+ "epoch": 0.48,
220
+ "grad_norm": 2.069586992263794,
221
+ "learning_rate": 0.00017614991482112438,
222
+ "loss": 1.0799,
223
  "step": 280
224
  },
225
  {
226
+ "epoch": 0.49,
227
+ "grad_norm": 1.7266385555267334,
228
+ "learning_rate": 0.00017529812606473594,
229
+ "loss": 0.9465,
230
  "step": 290
231
  },
232
  {
233
+ "epoch": 0.51,
234
+ "grad_norm": 2.0491390228271484,
235
+ "learning_rate": 0.00017444633730834754,
236
+ "loss": 1.1785,
237
  "step": 300
238
  },
239
  {
240
+ "epoch": 0.51,
241
+ "eval_accuracy": 0.5868824531516184,
242
+ "eval_loss": 0.997596025466919,
243
+ "eval_runtime": 39.3421,
244
+ "eval_samples_per_second": 29.841,
245
+ "eval_steps_per_second": 3.736,
246
  "step": 300
247
  },
248
  {
249
+ "epoch": 0.53,
250
+ "grad_norm": 1.4697805643081665,
251
+ "learning_rate": 0.00017359454855195912,
252
+ "loss": 1.094,
253
  "step": 310
254
  },
255
  {
256
+ "epoch": 0.55,
257
+ "grad_norm": 2.369339942932129,
258
+ "learning_rate": 0.0001727427597955707,
259
+ "loss": 0.9398,
260
  "step": 320
261
  },
262
  {
263
+ "epoch": 0.56,
264
+ "grad_norm": 2.325148344039917,
265
+ "learning_rate": 0.00017189097103918229,
266
+ "loss": 0.9718,
267
  "step": 330
268
  },
269
  {
270
+ "epoch": 0.58,
271
+ "grad_norm": 1.9404678344726562,
272
+ "learning_rate": 0.00017103918228279387,
273
+ "loss": 0.9091,
274
  "step": 340
275
  },
276
  {
277
+ "epoch": 0.6,
278
+ "grad_norm": 2.4493370056152344,
279
+ "learning_rate": 0.00017018739352640547,
280
+ "loss": 0.9295,
281
  "step": 350
282
  },
283
  {
284
+ "epoch": 0.61,
285
+ "grad_norm": 1.6286579370498657,
286
+ "learning_rate": 0.00016933560477001706,
287
+ "loss": 1.1049,
288
  "step": 360
289
  },
290
  {
291
+ "epoch": 0.63,
292
+ "grad_norm": 3.559056043624878,
293
+ "learning_rate": 0.00016848381601362864,
294
+ "loss": 0.9566,
295
  "step": 370
296
  },
297
  {
298
+ "epoch": 0.65,
299
+ "grad_norm": 1.4250924587249756,
300
+ "learning_rate": 0.00016763202725724022,
301
+ "loss": 0.7772,
302
  "step": 380
303
  },
304
  {
305
+ "epoch": 0.66,
306
+ "grad_norm": 1.5668089389801025,
307
+ "learning_rate": 0.0001667802385008518,
308
+ "loss": 0.8869,
309
  "step": 390
310
  },
311
  {
312
+ "epoch": 0.68,
313
+ "grad_norm": 2.725231885910034,
314
+ "learning_rate": 0.00016592844974446338,
315
+ "loss": 0.865,
316
  "step": 400
317
  },
318
  {
319
+ "epoch": 0.68,
320
+ "eval_accuracy": 0.6388415672913118,
321
+ "eval_loss": 0.9287859201431274,
322
+ "eval_runtime": 38.5489,
323
+ "eval_samples_per_second": 30.455,
324
+ "eval_steps_per_second": 3.813,
325
  "step": 400
326
  },
327
  {
328
+ "epoch": 0.7,
329
+ "grad_norm": 2.6907713413238525,
330
+ "learning_rate": 0.00016507666098807496,
331
+ "loss": 0.899,
332
  "step": 410
333
  },
334
  {
335
+ "epoch": 0.72,
336
+ "grad_norm": 2.402860164642334,
337
+ "learning_rate": 0.00016422487223168654,
338
+ "loss": 0.9506,
339
  "step": 420
340
  },
341
  {
342
+ "epoch": 0.73,
343
+ "grad_norm": 2.749433994293213,
344
+ "learning_rate": 0.00016337308347529815,
345
+ "loss": 0.8529,
346
  "step": 430
347
  },
348
  {
349
+ "epoch": 0.75,
350
+ "grad_norm": 1.92979097366333,
351
+ "learning_rate": 0.0001625212947189097,
352
+ "loss": 0.8695,
353
  "step": 440
354
  },
355
  {
356
+ "epoch": 0.77,
357
+ "grad_norm": 2.793747901916504,
358
+ "learning_rate": 0.0001616695059625213,
359
+ "loss": 0.8614,
360
  "step": 450
361
  },
362
  {
363
+ "epoch": 0.78,
364
+ "grad_norm": 2.483780860900879,
365
+ "learning_rate": 0.0001608177172061329,
366
+ "loss": 0.9176,
367
  "step": 460
368
  },
369
  {
370
+ "epoch": 0.8,
371
+ "grad_norm": 1.7278929948806763,
372
+ "learning_rate": 0.00015996592844974447,
373
+ "loss": 0.9656,
374
  "step": 470
375
  },
376
  {
377
+ "epoch": 0.82,
378
+ "grad_norm": 2.649017810821533,
379
+ "learning_rate": 0.00015911413969335605,
380
+ "loss": 0.8653,
381
  "step": 480
382
  },
383
  {
384
+ "epoch": 0.83,
385
+ "grad_norm": 1.8457053899765015,
386
+ "learning_rate": 0.00015826235093696763,
387
+ "loss": 0.7707,
388
  "step": 490
389
  },
390
+ {
391
+ "epoch": 0.85,
392
+ "grad_norm": 2.824699640274048,
393
+ "learning_rate": 0.00015741056218057921,
394
+ "loss": 0.8494,
395
+ "step": 500
396
+ },
397
+ {
398
+ "epoch": 0.85,
399
+ "eval_accuracy": 0.651618398637138,
400
+ "eval_loss": 0.8572959303855896,
401
+ "eval_runtime": 38.0883,
402
+ "eval_samples_per_second": 30.823,
403
+ "eval_steps_per_second": 3.859,
404
+ "step": 500
405
+ },
406
+ {
407
+ "epoch": 0.87,
408
+ "grad_norm": 1.9104124307632446,
409
+ "learning_rate": 0.0001565587734241908,
410
+ "loss": 0.8113,
411
+ "step": 510
412
+ },
413
+ {
414
+ "epoch": 0.89,
415
+ "grad_norm": 2.2717394828796387,
416
+ "learning_rate": 0.0001557069846678024,
417
+ "loss": 0.9194,
418
+ "step": 520
419
+ },
420
+ {
421
+ "epoch": 0.9,
422
+ "grad_norm": 1.891735553741455,
423
+ "learning_rate": 0.00015485519591141398,
424
+ "loss": 0.9337,
425
+ "step": 530
426
+ },
427
+ {
428
+ "epoch": 0.92,
429
+ "grad_norm": 4.8229146003723145,
430
+ "learning_rate": 0.00015400340715502557,
431
+ "loss": 0.9033,
432
+ "step": 540
433
+ },
434
+ {
435
+ "epoch": 0.94,
436
+ "grad_norm": 2.656970977783203,
437
+ "learning_rate": 0.00015315161839863715,
438
+ "loss": 0.8985,
439
+ "step": 550
440
+ },
441
+ {
442
+ "epoch": 0.95,
443
+ "grad_norm": 2.2908411026000977,
444
+ "learning_rate": 0.00015229982964224873,
445
+ "loss": 0.8708,
446
+ "step": 560
447
+ },
448
+ {
449
+ "epoch": 0.97,
450
+ "grad_norm": 2.141950845718384,
451
+ "learning_rate": 0.0001514480408858603,
452
+ "loss": 0.9298,
453
+ "step": 570
454
+ },
455
+ {
456
+ "epoch": 0.99,
457
+ "grad_norm": 2.5572831630706787,
458
+ "learning_rate": 0.00015059625212947192,
459
+ "loss": 0.9101,
460
+ "step": 580
461
+ },
462
+ {
463
+ "epoch": 1.01,
464
+ "grad_norm": 2.2453222274780273,
465
+ "learning_rate": 0.00014982964224872234,
466
+ "loss": 0.8034,
467
+ "step": 590
468
+ },
469
+ {
470
+ "epoch": 1.02,
471
+ "grad_norm": 2.2874865531921387,
472
+ "learning_rate": 0.00014897785349233392,
473
+ "loss": 0.8151,
474
+ "step": 600
475
+ },
476
+ {
477
+ "epoch": 1.02,
478
+ "eval_accuracy": 0.6396933560477002,
479
+ "eval_loss": 0.87294602394104,
480
+ "eval_runtime": 38.7251,
481
+ "eval_samples_per_second": 30.316,
482
+ "eval_steps_per_second": 3.796,
483
+ "step": 600
484
+ },
485
+ {
486
+ "epoch": 1.04,
487
+ "grad_norm": 2.950303554534912,
488
+ "learning_rate": 0.0001481260647359455,
489
+ "loss": 0.7484,
490
+ "step": 610
491
+ },
492
+ {
493
+ "epoch": 1.06,
494
+ "grad_norm": 1.9773017168045044,
495
+ "learning_rate": 0.00014727427597955708,
496
+ "loss": 0.6572,
497
+ "step": 620
498
+ },
499
+ {
500
+ "epoch": 1.07,
501
+ "grad_norm": 2.9777700901031494,
502
+ "learning_rate": 0.00014642248722316866,
503
+ "loss": 0.6927,
504
+ "step": 630
505
+ },
506
+ {
507
+ "epoch": 1.09,
508
+ "grad_norm": 3.323662519454956,
509
+ "learning_rate": 0.00014557069846678024,
510
+ "loss": 0.5812,
511
+ "step": 640
512
+ },
513
+ {
514
+ "epoch": 1.11,
515
+ "grad_norm": 1.9647018909454346,
516
+ "learning_rate": 0.00014471890971039185,
517
+ "loss": 0.6166,
518
+ "step": 650
519
+ },
520
+ {
521
+ "epoch": 1.12,
522
+ "grad_norm": 3.215794563293457,
523
+ "learning_rate": 0.0001438671209540034,
524
+ "loss": 0.602,
525
+ "step": 660
526
+ },
527
+ {
528
+ "epoch": 1.14,
529
+ "grad_norm": 2.8758130073547363,
530
+ "learning_rate": 0.000143015332197615,
531
+ "loss": 0.5224,
532
+ "step": 670
533
+ },
534
+ {
535
+ "epoch": 1.16,
536
+ "grad_norm": 2.142829179763794,
537
+ "learning_rate": 0.00014216354344122656,
538
+ "loss": 0.5663,
539
+ "step": 680
540
+ },
541
+ {
542
+ "epoch": 1.18,
543
+ "grad_norm": 6.860159397125244,
544
+ "learning_rate": 0.00014131175468483817,
545
+ "loss": 0.6479,
546
+ "step": 690
547
+ },
548
+ {
549
+ "epoch": 1.19,
550
+ "grad_norm": 3.3176701068878174,
551
+ "learning_rate": 0.00014045996592844975,
552
+ "loss": 0.5787,
553
+ "step": 700
554
+ },
555
+ {
556
+ "epoch": 1.19,
557
+ "eval_accuracy": 0.6448040885860307,
558
+ "eval_loss": 0.9067147970199585,
559
+ "eval_runtime": 38.2427,
560
+ "eval_samples_per_second": 30.699,
561
+ "eval_steps_per_second": 3.844,
562
+ "step": 700
563
+ },
564
+ {
565
+ "epoch": 1.21,
566
+ "grad_norm": 2.322371482849121,
567
+ "learning_rate": 0.00013960817717206133,
568
+ "loss": 0.6849,
569
+ "step": 710
570
+ },
571
+ {
572
+ "epoch": 1.23,
573
+ "grad_norm": 1.875775933265686,
574
+ "learning_rate": 0.00013875638841567291,
575
+ "loss": 0.6399,
576
+ "step": 720
577
+ },
578
+ {
579
+ "epoch": 1.24,
580
+ "grad_norm": 2.0012145042419434,
581
+ "learning_rate": 0.0001379045996592845,
582
+ "loss": 0.725,
583
+ "step": 730
584
+ },
585
+ {
586
+ "epoch": 1.26,
587
+ "grad_norm": 2.5320353507995605,
588
+ "learning_rate": 0.00013705281090289608,
589
+ "loss": 0.5306,
590
+ "step": 740
591
+ },
592
+ {
593
+ "epoch": 1.28,
594
+ "grad_norm": 2.29856538772583,
595
+ "learning_rate": 0.00013620102214650768,
596
+ "loss": 0.5731,
597
+ "step": 750
598
+ },
599
+ {
600
+ "epoch": 1.29,
601
+ "grad_norm": 1.8604925870895386,
602
+ "learning_rate": 0.00013534923339011926,
603
+ "loss": 0.6806,
604
+ "step": 760
605
+ },
606
+ {
607
+ "epoch": 1.31,
608
+ "grad_norm": 2.6868739128112793,
609
+ "learning_rate": 0.00013449744463373084,
610
+ "loss": 0.5944,
611
+ "step": 770
612
+ },
613
+ {
614
+ "epoch": 1.33,
615
+ "grad_norm": 3.3680803775787354,
616
+ "learning_rate": 0.00013364565587734243,
617
+ "loss": 0.6412,
618
+ "step": 780
619
+ },
620
+ {
621
+ "epoch": 1.35,
622
+ "grad_norm": 2.798149824142456,
623
+ "learning_rate": 0.000132793867120954,
624
+ "loss": 0.5235,
625
+ "step": 790
626
+ },
627
+ {
628
+ "epoch": 1.36,
629
+ "grad_norm": 2.4862072467803955,
630
+ "learning_rate": 0.00013194207836456561,
631
+ "loss": 0.7768,
632
+ "step": 800
633
+ },
634
+ {
635
+ "epoch": 1.36,
636
+ "eval_accuracy": 0.6533219761499148,
637
+ "eval_loss": 0.8995758295059204,
638
+ "eval_runtime": 38.4107,
639
+ "eval_samples_per_second": 30.564,
640
+ "eval_steps_per_second": 3.827,
641
+ "step": 800
642
+ },
643
+ {
644
+ "epoch": 1.38,
645
+ "grad_norm": 3.291276216506958,
646
+ "learning_rate": 0.00013109028960817717,
647
+ "loss": 0.669,
648
+ "step": 810
649
+ },
650
+ {
651
+ "epoch": 1.4,
652
+ "grad_norm": 2.814397096633911,
653
+ "learning_rate": 0.00013023850085178878,
654
+ "loss": 0.5539,
655
+ "step": 820
656
+ },
657
+ {
658
+ "epoch": 1.41,
659
+ "grad_norm": 2.5982093811035156,
660
+ "learning_rate": 0.00012938671209540033,
661
+ "loss": 0.6565,
662
+ "step": 830
663
+ },
664
+ {
665
+ "epoch": 1.43,
666
+ "grad_norm": 3.1191565990448,
667
+ "learning_rate": 0.00012853492333901194,
668
+ "loss": 0.533,
669
+ "step": 840
670
+ },
671
+ {
672
+ "epoch": 1.45,
673
+ "grad_norm": 5.229197025299072,
674
+ "learning_rate": 0.00012768313458262352,
675
+ "loss": 0.6123,
676
+ "step": 850
677
+ },
678
+ {
679
+ "epoch": 1.47,
680
+ "grad_norm": 2.259110689163208,
681
+ "learning_rate": 0.0001268313458262351,
682
+ "loss": 0.5183,
683
+ "step": 860
684
+ },
685
+ {
686
+ "epoch": 1.48,
687
+ "grad_norm": 3.099496364593506,
688
+ "learning_rate": 0.00012597955706984668,
689
+ "loss": 0.6911,
690
+ "step": 870
691
+ },
692
+ {
693
+ "epoch": 1.5,
694
+ "grad_norm": 2.9909987449645996,
695
+ "learning_rate": 0.00012512776831345826,
696
+ "loss": 0.6671,
697
+ "step": 880
698
+ },
699
+ {
700
+ "epoch": 1.52,
701
+ "grad_norm": 3.1856462955474854,
702
+ "learning_rate": 0.00012427597955706984,
703
+ "loss": 0.6652,
704
+ "step": 890
705
+ },
706
+ {
707
+ "epoch": 1.53,
708
+ "grad_norm": 3.9080755710601807,
709
+ "learning_rate": 0.00012342419080068145,
710
+ "loss": 0.6098,
711
+ "step": 900
712
+ },
713
+ {
714
+ "epoch": 1.53,
715
+ "eval_accuracy": 0.6695059625212947,
716
+ "eval_loss": 0.8459659218788147,
717
+ "eval_runtime": 37.8733,
718
+ "eval_samples_per_second": 30.998,
719
+ "eval_steps_per_second": 3.881,
720
+ "step": 900
721
+ },
722
+ {
723
+ "epoch": 1.55,
724
+ "grad_norm": 1.7587580680847168,
725
+ "learning_rate": 0.000122572402044293,
726
+ "loss": 0.7362,
727
+ "step": 910
728
+ },
729
+ {
730
+ "epoch": 1.57,
731
+ "grad_norm": 2.7327494621276855,
732
+ "learning_rate": 0.00012172061328790461,
733
+ "loss": 0.5863,
734
+ "step": 920
735
+ },
736
+ {
737
+ "epoch": 1.58,
738
+ "grad_norm": 4.113401889801025,
739
+ "learning_rate": 0.0001208688245315162,
740
+ "loss": 0.8205,
741
+ "step": 930
742
+ },
743
+ {
744
+ "epoch": 1.6,
745
+ "grad_norm": 4.598094940185547,
746
+ "learning_rate": 0.00012001703577512777,
747
+ "loss": 0.7198,
748
+ "step": 940
749
+ },
750
+ {
751
+ "epoch": 1.62,
752
+ "grad_norm": 2.8792037963867188,
753
+ "learning_rate": 0.00011916524701873937,
754
+ "loss": 0.6532,
755
+ "step": 950
756
+ },
757
+ {
758
+ "epoch": 1.64,
759
+ "grad_norm": 2.949414014816284,
760
+ "learning_rate": 0.00011831345826235094,
761
+ "loss": 0.6783,
762
+ "step": 960
763
+ },
764
+ {
765
+ "epoch": 1.65,
766
+ "grad_norm": 2.300352096557617,
767
+ "learning_rate": 0.00011746166950596253,
768
+ "loss": 0.69,
769
+ "step": 970
770
+ },
771
+ {
772
+ "epoch": 1.67,
773
+ "grad_norm": 2.5100274085998535,
774
+ "learning_rate": 0.00011660988074957411,
775
+ "loss": 0.7028,
776
+ "step": 980
777
+ },
778
+ {
779
+ "epoch": 1.69,
780
+ "grad_norm": 2.372359275817871,
781
+ "learning_rate": 0.0001157580919931857,
782
+ "loss": 0.5673,
783
+ "step": 990
784
+ },
785
+ {
786
+ "epoch": 1.7,
787
+ "grad_norm": 4.268792152404785,
788
+ "learning_rate": 0.00011490630323679727,
789
+ "loss": 0.6251,
790
+ "step": 1000
791
+ },
792
+ {
793
+ "epoch": 1.7,
794
+ "eval_accuracy": 0.6703577512776832,
795
+ "eval_loss": 0.8609783053398132,
796
+ "eval_runtime": 37.811,
797
+ "eval_samples_per_second": 31.049,
798
+ "eval_steps_per_second": 3.888,
799
+ "step": 1000
800
+ },
801
+ {
802
+ "epoch": 1.72,
803
+ "grad_norm": 3.081153154373169,
804
+ "learning_rate": 0.00011405451448040887,
805
+ "loss": 0.7021,
806
+ "step": 1010
807
+ },
808
+ {
809
+ "epoch": 1.74,
810
+ "grad_norm": 2.9631364345550537,
811
+ "learning_rate": 0.0001132879045996593,
812
+ "loss": 0.5469,
813
+ "step": 1020
814
+ },
815
+ {
816
+ "epoch": 1.75,
817
+ "grad_norm": 3.2896649837493896,
818
+ "learning_rate": 0.00011243611584327087,
819
+ "loss": 0.5593,
820
+ "step": 1030
821
+ },
822
+ {
823
+ "epoch": 1.77,
824
+ "grad_norm": 3.8375134468078613,
825
+ "learning_rate": 0.00011158432708688246,
826
+ "loss": 0.5499,
827
+ "step": 1040
828
+ },
829
+ {
830
+ "epoch": 1.79,
831
+ "grad_norm": 1.5597748756408691,
832
+ "learning_rate": 0.00011073253833049404,
833
+ "loss": 0.5529,
834
+ "step": 1050
835
+ },
836
+ {
837
+ "epoch": 1.81,
838
+ "grad_norm": 4.54299783706665,
839
+ "learning_rate": 0.00010988074957410564,
840
+ "loss": 0.6211,
841
+ "step": 1060
842
+ },
843
+ {
844
+ "epoch": 1.82,
845
+ "grad_norm": 3.2734501361846924,
846
+ "learning_rate": 0.0001090289608177172,
847
+ "loss": 0.7002,
848
+ "step": 1070
849
+ },
850
+ {
851
+ "epoch": 1.84,
852
+ "grad_norm": 3.7582859992980957,
853
+ "learning_rate": 0.0001081771720613288,
854
+ "loss": 0.7465,
855
+ "step": 1080
856
+ },
857
+ {
858
+ "epoch": 1.86,
859
+ "grad_norm": 2.190544605255127,
860
+ "learning_rate": 0.00010732538330494038,
861
+ "loss": 0.6662,
862
+ "step": 1090
863
+ },
864
+ {
865
+ "epoch": 1.87,
866
+ "grad_norm": 1.7477951049804688,
867
+ "learning_rate": 0.00010647359454855197,
868
+ "loss": 0.7863,
869
+ "step": 1100
870
+ },
871
+ {
872
+ "epoch": 1.87,
873
+ "eval_accuracy": 0.6431005110732538,
874
+ "eval_loss": 0.8668282628059387,
875
+ "eval_runtime": 37.5178,
876
+ "eval_samples_per_second": 31.292,
877
+ "eval_steps_per_second": 3.918,
878
+ "step": 1100
879
+ },
880
+ {
881
+ "epoch": 1.89,
882
+ "grad_norm": 1.9970145225524902,
883
+ "learning_rate": 0.00010562180579216354,
884
+ "loss": 0.5988,
885
+ "step": 1110
886
+ },
887
+ {
888
+ "epoch": 1.91,
889
+ "grad_norm": 3.718055248260498,
890
+ "learning_rate": 0.00010477001703577514,
891
+ "loss": 0.5973,
892
+ "step": 1120
893
+ },
894
+ {
895
+ "epoch": 1.93,
896
+ "grad_norm": 1.6347967386245728,
897
+ "learning_rate": 0.0001039182282793867,
898
+ "loss": 0.5818,
899
+ "step": 1130
900
+ },
901
+ {
902
+ "epoch": 1.94,
903
+ "grad_norm": 2.3118577003479004,
904
+ "learning_rate": 0.0001030664395229983,
905
+ "loss": 0.5136,
906
+ "step": 1140
907
+ },
908
+ {
909
+ "epoch": 1.96,
910
+ "grad_norm": 2.806833267211914,
911
+ "learning_rate": 0.00010221465076660988,
912
+ "loss": 0.5353,
913
+ "step": 1150
914
+ },
915
+ {
916
+ "epoch": 1.98,
917
+ "grad_norm": 2.699890375137329,
918
+ "learning_rate": 0.00010136286201022147,
919
+ "loss": 0.5498,
920
+ "step": 1160
921
+ },
922
+ {
923
+ "epoch": 1.99,
924
+ "grad_norm": 2.3461856842041016,
925
+ "learning_rate": 0.00010051107325383304,
926
+ "loss": 0.7181,
927
+ "step": 1170
928
+ },
929
+ {
930
+ "epoch": 2.01,
931
+ "grad_norm": 2.483959436416626,
932
+ "learning_rate": 9.965928449744463e-05,
933
+ "loss": 0.3872,
934
+ "step": 1180
935
+ },
936
+ {
937
+ "epoch": 2.03,
938
+ "grad_norm": 2.1393377780914307,
939
+ "learning_rate": 9.880749574105622e-05,
940
+ "loss": 0.292,
941
+ "step": 1190
942
+ },
943
+ {
944
+ "epoch": 2.04,
945
+ "grad_norm": 1.6828927993774414,
946
+ "learning_rate": 9.795570698466781e-05,
947
+ "loss": 0.2595,
948
+ "step": 1200
949
+ },
950
+ {
951
+ "epoch": 2.04,
952
+ "eval_accuracy": 0.6839863713798978,
953
+ "eval_loss": 0.8725138902664185,
954
+ "eval_runtime": 37.2408,
955
+ "eval_samples_per_second": 31.525,
956
+ "eval_steps_per_second": 3.947,
957
+ "step": 1200
958
+ },
959
+ {
960
+ "epoch": 2.06,
961
+ "grad_norm": 1.973240852355957,
962
+ "learning_rate": 9.710391822827939e-05,
963
+ "loss": 0.2644,
964
+ "step": 1210
965
+ },
966
+ {
967
+ "epoch": 2.08,
968
+ "grad_norm": 2.932751417160034,
969
+ "learning_rate": 9.625212947189097e-05,
970
+ "loss": 0.2925,
971
+ "step": 1220
972
+ },
973
+ {
974
+ "epoch": 2.1,
975
+ "grad_norm": 3.356760025024414,
976
+ "learning_rate": 9.540034071550255e-05,
977
+ "loss": 0.312,
978
+ "step": 1230
979
+ },
980
+ {
981
+ "epoch": 2.11,
982
+ "grad_norm": 1.7125446796417236,
983
+ "learning_rate": 9.454855195911415e-05,
984
+ "loss": 0.2277,
985
+ "step": 1240
986
+ },
987
+ {
988
+ "epoch": 2.13,
989
+ "grad_norm": 1.714805006980896,
990
+ "learning_rate": 9.369676320272573e-05,
991
+ "loss": 0.3301,
992
+ "step": 1250
993
+ },
994
+ {
995
+ "epoch": 2.15,
996
+ "grad_norm": 2.301734685897827,
997
+ "learning_rate": 9.284497444633732e-05,
998
+ "loss": 0.2668,
999
+ "step": 1260
1000
+ },
1001
+ {
1002
+ "epoch": 2.16,
1003
+ "grad_norm": 2.4843878746032715,
1004
+ "learning_rate": 9.19931856899489e-05,
1005
+ "loss": 0.2333,
1006
+ "step": 1270
1007
+ },
1008
+ {
1009
+ "epoch": 2.18,
1010
+ "grad_norm": 2.9054977893829346,
1011
+ "learning_rate": 9.114139693356048e-05,
1012
+ "loss": 0.3492,
1013
+ "step": 1280
1014
+ },
1015
+ {
1016
+ "epoch": 2.2,
1017
+ "grad_norm": 4.664933681488037,
1018
+ "learning_rate": 9.028960817717206e-05,
1019
+ "loss": 0.3754,
1020
+ "step": 1290
1021
+ },
1022
+ {
1023
+ "epoch": 2.21,
1024
+ "grad_norm": 2.1164679527282715,
1025
+ "learning_rate": 8.943781942078366e-05,
1026
+ "loss": 0.2735,
1027
+ "step": 1300
1028
+ },
1029
+ {
1030
+ "epoch": 2.21,
1031
+ "eval_accuracy": 0.6746166950596252,
1032
+ "eval_loss": 0.9306557178497314,
1033
+ "eval_runtime": 37.0944,
1034
+ "eval_samples_per_second": 31.649,
1035
+ "eval_steps_per_second": 3.963,
1036
+ "step": 1300
1037
+ },
1038
+ {
1039
+ "epoch": 2.23,
1040
+ "grad_norm": 4.541740894317627,
1041
+ "learning_rate": 8.858603066439524e-05,
1042
+ "loss": 0.3835,
1043
+ "step": 1310
1044
+ },
1045
+ {
1046
+ "epoch": 2.25,
1047
+ "grad_norm": 3.0828359127044678,
1048
+ "learning_rate": 8.773424190800682e-05,
1049
+ "loss": 0.3189,
1050
+ "step": 1320
1051
+ },
1052
+ {
1053
+ "epoch": 2.27,
1054
+ "grad_norm": 2.398512363433838,
1055
+ "learning_rate": 8.68824531516184e-05,
1056
+ "loss": 0.29,
1057
+ "step": 1330
1058
+ },
1059
+ {
1060
+ "epoch": 2.28,
1061
+ "grad_norm": 3.069840908050537,
1062
+ "learning_rate": 8.603066439522998e-05,
1063
+ "loss": 0.288,
1064
+ "step": 1340
1065
+ },
1066
+ {
1067
+ "epoch": 2.3,
1068
+ "grad_norm": 5.078506946563721,
1069
+ "learning_rate": 8.517887563884158e-05,
1070
+ "loss": 0.2772,
1071
+ "step": 1350
1072
+ },
1073
+ {
1074
+ "epoch": 2.32,
1075
+ "grad_norm": 2.812199354171753,
1076
+ "learning_rate": 8.432708688245316e-05,
1077
+ "loss": 0.2951,
1078
+ "step": 1360
1079
+ },
1080
+ {
1081
+ "epoch": 2.33,
1082
+ "grad_norm": 4.542017936706543,
1083
+ "learning_rate": 8.347529812606474e-05,
1084
+ "loss": 0.2142,
1085
+ "step": 1370
1086
+ },
1087
+ {
1088
+ "epoch": 2.35,
1089
+ "grad_norm": 3.7486696243286133,
1090
+ "learning_rate": 8.262350936967632e-05,
1091
+ "loss": 0.257,
1092
+ "step": 1380
1093
+ },
1094
+ {
1095
+ "epoch": 2.37,
1096
+ "grad_norm": 3.5566983222961426,
1097
+ "learning_rate": 8.17717206132879e-05,
1098
+ "loss": 0.2816,
1099
+ "step": 1390
1100
+ },
1101
+ {
1102
+ "epoch": 2.39,
1103
+ "grad_norm": 1.3465384244918823,
1104
+ "learning_rate": 8.09199318568995e-05,
1105
+ "loss": 0.2429,
1106
+ "step": 1400
1107
+ },
1108
+ {
1109
+ "epoch": 2.39,
1110
+ "eval_accuracy": 0.6354344122657581,
1111
+ "eval_loss": 1.0957823991775513,
1112
+ "eval_runtime": 37.2033,
1113
+ "eval_samples_per_second": 31.556,
1114
+ "eval_steps_per_second": 3.951,
1115
+ "step": 1400
1116
+ },
1117
+ {
1118
+ "epoch": 2.4,
1119
+ "grad_norm": 2.310131788253784,
1120
+ "learning_rate": 8.006814310051108e-05,
1121
+ "loss": 0.306,
1122
+ "step": 1410
1123
+ },
1124
+ {
1125
+ "epoch": 2.42,
1126
+ "grad_norm": 3.1297261714935303,
1127
+ "learning_rate": 7.921635434412266e-05,
1128
+ "loss": 0.3257,
1129
+ "step": 1420
1130
+ },
1131
+ {
1132
+ "epoch": 2.44,
1133
+ "grad_norm": 1.8082480430603027,
1134
+ "learning_rate": 7.836456558773425e-05,
1135
+ "loss": 0.2001,
1136
+ "step": 1430
1137
+ },
1138
+ {
1139
+ "epoch": 2.45,
1140
+ "grad_norm": 1.7700148820877075,
1141
+ "learning_rate": 7.751277683134583e-05,
1142
+ "loss": 0.3476,
1143
+ "step": 1440
1144
+ },
1145
+ {
1146
+ "epoch": 2.47,
1147
+ "grad_norm": 4.247625350952148,
1148
+ "learning_rate": 7.666098807495741e-05,
1149
+ "loss": 0.2323,
1150
+ "step": 1450
1151
+ },
1152
+ {
1153
+ "epoch": 2.49,
1154
+ "grad_norm": 4.059571743011475,
1155
+ "learning_rate": 7.5809199318569e-05,
1156
+ "loss": 0.3089,
1157
+ "step": 1460
1158
+ },
1159
+ {
1160
+ "epoch": 2.5,
1161
+ "grad_norm": 3.2417612075805664,
1162
+ "learning_rate": 7.495741056218059e-05,
1163
+ "loss": 0.1964,
1164
+ "step": 1470
1165
+ },
1166
+ {
1167
+ "epoch": 2.52,
1168
+ "grad_norm": 5.7817463874816895,
1169
+ "learning_rate": 7.410562180579217e-05,
1170
+ "loss": 0.3549,
1171
+ "step": 1480
1172
+ },
1173
+ {
1174
+ "epoch": 2.54,
1175
+ "grad_norm": 5.440825939178467,
1176
+ "learning_rate": 7.325383304940375e-05,
1177
+ "loss": 0.3085,
1178
+ "step": 1490
1179
+ },
1180
+ {
1181
+ "epoch": 2.56,
1182
+ "grad_norm": 4.482067108154297,
1183
+ "learning_rate": 7.240204429301533e-05,
1184
+ "loss": 0.3224,
1185
+ "step": 1500
1186
+ },
1187
+ {
1188
+ "epoch": 2.56,
1189
+ "eval_accuracy": 0.6686541737649063,
1190
+ "eval_loss": 1.0305246114730835,
1191
+ "eval_runtime": 37.1181,
1192
+ "eval_samples_per_second": 31.629,
1193
+ "eval_steps_per_second": 3.96,
1194
+ "step": 1500
1195
+ },
1196
+ {
1197
+ "epoch": 2.57,
1198
+ "grad_norm": 2.1568057537078857,
1199
+ "learning_rate": 7.155025553662692e-05,
1200
+ "loss": 0.1612,
1201
+ "step": 1510
1202
+ },
1203
+ {
1204
+ "epoch": 2.59,
1205
+ "grad_norm": 1.293427586555481,
1206
+ "learning_rate": 7.06984667802385e-05,
1207
+ "loss": 0.3217,
1208
+ "step": 1520
1209
+ },
1210
+ {
1211
+ "epoch": 2.61,
1212
+ "grad_norm": 4.301244258880615,
1213
+ "learning_rate": 6.984667802385009e-05,
1214
+ "loss": 0.2378,
1215
+ "step": 1530
1216
+ },
1217
+ {
1218
+ "epoch": 2.62,
1219
+ "grad_norm": 1.6040468215942383,
1220
+ "learning_rate": 6.899488926746167e-05,
1221
+ "loss": 0.2801,
1222
+ "step": 1540
1223
+ },
1224
+ {
1225
+ "epoch": 2.64,
1226
+ "grad_norm": 0.7993047833442688,
1227
+ "learning_rate": 6.814310051107326e-05,
1228
+ "loss": 0.2637,
1229
+ "step": 1550
1230
+ },
1231
+ {
1232
+ "epoch": 2.66,
1233
+ "grad_norm": 4.865533828735352,
1234
+ "learning_rate": 6.729131175468484e-05,
1235
+ "loss": 0.3441,
1236
+ "step": 1560
1237
+ },
1238
+ {
1239
+ "epoch": 2.67,
1240
+ "grad_norm": 1.7501546144485474,
1241
+ "learning_rate": 6.643952299829642e-05,
1242
+ "loss": 0.2523,
1243
+ "step": 1570
1244
+ },
1245
+ {
1246
+ "epoch": 2.69,
1247
+ "grad_norm": 1.331475019454956,
1248
+ "learning_rate": 6.5587734241908e-05,
1249
+ "loss": 0.2127,
1250
+ "step": 1580
1251
+ },
1252
+ {
1253
+ "epoch": 2.71,
1254
+ "grad_norm": 3.352147102355957,
1255
+ "learning_rate": 6.473594548551958e-05,
1256
+ "loss": 0.3432,
1257
+ "step": 1590
1258
+ },
1259
+ {
1260
+ "epoch": 2.73,
1261
+ "grad_norm": 0.3470512330532074,
1262
+ "learning_rate": 6.388415672913118e-05,
1263
+ "loss": 0.1602,
1264
+ "step": 1600
1265
+ },
1266
+ {
1267
+ "epoch": 2.73,
1268
+ "eval_accuracy": 0.6746166950596252,
1269
+ "eval_loss": 1.0072139501571655,
1270
+ "eval_runtime": 37.0019,
1271
+ "eval_samples_per_second": 31.728,
1272
+ "eval_steps_per_second": 3.973,
1273
+ "step": 1600
1274
+ },
1275
+ {
1276
+ "epoch": 2.74,
1277
+ "grad_norm": 3.1594250202178955,
1278
+ "learning_rate": 6.303236797274277e-05,
1279
+ "loss": 0.1929,
1280
+ "step": 1610
1281
+ },
1282
+ {
1283
+ "epoch": 2.76,
1284
+ "grad_norm": 4.477923393249512,
1285
+ "learning_rate": 6.218057921635435e-05,
1286
+ "loss": 0.2696,
1287
+ "step": 1620
1288
+ },
1289
+ {
1290
+ "epoch": 2.78,
1291
+ "grad_norm": 3.042938232421875,
1292
+ "learning_rate": 6.132879045996594e-05,
1293
+ "loss": 0.2527,
1294
+ "step": 1630
1295
+ },
1296
+ {
1297
+ "epoch": 2.79,
1298
+ "grad_norm": 0.8534514904022217,
1299
+ "learning_rate": 6.0477001703577516e-05,
1300
+ "loss": 0.1727,
1301
+ "step": 1640
1302
+ },
1303
+ {
1304
+ "epoch": 2.81,
1305
+ "grad_norm": 2.2307116985321045,
1306
+ "learning_rate": 5.9625212947189104e-05,
1307
+ "loss": 0.3178,
1308
+ "step": 1650
1309
+ },
1310
+ {
1311
+ "epoch": 2.83,
1312
+ "grad_norm": 3.302003860473633,
1313
+ "learning_rate": 5.8773424190800684e-05,
1314
+ "loss": 0.2973,
1315
+ "step": 1660
1316
+ },
1317
+ {
1318
+ "epoch": 2.84,
1319
+ "grad_norm": 5.320656776428223,
1320
+ "learning_rate": 5.792163543441227e-05,
1321
+ "loss": 0.2715,
1322
+ "step": 1670
1323
+ },
1324
+ {
1325
+ "epoch": 2.86,
1326
+ "grad_norm": 3.923163414001465,
1327
+ "learning_rate": 5.706984667802385e-05,
1328
+ "loss": 0.1991,
1329
+ "step": 1680
1330
+ },
1331
+ {
1332
+ "epoch": 2.88,
1333
+ "grad_norm": 7.479254245758057,
1334
+ "learning_rate": 5.6218057921635434e-05,
1335
+ "loss": 0.321,
1336
+ "step": 1690
1337
+ },
1338
+ {
1339
+ "epoch": 2.9,
1340
+ "grad_norm": 2.2710225582122803,
1341
+ "learning_rate": 5.536626916524702e-05,
1342
+ "loss": 0.2042,
1343
+ "step": 1700
1344
+ },
1345
+ {
1346
+ "epoch": 2.9,
1347
+ "eval_accuracy": 0.6788756388415673,
1348
+ "eval_loss": 1.0971218347549438,
1349
+ "eval_runtime": 36.9173,
1350
+ "eval_samples_per_second": 31.801,
1351
+ "eval_steps_per_second": 3.982,
1352
+ "step": 1700
1353
+ },
1354
+ {
1355
+ "epoch": 2.91,
1356
+ "grad_norm": 2.7610058784484863,
1357
+ "learning_rate": 5.45144804088586e-05,
1358
+ "loss": 0.3396,
1359
+ "step": 1710
1360
+ },
1361
+ {
1362
+ "epoch": 2.93,
1363
+ "grad_norm": 2.2475104331970215,
1364
+ "learning_rate": 5.366269165247019e-05,
1365
+ "loss": 0.266,
1366
+ "step": 1720
1367
+ },
1368
+ {
1369
+ "epoch": 2.95,
1370
+ "grad_norm": 4.55673885345459,
1371
+ "learning_rate": 5.281090289608177e-05,
1372
+ "loss": 0.341,
1373
+ "step": 1730
1374
+ },
1375
+ {
1376
+ "epoch": 2.96,
1377
+ "grad_norm": 4.0248260498046875,
1378
+ "learning_rate": 5.195911413969335e-05,
1379
+ "loss": 0.2005,
1380
+ "step": 1740
1381
+ },
1382
+ {
1383
+ "epoch": 2.98,
1384
+ "grad_norm": 4.798257827758789,
1385
+ "learning_rate": 5.110732538330494e-05,
1386
+ "loss": 0.2615,
1387
+ "step": 1750
1388
+ },
1389
+ {
1390
+ "epoch": 3.0,
1391
+ "grad_norm": 3.2967402935028076,
1392
+ "learning_rate": 5.025553662691652e-05,
1393
+ "loss": 0.1966,
1394
+ "step": 1760
1395
+ },
1396
+ {
1397
+ "epoch": 3.02,
1398
+ "grad_norm": 5.774517059326172,
1399
+ "learning_rate": 4.940374787052811e-05,
1400
+ "loss": 0.1141,
1401
+ "step": 1770
1402
+ },
1403
+ {
1404
+ "epoch": 3.03,
1405
+ "grad_norm": 1.7739803791046143,
1406
+ "learning_rate": 4.8551959114139695e-05,
1407
+ "loss": 0.0671,
1408
+ "step": 1780
1409
+ },
1410
+ {
1411
+ "epoch": 3.05,
1412
+ "grad_norm": 0.8837150931358337,
1413
+ "learning_rate": 4.7700170357751276e-05,
1414
+ "loss": 0.0835,
1415
+ "step": 1790
1416
+ },
1417
+ {
1418
+ "epoch": 3.07,
1419
+ "grad_norm": 1.7833037376403809,
1420
+ "learning_rate": 4.6848381601362864e-05,
1421
+ "loss": 0.0604,
1422
+ "step": 1800
1423
+ },
1424
+ {
1425
+ "epoch": 3.07,
1426
+ "eval_accuracy": 0.6916524701873935,
1427
+ "eval_loss": 1.0816737413406372,
1428
+ "eval_runtime": 36.8222,
1429
+ "eval_samples_per_second": 31.883,
1430
+ "eval_steps_per_second": 3.992,
1431
+ "step": 1800
1432
+ },
1433
+ {
1434
+ "epoch": 3.08,
1435
+ "grad_norm": 0.34585830569267273,
1436
+ "learning_rate": 4.599659284497445e-05,
1437
+ "loss": 0.092,
1438
+ "step": 1810
1439
+ },
1440
+ {
1441
+ "epoch": 3.1,
1442
+ "grad_norm": 0.7962571382522583,
1443
+ "learning_rate": 4.514480408858603e-05,
1444
+ "loss": 0.0587,
1445
+ "step": 1820
1446
+ },
1447
+ {
1448
+ "epoch": 3.12,
1449
+ "grad_norm": 0.16402888298034668,
1450
+ "learning_rate": 4.429301533219762e-05,
1451
+ "loss": 0.0547,
1452
+ "step": 1830
1453
+ },
1454
+ {
1455
+ "epoch": 3.13,
1456
+ "grad_norm": 0.624047040939331,
1457
+ "learning_rate": 4.34412265758092e-05,
1458
+ "loss": 0.0954,
1459
+ "step": 1840
1460
+ },
1461
+ {
1462
+ "epoch": 3.15,
1463
+ "grad_norm": 0.4253842532634735,
1464
+ "learning_rate": 4.258943781942079e-05,
1465
+ "loss": 0.0567,
1466
+ "step": 1850
1467
+ },
1468
+ {
1469
+ "epoch": 3.17,
1470
+ "grad_norm": 0.1523701399564743,
1471
+ "learning_rate": 4.173764906303237e-05,
1472
+ "loss": 0.0413,
1473
+ "step": 1860
1474
+ },
1475
+ {
1476
+ "epoch": 3.19,
1477
+ "grad_norm": 4.592818260192871,
1478
+ "learning_rate": 4.088586030664395e-05,
1479
+ "loss": 0.0968,
1480
+ "step": 1870
1481
+ },
1482
+ {
1483
+ "epoch": 3.2,
1484
+ "grad_norm": 1.4066344499588013,
1485
+ "learning_rate": 4.003407155025554e-05,
1486
+ "loss": 0.1454,
1487
+ "step": 1880
1488
+ },
1489
+ {
1490
+ "epoch": 3.22,
1491
+ "grad_norm": 2.1996095180511475,
1492
+ "learning_rate": 3.9182282793867125e-05,
1493
+ "loss": 0.1128,
1494
+ "step": 1890
1495
+ },
1496
+ {
1497
+ "epoch": 3.24,
1498
+ "grad_norm": 0.102027028799057,
1499
+ "learning_rate": 3.8330494037478706e-05,
1500
+ "loss": 0.0716,
1501
+ "step": 1900
1502
+ },
1503
+ {
1504
+ "epoch": 3.24,
1505
+ "eval_accuracy": 0.692504258943782,
1506
+ "eval_loss": 1.1307132244110107,
1507
+ "eval_runtime": 37.0403,
1508
+ "eval_samples_per_second": 31.695,
1509
+ "eval_steps_per_second": 3.969,
1510
+ "step": 1900
1511
+ },
1512
+ {
1513
+ "epoch": 3.25,
1514
+ "grad_norm": 1.6857343912124634,
1515
+ "learning_rate": 3.7478705281090294e-05,
1516
+ "loss": 0.04,
1517
+ "step": 1910
1518
+ },
1519
+ {
1520
+ "epoch": 3.27,
1521
+ "grad_norm": 1.2973403930664062,
1522
+ "learning_rate": 3.6626916524701875e-05,
1523
+ "loss": 0.0403,
1524
+ "step": 1920
1525
+ },
1526
+ {
1527
+ "epoch": 3.29,
1528
+ "grad_norm": 0.41860514879226685,
1529
+ "learning_rate": 3.577512776831346e-05,
1530
+ "loss": 0.0642,
1531
+ "step": 1930
1532
+ },
1533
+ {
1534
+ "epoch": 3.3,
1535
+ "grad_norm": 0.5436795353889465,
1536
+ "learning_rate": 3.492333901192504e-05,
1537
+ "loss": 0.0836,
1538
+ "step": 1940
1539
+ },
1540
+ {
1541
+ "epoch": 3.32,
1542
+ "grad_norm": 0.21996204555034637,
1543
+ "learning_rate": 3.407155025553663e-05,
1544
+ "loss": 0.0406,
1545
+ "step": 1950
1546
+ },
1547
+ {
1548
+ "epoch": 3.34,
1549
+ "grad_norm": 0.14845231175422668,
1550
+ "learning_rate": 3.321976149914821e-05,
1551
+ "loss": 0.0385,
1552
+ "step": 1960
1553
+ },
1554
+ {
1555
+ "epoch": 3.36,
1556
+ "grad_norm": 3.531405448913574,
1557
+ "learning_rate": 3.236797274275979e-05,
1558
+ "loss": 0.0824,
1559
+ "step": 1970
1560
+ },
1561
+ {
1562
+ "epoch": 3.37,
1563
+ "grad_norm": 0.07682117819786072,
1564
+ "learning_rate": 3.151618398637139e-05,
1565
+ "loss": 0.0717,
1566
+ "step": 1980
1567
+ },
1568
+ {
1569
+ "epoch": 3.39,
1570
+ "grad_norm": 0.07611515372991562,
1571
+ "learning_rate": 3.066439522998297e-05,
1572
+ "loss": 0.0572,
1573
+ "step": 1990
1574
+ },
1575
+ {
1576
+ "epoch": 3.41,
1577
+ "grad_norm": 0.6266534328460693,
1578
+ "learning_rate": 2.9812606473594552e-05,
1579
+ "loss": 0.0822,
1580
+ "step": 2000
1581
+ },
1582
+ {
1583
+ "epoch": 3.41,
1584
+ "eval_accuracy": 0.692504258943782,
1585
+ "eval_loss": 1.1826940774917603,
1586
+ "eval_runtime": 37.1369,
1587
+ "eval_samples_per_second": 31.613,
1588
+ "eval_steps_per_second": 3.958,
1589
+ "step": 2000
1590
+ },
1591
+ {
1592
+ "epoch": 3.42,
1593
+ "grad_norm": 0.1280030608177185,
1594
+ "learning_rate": 2.8960817717206136e-05,
1595
+ "loss": 0.0244,
1596
+ "step": 2010
1597
+ },
1598
+ {
1599
+ "epoch": 3.44,
1600
+ "grad_norm": 0.07406999170780182,
1601
+ "learning_rate": 2.8109028960817717e-05,
1602
+ "loss": 0.0574,
1603
+ "step": 2020
1604
+ },
1605
+ {
1606
+ "epoch": 3.46,
1607
+ "grad_norm": 5.587332248687744,
1608
+ "learning_rate": 2.72572402044293e-05,
1609
+ "loss": 0.0352,
1610
+ "step": 2030
1611
+ },
1612
+ {
1613
+ "epoch": 3.48,
1614
+ "grad_norm": 2.2010979652404785,
1615
+ "learning_rate": 2.6405451448040885e-05,
1616
+ "loss": 0.0789,
1617
+ "step": 2040
1618
+ },
1619
+ {
1620
+ "epoch": 3.49,
1621
+ "grad_norm": 2.9271368980407715,
1622
+ "learning_rate": 2.555366269165247e-05,
1623
+ "loss": 0.082,
1624
+ "step": 2050
1625
+ },
1626
+ {
1627
+ "epoch": 3.51,
1628
+ "grad_norm": 0.05890679359436035,
1629
+ "learning_rate": 2.4701873935264054e-05,
1630
+ "loss": 0.0769,
1631
+ "step": 2060
1632
+ },
1633
+ {
1634
+ "epoch": 3.53,
1635
+ "grad_norm": 0.7043523192405701,
1636
+ "learning_rate": 2.3850085178875638e-05,
1637
+ "loss": 0.0819,
1638
+ "step": 2070
1639
+ },
1640
+ {
1641
+ "epoch": 3.54,
1642
+ "grad_norm": 0.12047506123781204,
1643
+ "learning_rate": 2.2998296422487226e-05,
1644
+ "loss": 0.0195,
1645
+ "step": 2080
1646
+ },
1647
+ {
1648
+ "epoch": 3.56,
1649
+ "grad_norm": 0.1116802990436554,
1650
+ "learning_rate": 2.214650766609881e-05,
1651
+ "loss": 0.0159,
1652
+ "step": 2090
1653
+ },
1654
+ {
1655
+ "epoch": 3.58,
1656
+ "grad_norm": 0.09187493473291397,
1657
+ "learning_rate": 2.1294718909710394e-05,
1658
+ "loss": 0.0889,
1659
+ "step": 2100
1660
+ },
1661
+ {
1662
+ "epoch": 3.58,
1663
+ "eval_accuracy": 0.6933560477001703,
1664
+ "eval_loss": 1.2423571348190308,
1665
+ "eval_runtime": 37.3059,
1666
+ "eval_samples_per_second": 31.47,
1667
+ "eval_steps_per_second": 3.94,
1668
+ "step": 2100
1669
+ },
1670
+ {
1671
+ "epoch": 3.59,
1672
+ "grad_norm": 4.332376956939697,
1673
+ "learning_rate": 2.0442930153321975e-05,
1674
+ "loss": 0.0939,
1675
+ "step": 2110
1676
+ },
1677
+ {
1678
+ "epoch": 3.61,
1679
+ "grad_norm": 0.13916102051734924,
1680
+ "learning_rate": 1.9591141396933563e-05,
1681
+ "loss": 0.0933,
1682
+ "step": 2120
1683
+ },
1684
+ {
1685
+ "epoch": 3.63,
1686
+ "grad_norm": 7.690703392028809,
1687
+ "learning_rate": 1.8739352640545147e-05,
1688
+ "loss": 0.0496,
1689
+ "step": 2130
1690
+ },
1691
+ {
1692
+ "epoch": 3.65,
1693
+ "grad_norm": 2.5700595378875732,
1694
+ "learning_rate": 1.788756388415673e-05,
1695
+ "loss": 0.0782,
1696
+ "step": 2140
1697
+ },
1698
+ {
1699
+ "epoch": 3.66,
1700
+ "grad_norm": 0.20934216678142548,
1701
+ "learning_rate": 1.7035775127768315e-05,
1702
+ "loss": 0.0606,
1703
+ "step": 2150
1704
+ },
1705
+ {
1706
+ "epoch": 3.68,
1707
+ "grad_norm": 1.2959486246109009,
1708
+ "learning_rate": 1.6183986371379896e-05,
1709
+ "loss": 0.0601,
1710
+ "step": 2160
1711
+ },
1712
+ {
1713
+ "epoch": 3.7,
1714
+ "grad_norm": 0.2652721405029297,
1715
+ "learning_rate": 1.5332197614991484e-05,
1716
+ "loss": 0.062,
1717
+ "step": 2170
1718
+ },
1719
+ {
1720
+ "epoch": 3.71,
1721
+ "grad_norm": 0.48360127210617065,
1722
+ "learning_rate": 1.4480408858603068e-05,
1723
+ "loss": 0.054,
1724
+ "step": 2180
1725
+ },
1726
+ {
1727
+ "epoch": 3.73,
1728
+ "grad_norm": 3.1118693351745605,
1729
+ "learning_rate": 1.362862010221465e-05,
1730
+ "loss": 0.0989,
1731
+ "step": 2190
1732
+ },
1733
+ {
1734
+ "epoch": 3.75,
1735
+ "grad_norm": 0.9077383279800415,
1736
+ "learning_rate": 1.2776831345826235e-05,
1737
+ "loss": 0.0855,
1738
+ "step": 2200
1739
+ },
1740
+ {
1741
+ "epoch": 3.75,
1742
+ "eval_accuracy": 0.6899488926746167,
1743
+ "eval_loss": 1.2667156457901,
1744
+ "eval_runtime": 36.8511,
1745
+ "eval_samples_per_second": 31.858,
1746
+ "eval_steps_per_second": 3.989,
1747
+ "step": 2200
1748
+ },
1749
+ {
1750
+ "epoch": 3.76,
1751
+ "grad_norm": 0.13304296135902405,
1752
+ "learning_rate": 1.1925042589437819e-05,
1753
+ "loss": 0.0675,
1754
+ "step": 2210
1755
+ },
1756
+ {
1757
+ "epoch": 3.78,
1758
+ "grad_norm": 1.3241567611694336,
1759
+ "learning_rate": 1.1073253833049405e-05,
1760
+ "loss": 0.0753,
1761
+ "step": 2220
1762
+ },
1763
+ {
1764
+ "epoch": 3.8,
1765
+ "grad_norm": 0.2818525731563568,
1766
+ "learning_rate": 1.0221465076660987e-05,
1767
+ "loss": 0.0998,
1768
+ "step": 2230
1769
+ },
1770
+ {
1771
+ "epoch": 3.82,
1772
+ "grad_norm": 7.136697292327881,
1773
+ "learning_rate": 9.369676320272573e-06,
1774
+ "loss": 0.0314,
1775
+ "step": 2240
1776
+ },
1777
+ {
1778
+ "epoch": 3.83,
1779
+ "grad_norm": 1.372044324874878,
1780
+ "learning_rate": 8.517887563884158e-06,
1781
+ "loss": 0.0768,
1782
+ "step": 2250
1783
+ },
1784
+ {
1785
+ "epoch": 3.85,
1786
+ "grad_norm": 6.264348983764648,
1787
+ "learning_rate": 7.666098807495742e-06,
1788
+ "loss": 0.1516,
1789
+ "step": 2260
1790
+ },
1791
+ {
1792
+ "epoch": 3.87,
1793
+ "grad_norm": 0.1342085599899292,
1794
+ "learning_rate": 6.814310051107325e-06,
1795
+ "loss": 0.0812,
1796
+ "step": 2270
1797
+ },
1798
+ {
1799
+ "epoch": 3.88,
1800
+ "grad_norm": 0.7664629220962524,
1801
+ "learning_rate": 5.9625212947189095e-06,
1802
+ "loss": 0.0474,
1803
+ "step": 2280
1804
+ },
1805
+ {
1806
+ "epoch": 3.9,
1807
+ "grad_norm": 4.264090538024902,
1808
+ "learning_rate": 5.110732538330494e-06,
1809
+ "loss": 0.0903,
1810
+ "step": 2290
1811
+ },
1812
+ {
1813
+ "epoch": 3.92,
1814
+ "grad_norm": 0.07316776365041733,
1815
+ "learning_rate": 4.258943781942079e-06,
1816
+ "loss": 0.0682,
1817
+ "step": 2300
1818
+ },
1819
+ {
1820
+ "epoch": 3.92,
1821
+ "eval_accuracy": 0.6950596252129472,
1822
+ "eval_loss": 1.2470241785049438,
1823
+ "eval_runtime": 37.0027,
1824
+ "eval_samples_per_second": 31.727,
1825
+ "eval_steps_per_second": 3.973,
1826
+ "step": 2300
1827
+ },
1828
+ {
1829
+ "epoch": 3.94,
1830
+ "grad_norm": 1.477973222732544,
1831
+ "learning_rate": 3.4071550255536626e-06,
1832
+ "loss": 0.0587,
1833
+ "step": 2310
1834
+ },
1835
+ {
1836
+ "epoch": 3.95,
1837
+ "grad_norm": 1.249779224395752,
1838
+ "learning_rate": 2.555366269165247e-06,
1839
+ "loss": 0.0546,
1840
+ "step": 2320
1841
+ },
1842
+ {
1843
+ "epoch": 3.97,
1844
+ "grad_norm": 1.9763495922088623,
1845
+ "learning_rate": 1.7035775127768313e-06,
1846
+ "loss": 0.0539,
1847
+ "step": 2330
1848
+ },
1849
+ {
1850
+ "epoch": 3.99,
1851
+ "grad_norm": 0.11824575811624527,
1852
+ "learning_rate": 8.517887563884157e-07,
1853
+ "loss": 0.0322,
1854
+ "step": 2340
1855
+ },
1856
  {
1857
  "epoch": 4.0,
1858
+ "step": 2348,
1859
+ "total_flos": 2.910419581971751e+18,
1860
+ "train_loss": 0.4888155373286145,
1861
+ "train_runtime": 2894.9609,
1862
+ "train_samples_per_second": 12.973,
1863
+ "train_steps_per_second": 0.811
1864
  }
1865
  ],
1866
  "logging_steps": 10,
1867
+ "max_steps": 2348,
1868
  "num_input_tokens_seen": 0,
1869
  "num_train_epochs": 4,
1870
  "save_steps": 100,
1871
+ "total_flos": 2.910419581971751e+18,
1872
  "train_batch_size": 16,
1873
  "trial_name": null,
1874
  "trial_params": null