edlee123 committed on
Commit
f50a312
1 Parent(s): fa09935

End of training

Browse files
README.md CHANGED
@@ -4,7 +4,7 @@ base_model: BridgeTower/bridgetower-large-itm-mlm-itc
4
  tags:
5
  - generated_from_trainer
6
  datasets:
7
- - newyorker_caption_contest
8
  model-index:
9
  - name: bridgetower
10
  results: []
@@ -15,7 +15,12 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # bridgetower
17
 
18
- This model is a fine-tuned version of [BridgeTower/bridgetower-large-itm-mlm-itc](https://huggingface.co/BridgeTower/bridgetower-large-itm-mlm-itc) on the newyorker_caption_contest dataset.
 
 
 
 
 
19
 
20
  ## Model description
21
 
 
4
  tags:
5
  - generated_from_trainer
6
  datasets:
7
+ - jmhessel/newyorker_caption_contest
8
  model-index:
9
  - name: bridgetower
10
  results: []
 
15
 
16
  # bridgetower
17
 
18
+ This model is a fine-tuned version of [BridgeTower/bridgetower-large-itm-mlm-itc](https://huggingface.co/BridgeTower/bridgetower-large-itm-mlm-itc) on the jmhessel/newyorker_caption_contest matching dataset.
19
+ It achieves the following results on the evaluation set:
20
+ - Loss: 0.1205
21
+ - Memory Allocated (gb): 51.27
22
+ - Max Memory Allocated (gb): 63.75
23
+ - Total Memory Available (gb): 94.62
24
 
25
  ## Model description
26
 
all_results.json CHANGED
@@ -1,15 +1,15 @@
1
  {
2
  "epoch": 5.0,
3
- "eval_loss": 0.12840959429740906,
4
- "eval_runtime": 7.7898,
5
- "eval_samples_per_second": 66.741,
6
- "eval_steps_per_second": 4.171,
7
- "max_memory_allocated (GB)": 57.18,
8
  "memory_allocated (GB)": 51.27,
9
  "total_flos": 3.0598946525952e+16,
10
  "total_memory_available (GB)": 94.62,
11
- "train_loss": 0.06098026679486644,
12
- "train_runtime": 1192.2443,
13
- "train_samples_per_second": 46.607,
14
- "train_steps_per_second": 1.166
15
  }
 
1
  {
2
  "epoch": 5.0,
3
+ "eval_loss": 0.12054020911455154,
4
+ "eval_runtime": 14.1795,
5
+ "eval_samples_per_second": 70.009,
6
+ "eval_steps_per_second": 4.376,
7
+ "max_memory_allocated (GB)": 63.75,
8
  "memory_allocated (GB)": 51.27,
9
  "total_flos": 3.0598946525952e+16,
10
  "total_memory_available (GB)": 94.62,
11
+ "train_loss": 0.06080986156755564,
12
+ "train_runtime": 1168.6251,
13
+ "train_samples_per_second": 48.37,
14
+ "train_steps_per_second": 1.21
15
  }
runs/Oct16_17-52-02_workload-ai-workshop/events.out.tfevents.1729102480.workload-ai-workshop.9959.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea8fd3630e67b28b9f38b2a483c0776dc52d6135a384177624a014e952925e08
3
+ size 998
test_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "epoch": 5.0,
3
- "eval_loss": 0.12840959429740906,
4
- "eval_runtime": 7.7898,
5
- "eval_samples_per_second": 66.741,
6
- "eval_steps_per_second": 4.171,
7
- "max_memory_allocated (GB)": 57.18,
8
  "memory_allocated (GB)": 51.27,
9
  "total_memory_available (GB)": 94.62
10
  }
 
1
  {
2
  "epoch": 5.0,
3
+ "eval_loss": 0.12054020911455154,
4
+ "eval_runtime": 14.1795,
5
+ "eval_samples_per_second": 70.009,
6
+ "eval_steps_per_second": 4.376,
7
+ "max_memory_allocated (GB)": 63.75,
8
  "memory_allocated (GB)": 51.27,
9
  "total_memory_available (GB)": 94.62
10
  }
train_results.json CHANGED
@@ -1,11 +1,11 @@
1
  {
2
  "epoch": 5.0,
3
- "max_memory_allocated (GB)": 57.18,
4
  "memory_allocated (GB)": 50.57,
5
  "total_flos": 3.0598946525952e+16,
6
  "total_memory_available (GB)": 94.62,
7
- "train_loss": 0.06098026679486644,
8
- "train_runtime": 1192.2443,
9
- "train_samples_per_second": 46.607,
10
- "train_steps_per_second": 1.166
11
  }
 
1
  {
2
  "epoch": 5.0,
3
+ "max_memory_allocated (GB)": 63.75,
4
  "memory_allocated (GB)": 50.57,
5
  "total_flos": 3.0598946525952e+16,
6
  "total_memory_available (GB)": 94.62,
7
+ "train_loss": 0.06080986156755564,
8
+ "train_runtime": 1168.6251,
9
+ "train_samples_per_second": 48.37,
10
+ "train_steps_per_second": 1.21
11
  }
trainer_state.json CHANGED
@@ -10,9 +10,9 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.04081632653061224,
13
- "grad_norm": 16.731555938720703,
14
  "learning_rate": 9.918367346938776e-06,
15
- "loss": 0.2616,
16
  "max_memory_allocated (GB)": 57.18,
17
  "memory_allocated (GB)": 50.57,
18
  "step": 10,
@@ -20,9 +20,9 @@
20
  },
21
  {
22
  "epoch": 0.08163265306122448,
23
- "grad_norm": 7.052234649658203,
24
  "learning_rate": 9.836734693877552e-06,
25
- "loss": 0.1555,
26
  "max_memory_allocated (GB)": 57.18,
27
  "memory_allocated (GB)": 50.57,
28
  "step": 20,
@@ -30,9 +30,9 @@
30
  },
31
  {
32
  "epoch": 0.12244897959183673,
33
- "grad_norm": 6.5298075675964355,
34
  "learning_rate": 9.755102040816327e-06,
35
- "loss": 0.1251,
36
  "max_memory_allocated (GB)": 57.18,
37
  "memory_allocated (GB)": 50.57,
38
  "step": 30,
@@ -40,9 +40,9 @@
40
  },
41
  {
42
  "epoch": 0.16326530612244897,
43
- "grad_norm": 4.405805587768555,
44
  "learning_rate": 9.673469387755103e-06,
45
- "loss": 0.1102,
46
  "max_memory_allocated (GB)": 57.18,
47
  "memory_allocated (GB)": 50.57,
48
  "step": 40,
@@ -50,9 +50,9 @@
50
  },
51
  {
52
  "epoch": 0.20408163265306123,
53
- "grad_norm": 4.870044708251953,
54
  "learning_rate": 9.591836734693878e-06,
55
- "loss": 0.1232,
56
  "max_memory_allocated (GB)": 57.18,
57
  "memory_allocated (GB)": 50.57,
58
  "step": 50,
@@ -60,7 +60,7 @@
60
  },
61
  {
62
  "epoch": 0.24489795918367346,
63
- "grad_norm": 1.6433866024017334,
64
  "learning_rate": 9.510204081632653e-06,
65
  "loss": 0.0797,
66
  "max_memory_allocated (GB)": 57.18,
@@ -70,9 +70,9 @@
70
  },
71
  {
72
  "epoch": 0.2857142857142857,
73
- "grad_norm": 4.2432074546813965,
74
  "learning_rate": 9.42857142857143e-06,
75
- "loss": 0.1031,
76
  "max_memory_allocated (GB)": 57.18,
77
  "memory_allocated (GB)": 50.57,
78
  "step": 70,
@@ -80,9 +80,9 @@
80
  },
81
  {
82
  "epoch": 0.32653061224489793,
83
- "grad_norm": 2.0352487564086914,
84
  "learning_rate": 9.346938775510204e-06,
85
- "loss": 0.1115,
86
  "max_memory_allocated (GB)": 57.18,
87
  "memory_allocated (GB)": 50.57,
88
  "step": 80,
@@ -90,9 +90,9 @@
90
  },
91
  {
92
  "epoch": 0.3673469387755102,
93
- "grad_norm": 4.201560020446777,
94
  "learning_rate": 9.26530612244898e-06,
95
- "loss": 0.0817,
96
  "max_memory_allocated (GB)": 57.18,
97
  "memory_allocated (GB)": 50.57,
98
  "step": 90,
@@ -100,9 +100,9 @@
100
  },
101
  {
102
  "epoch": 0.40816326530612246,
103
- "grad_norm": 3.618368625640869,
104
  "learning_rate": 9.183673469387756e-06,
105
- "loss": 0.0768,
106
  "max_memory_allocated (GB)": 57.18,
107
  "memory_allocated (GB)": 50.57,
108
  "step": 100,
@@ -110,9 +110,9 @@
110
  },
111
  {
112
  "epoch": 0.4489795918367347,
113
- "grad_norm": 4.793916702270508,
114
  "learning_rate": 9.102040816326532e-06,
115
- "loss": 0.071,
116
  "max_memory_allocated (GB)": 57.18,
117
  "memory_allocated (GB)": 50.57,
118
  "step": 110,
@@ -120,9 +120,9 @@
120
  },
121
  {
122
  "epoch": 0.4897959183673469,
123
- "grad_norm": 2.3223495483398438,
124
  "learning_rate": 9.020408163265307e-06,
125
- "loss": 0.0707,
126
  "max_memory_allocated (GB)": 57.18,
127
  "memory_allocated (GB)": 50.57,
128
  "step": 120,
@@ -130,9 +130,9 @@
130
  },
131
  {
132
  "epoch": 0.5306122448979592,
133
- "grad_norm": 3.5389153957366943,
134
  "learning_rate": 8.938775510204082e-06,
135
- "loss": 0.0599,
136
  "max_memory_allocated (GB)": 57.18,
137
  "memory_allocated (GB)": 50.57,
138
  "step": 130,
@@ -140,9 +140,9 @@
140
  },
141
  {
142
  "epoch": 0.5714285714285714,
143
- "grad_norm": 1.586653232574463,
144
  "learning_rate": 8.857142857142858e-06,
145
- "loss": 0.0491,
146
  "max_memory_allocated (GB)": 57.18,
147
  "memory_allocated (GB)": 50.57,
148
  "step": 140,
@@ -150,9 +150,9 @@
150
  },
151
  {
152
  "epoch": 0.6122448979591837,
153
- "grad_norm": 1.5236841440200806,
154
  "learning_rate": 8.775510204081633e-06,
155
- "loss": 0.0632,
156
  "max_memory_allocated (GB)": 57.18,
157
  "memory_allocated (GB)": 50.57,
158
  "step": 150,
@@ -160,9 +160,9 @@
160
  },
161
  {
162
  "epoch": 0.6530612244897959,
163
- "grad_norm": 2.752020835876465,
164
  "learning_rate": 8.69387755102041e-06,
165
- "loss": 0.0722,
166
  "max_memory_allocated (GB)": 57.18,
167
  "memory_allocated (GB)": 50.57,
168
  "step": 160,
@@ -170,9 +170,9 @@
170
  },
171
  {
172
  "epoch": 0.6938775510204082,
173
- "grad_norm": 7.606927394866943,
174
  "learning_rate": 8.612244897959184e-06,
175
- "loss": 0.0756,
176
  "max_memory_allocated (GB)": 57.18,
177
  "memory_allocated (GB)": 50.57,
178
  "step": 170,
@@ -180,9 +180,9 @@
180
  },
181
  {
182
  "epoch": 0.7346938775510204,
183
- "grad_norm": 1.5622702836990356,
184
  "learning_rate": 8.530612244897961e-06,
185
- "loss": 0.0617,
186
  "max_memory_allocated (GB)": 57.18,
187
  "memory_allocated (GB)": 50.57,
188
  "step": 180,
@@ -190,9 +190,9 @@
190
  },
191
  {
192
  "epoch": 0.7755102040816326,
193
- "grad_norm": 0.9614956378936768,
194
  "learning_rate": 8.448979591836736e-06,
195
- "loss": 0.0572,
196
  "max_memory_allocated (GB)": 57.18,
197
  "memory_allocated (GB)": 50.57,
198
  "step": 190,
@@ -200,9 +200,9 @@
200
  },
201
  {
202
  "epoch": 0.8163265306122449,
203
- "grad_norm": 0.7814755439758301,
204
  "learning_rate": 8.36734693877551e-06,
205
- "loss": 0.0636,
206
  "max_memory_allocated (GB)": 57.18,
207
  "memory_allocated (GB)": 50.57,
208
  "step": 200,
@@ -210,9 +210,9 @@
210
  },
211
  {
212
  "epoch": 0.8571428571428571,
213
- "grad_norm": 1.352851390838623,
214
  "learning_rate": 8.285714285714287e-06,
215
- "loss": 0.0648,
216
  "max_memory_allocated (GB)": 57.18,
217
  "memory_allocated (GB)": 50.57,
218
  "step": 210,
@@ -220,9 +220,9 @@
220
  },
221
  {
222
  "epoch": 0.8979591836734694,
223
- "grad_norm": 1.6814969778060913,
224
  "learning_rate": 8.204081632653062e-06,
225
- "loss": 0.0604,
226
  "max_memory_allocated (GB)": 57.18,
227
  "memory_allocated (GB)": 50.57,
228
  "step": 220,
@@ -230,9 +230,9 @@
230
  },
231
  {
232
  "epoch": 0.9387755102040817,
233
- "grad_norm": 0.859993040561676,
234
  "learning_rate": 8.122448979591837e-06,
235
- "loss": 0.0549,
236
  "max_memory_allocated (GB)": 57.18,
237
  "memory_allocated (GB)": 50.57,
238
  "step": 230,
@@ -240,9 +240,9 @@
240
  },
241
  {
242
  "epoch": 0.9795918367346939,
243
- "grad_norm": 0.6439819931983948,
244
  "learning_rate": 8.040816326530613e-06,
245
- "loss": 0.0493,
246
  "max_memory_allocated (GB)": 57.18,
247
  "memory_allocated (GB)": 50.57,
248
  "step": 240,
@@ -250,9 +250,9 @@
250
  },
251
  {
252
  "epoch": 1.0204081632653061,
253
- "grad_norm": 0.8465150594711304,
254
  "learning_rate": 7.959183673469388e-06,
255
- "loss": 0.0624,
256
  "max_memory_allocated (GB)": 57.18,
257
  "memory_allocated (GB)": 50.57,
258
  "step": 250,
@@ -260,9 +260,9 @@
260
  },
261
  {
262
  "epoch": 1.0612244897959184,
263
- "grad_norm": 1.0257333517074585,
264
  "learning_rate": 7.877551020408164e-06,
265
- "loss": 0.056,
266
  "max_memory_allocated (GB)": 57.18,
267
  "memory_allocated (GB)": 50.57,
268
  "step": 260,
@@ -270,9 +270,9 @@
270
  },
271
  {
272
  "epoch": 1.1020408163265305,
273
- "grad_norm": 2.619938850402832,
274
  "learning_rate": 7.79591836734694e-06,
275
- "loss": 0.0648,
276
  "max_memory_allocated (GB)": 57.18,
277
  "memory_allocated (GB)": 50.57,
278
  "step": 270,
@@ -280,9 +280,9 @@
280
  },
281
  {
282
  "epoch": 1.1428571428571428,
283
- "grad_norm": 0.4946042001247406,
284
  "learning_rate": 7.714285714285716e-06,
285
- "loss": 0.0586,
286
  "max_memory_allocated (GB)": 57.18,
287
  "memory_allocated (GB)": 50.57,
288
  "step": 280,
@@ -290,9 +290,9 @@
290
  },
291
  {
292
  "epoch": 1.183673469387755,
293
- "grad_norm": 1.0154733657836914,
294
  "learning_rate": 7.63265306122449e-06,
295
- "loss": 0.0505,
296
  "max_memory_allocated (GB)": 57.18,
297
  "memory_allocated (GB)": 50.57,
298
  "step": 290,
@@ -300,9 +300,9 @@
300
  },
301
  {
302
  "epoch": 1.2244897959183674,
303
- "grad_norm": 1.0347952842712402,
304
  "learning_rate": 7.551020408163265e-06,
305
- "loss": 0.0646,
306
  "max_memory_allocated (GB)": 57.18,
307
  "memory_allocated (GB)": 50.57,
308
  "step": 300,
@@ -310,9 +310,9 @@
310
  },
311
  {
312
  "epoch": 1.2653061224489797,
313
- "grad_norm": 0.7844366431236267,
314
  "learning_rate": 7.469387755102041e-06,
315
- "loss": 0.0676,
316
  "max_memory_allocated (GB)": 57.18,
317
  "memory_allocated (GB)": 50.57,
318
  "step": 310,
@@ -320,9 +320,9 @@
320
  },
321
  {
322
  "epoch": 1.306122448979592,
323
- "grad_norm": 1.1971337795257568,
324
  "learning_rate": 7.387755102040817e-06,
325
- "loss": 0.0499,
326
  "max_memory_allocated (GB)": 57.18,
327
  "memory_allocated (GB)": 50.57,
328
  "step": 320,
@@ -330,9 +330,9 @@
330
  },
331
  {
332
  "epoch": 1.346938775510204,
333
- "grad_norm": 0.6674404740333557,
334
  "learning_rate": 7.306122448979592e-06,
335
- "loss": 0.0602,
336
  "max_memory_allocated (GB)": 57.18,
337
  "memory_allocated (GB)": 50.57,
338
  "step": 330,
@@ -340,9 +340,9 @@
340
  },
341
  {
342
  "epoch": 1.3877551020408163,
343
- "grad_norm": 1.511208415031433,
344
  "learning_rate": 7.224489795918368e-06,
345
- "loss": 0.0547,
346
  "max_memory_allocated (GB)": 57.18,
347
  "memory_allocated (GB)": 50.57,
348
  "step": 340,
@@ -350,9 +350,9 @@
350
  },
351
  {
352
  "epoch": 1.4285714285714286,
353
- "grad_norm": 0.5328841209411621,
354
  "learning_rate": 7.1428571428571436e-06,
355
- "loss": 0.0486,
356
  "max_memory_allocated (GB)": 57.18,
357
  "memory_allocated (GB)": 50.57,
358
  "step": 350,
@@ -360,9 +360,9 @@
360
  },
361
  {
362
  "epoch": 1.469387755102041,
363
- "grad_norm": 1.464439034461975,
364
  "learning_rate": 7.061224489795919e-06,
365
- "loss": 0.0464,
366
  "max_memory_allocated (GB)": 57.18,
367
  "memory_allocated (GB)": 50.57,
368
  "step": 360,
@@ -370,9 +370,9 @@
370
  },
371
  {
372
  "epoch": 1.510204081632653,
373
- "grad_norm": 0.834863543510437,
374
  "learning_rate": 6.979591836734695e-06,
375
- "loss": 0.0591,
376
  "max_memory_allocated (GB)": 57.18,
377
  "memory_allocated (GB)": 50.57,
378
  "step": 370,
@@ -380,9 +380,9 @@
380
  },
381
  {
382
  "epoch": 1.5510204081632653,
383
- "grad_norm": 0.5399609208106995,
384
  "learning_rate": 6.8979591836734705e-06,
385
- "loss": 0.0464,
386
  "max_memory_allocated (GB)": 57.18,
387
  "memory_allocated (GB)": 50.57,
388
  "step": 380,
@@ -390,9 +390,9 @@
390
  },
391
  {
392
  "epoch": 1.5918367346938775,
393
- "grad_norm": 0.8577661514282227,
394
  "learning_rate": 6.816326530612245e-06,
395
- "loss": 0.0654,
396
  "max_memory_allocated (GB)": 57.18,
397
  "memory_allocated (GB)": 50.57,
398
  "step": 390,
@@ -400,7 +400,7 @@
400
  },
401
  {
402
  "epoch": 1.6326530612244898,
403
- "grad_norm": 0.5057955384254456,
404
  "learning_rate": 6.734693877551021e-06,
405
  "loss": 0.0609,
406
  "max_memory_allocated (GB)": 57.18,
@@ -410,9 +410,9 @@
410
  },
411
  {
412
  "epoch": 1.6734693877551021,
413
- "grad_norm": 0.9135333895683289,
414
  "learning_rate": 6.653061224489797e-06,
415
- "loss": 0.0607,
416
  "max_memory_allocated (GB)": 57.18,
417
  "memory_allocated (GB)": 50.57,
418
  "step": 410,
@@ -420,9 +420,9 @@
420
  },
421
  {
422
  "epoch": 1.7142857142857144,
423
- "grad_norm": 2.9697179794311523,
424
  "learning_rate": 6.571428571428572e-06,
425
- "loss": 0.054,
426
  "max_memory_allocated (GB)": 57.18,
427
  "memory_allocated (GB)": 50.57,
428
  "step": 420,
@@ -430,9 +430,9 @@
430
  },
431
  {
432
  "epoch": 1.7551020408163265,
433
- "grad_norm": 0.3473312556743622,
434
  "learning_rate": 6.489795918367348e-06,
435
- "loss": 0.0685,
436
  "max_memory_allocated (GB)": 57.18,
437
  "memory_allocated (GB)": 50.57,
438
  "step": 430,
@@ -440,9 +440,9 @@
440
  },
441
  {
442
  "epoch": 1.7959183673469388,
443
- "grad_norm": 1.4528335332870483,
444
  "learning_rate": 6.408163265306124e-06,
445
- "loss": 0.0611,
446
  "max_memory_allocated (GB)": 57.18,
447
  "memory_allocated (GB)": 50.57,
448
  "step": 440,
@@ -450,9 +450,9 @@
450
  },
451
  {
452
  "epoch": 1.836734693877551,
453
- "grad_norm": 0.48578280210494995,
454
  "learning_rate": 6.326530612244899e-06,
455
- "loss": 0.0438,
456
  "max_memory_allocated (GB)": 57.18,
457
  "memory_allocated (GB)": 50.57,
458
  "step": 450,
@@ -460,9 +460,9 @@
460
  },
461
  {
462
  "epoch": 1.8775510204081631,
463
- "grad_norm": 0.3472760021686554,
464
  "learning_rate": 6.244897959183675e-06,
465
- "loss": 0.0544,
466
  "max_memory_allocated (GB)": 57.18,
467
  "memory_allocated (GB)": 50.57,
468
  "step": 460,
@@ -470,9 +470,9 @@
470
  },
471
  {
472
  "epoch": 1.9183673469387754,
473
- "grad_norm": 1.0984327793121338,
474
  "learning_rate": 6.163265306122449e-06,
475
- "loss": 0.0438,
476
  "max_memory_allocated (GB)": 57.18,
477
  "memory_allocated (GB)": 50.57,
478
  "step": 470,
@@ -480,9 +480,9 @@
480
  },
481
  {
482
  "epoch": 1.9591836734693877,
483
- "grad_norm": 0.20147933065891266,
484
  "learning_rate": 6.0816326530612245e-06,
485
- "loss": 0.0518,
486
  "max_memory_allocated (GB)": 57.18,
487
  "memory_allocated (GB)": 50.57,
488
  "step": 480,
@@ -490,9 +490,9 @@
490
  },
491
  {
492
  "epoch": 2.0,
493
- "grad_norm": 1.1583309173583984,
494
  "learning_rate": 6e-06,
495
- "loss": 0.0637,
496
  "max_memory_allocated (GB)": 57.18,
497
  "memory_allocated (GB)": 50.57,
498
  "step": 490,
@@ -500,9 +500,9 @@
500
  },
501
  {
502
  "epoch": 2.0408163265306123,
503
- "grad_norm": 0.6601622104644775,
504
  "learning_rate": 5.918367346938776e-06,
505
- "loss": 0.0596,
506
  "max_memory_allocated (GB)": 57.18,
507
  "memory_allocated (GB)": 50.57,
508
  "step": 500,
@@ -510,9 +510,9 @@
510
  },
511
  {
512
  "epoch": 2.0816326530612246,
513
- "grad_norm": 0.5227305293083191,
514
  "learning_rate": 5.8367346938775515e-06,
515
- "loss": 0.0493,
516
  "max_memory_allocated (GB)": 57.18,
517
  "memory_allocated (GB)": 50.57,
518
  "step": 510,
@@ -520,9 +520,9 @@
520
  },
521
  {
522
  "epoch": 2.122448979591837,
523
- "grad_norm": 0.8996191620826721,
524
  "learning_rate": 5.755102040816327e-06,
525
- "loss": 0.0461,
526
  "max_memory_allocated (GB)": 57.18,
527
  "memory_allocated (GB)": 50.57,
528
  "step": 520,
@@ -530,9 +530,9 @@
530
  },
531
  {
532
  "epoch": 2.163265306122449,
533
- "grad_norm": 1.0684189796447754,
534
  "learning_rate": 5.673469387755103e-06,
535
- "loss": 0.0629,
536
  "max_memory_allocated (GB)": 57.18,
537
  "memory_allocated (GB)": 50.57,
538
  "step": 530,
@@ -540,9 +540,9 @@
540
  },
541
  {
542
  "epoch": 2.204081632653061,
543
- "grad_norm": 0.5558530688285828,
544
  "learning_rate": 5.591836734693878e-06,
545
- "loss": 0.0581,
546
  "max_memory_allocated (GB)": 57.18,
547
  "memory_allocated (GB)": 50.57,
548
  "step": 540,
@@ -550,9 +550,9 @@
550
  },
551
  {
552
  "epoch": 2.2448979591836733,
553
- "grad_norm": 1.1996757984161377,
554
  "learning_rate": 5.510204081632653e-06,
555
- "loss": 0.0626,
556
  "max_memory_allocated (GB)": 57.18,
557
  "memory_allocated (GB)": 50.57,
558
  "step": 550,
@@ -560,9 +560,9 @@
560
  },
561
  {
562
  "epoch": 2.2857142857142856,
563
- "grad_norm": 1.2928632497787476,
564
  "learning_rate": 5.428571428571429e-06,
565
- "loss": 0.0575,
566
  "max_memory_allocated (GB)": 57.18,
567
  "memory_allocated (GB)": 50.57,
568
  "step": 560,
@@ -570,9 +570,9 @@
570
  },
571
  {
572
  "epoch": 2.326530612244898,
573
- "grad_norm": 0.7934871912002563,
574
  "learning_rate": 5.3469387755102045e-06,
575
- "loss": 0.0577,
576
  "max_memory_allocated (GB)": 57.18,
577
  "memory_allocated (GB)": 50.57,
578
  "step": 570,
@@ -580,9 +580,9 @@
580
  },
581
  {
582
  "epoch": 2.36734693877551,
583
- "grad_norm": 3.946485757827759,
584
  "learning_rate": 5.26530612244898e-06,
585
- "loss": 0.0663,
586
  "max_memory_allocated (GB)": 57.18,
587
  "memory_allocated (GB)": 50.57,
588
  "step": 580,
@@ -590,9 +590,9 @@
590
  },
591
  {
592
  "epoch": 2.4081632653061225,
593
- "grad_norm": 0.43567588925361633,
594
  "learning_rate": 5.183673469387756e-06,
595
- "loss": 0.0539,
596
  "max_memory_allocated (GB)": 57.18,
597
  "memory_allocated (GB)": 50.57,
598
  "step": 590,
@@ -600,9 +600,9 @@
600
  },
601
  {
602
  "epoch": 2.4489795918367347,
603
- "grad_norm": 0.5725533962249756,
604
  "learning_rate": 5.1020408163265315e-06,
605
- "loss": 0.0438,
606
  "max_memory_allocated (GB)": 57.18,
607
  "memory_allocated (GB)": 50.57,
608
  "step": 600,
@@ -610,9 +610,9 @@
610
  },
611
  {
612
  "epoch": 2.489795918367347,
613
- "grad_norm": 0.44328320026397705,
614
  "learning_rate": 5.020408163265307e-06,
615
- "loss": 0.041,
616
  "max_memory_allocated (GB)": 57.18,
617
  "memory_allocated (GB)": 50.57,
618
  "step": 610,
@@ -620,9 +620,9 @@
620
  },
621
  {
622
  "epoch": 2.5306122448979593,
623
- "grad_norm": 1.338100790977478,
624
  "learning_rate": 4.938775510204082e-06,
625
- "loss": 0.0424,
626
  "max_memory_allocated (GB)": 57.18,
627
  "memory_allocated (GB)": 50.57,
628
  "step": 620,
@@ -630,9 +630,9 @@
630
  },
631
  {
632
  "epoch": 2.571428571428571,
633
- "grad_norm": 0.92643803358078,
634
  "learning_rate": 4.857142857142858e-06,
635
- "loss": 0.0811,
636
  "max_memory_allocated (GB)": 57.18,
637
  "memory_allocated (GB)": 50.57,
638
  "step": 630,
@@ -640,9 +640,9 @@
640
  },
641
  {
642
  "epoch": 2.612244897959184,
643
- "grad_norm": 1.1147398948669434,
644
  "learning_rate": 4.775510204081633e-06,
645
- "loss": 0.0492,
646
  "max_memory_allocated (GB)": 57.18,
647
  "memory_allocated (GB)": 50.57,
648
  "step": 640,
@@ -650,9 +650,9 @@
650
  },
651
  {
652
  "epoch": 2.6530612244897958,
653
- "grad_norm": 0.6104307174682617,
654
  "learning_rate": 4.693877551020409e-06,
655
- "loss": 0.0468,
656
  "max_memory_allocated (GB)": 57.18,
657
  "memory_allocated (GB)": 50.57,
658
  "step": 650,
@@ -660,9 +660,9 @@
660
  },
661
  {
662
  "epoch": 2.693877551020408,
663
- "grad_norm": 0.9826134443283081,
664
  "learning_rate": 4.612244897959184e-06,
665
- "loss": 0.0471,
666
  "max_memory_allocated (GB)": 57.18,
667
  "memory_allocated (GB)": 50.57,
668
  "step": 660,
@@ -670,9 +670,9 @@
670
  },
671
  {
672
  "epoch": 2.7346938775510203,
673
- "grad_norm": 0.7680672407150269,
674
  "learning_rate": 4.530612244897959e-06,
675
- "loss": 0.0858,
676
  "max_memory_allocated (GB)": 57.18,
677
  "memory_allocated (GB)": 50.57,
678
  "step": 670,
@@ -680,9 +680,9 @@
680
  },
681
  {
682
  "epoch": 2.7755102040816326,
683
- "grad_norm": 0.9682340025901794,
684
  "learning_rate": 4.448979591836735e-06,
685
- "loss": 0.0484,
686
  "max_memory_allocated (GB)": 57.18,
687
  "memory_allocated (GB)": 50.57,
688
  "step": 680,
@@ -690,9 +690,9 @@
690
  },
691
  {
692
  "epoch": 2.816326530612245,
693
- "grad_norm": 0.37712323665618896,
694
  "learning_rate": 4.367346938775511e-06,
695
- "loss": 0.0443,
696
  "max_memory_allocated (GB)": 57.18,
697
  "memory_allocated (GB)": 50.57,
698
  "step": 690,
@@ -700,9 +700,9 @@
700
  },
701
  {
702
  "epoch": 2.857142857142857,
703
- "grad_norm": 0.34970754384994507,
704
  "learning_rate": 4.2857142857142855e-06,
705
- "loss": 0.0434,
706
  "max_memory_allocated (GB)": 57.18,
707
  "memory_allocated (GB)": 50.57,
708
  "step": 700,
@@ -710,9 +710,9 @@
710
  },
711
  {
712
  "epoch": 2.8979591836734695,
713
- "grad_norm": 0.9949877262115479,
714
  "learning_rate": 4.204081632653061e-06,
715
- "loss": 0.0553,
716
  "max_memory_allocated (GB)": 57.18,
717
  "memory_allocated (GB)": 50.57,
718
  "step": 710,
@@ -720,9 +720,9 @@
720
  },
721
  {
722
  "epoch": 2.938775510204082,
723
- "grad_norm": 1.4436949491500854,
724
  "learning_rate": 4.122448979591837e-06,
725
- "loss": 0.0583,
726
  "max_memory_allocated (GB)": 57.18,
727
  "memory_allocated (GB)": 50.57,
728
  "step": 720,
@@ -730,9 +730,9 @@
730
  },
731
  {
732
  "epoch": 2.979591836734694,
733
- "grad_norm": 0.1619979739189148,
734
  "learning_rate": 4.040816326530612e-06,
735
- "loss": 0.0336,
736
  "max_memory_allocated (GB)": 57.18,
737
  "memory_allocated (GB)": 50.57,
738
  "step": 730,
@@ -740,7 +740,7 @@
740
  },
741
  {
742
  "epoch": 3.020408163265306,
743
- "grad_norm": 1.2799049615859985,
744
  "learning_rate": 3.959183673469388e-06,
745
  "loss": 0.0536,
746
  "max_memory_allocated (GB)": 57.18,
@@ -750,9 +750,9 @@
750
  },
751
  {
752
  "epoch": 3.061224489795918,
753
- "grad_norm": 0.5613189935684204,
754
  "learning_rate": 3.877551020408164e-06,
755
- "loss": 0.062,
756
  "max_memory_allocated (GB)": 57.18,
757
  "memory_allocated (GB)": 50.57,
758
  "step": 750,
@@ -760,9 +760,9 @@
760
  },
761
  {
762
  "epoch": 3.1020408163265305,
763
- "grad_norm": 0.827383279800415,
764
  "learning_rate": 3.795918367346939e-06,
765
- "loss": 0.0527,
766
  "max_memory_allocated (GB)": 57.18,
767
  "memory_allocated (GB)": 50.57,
768
  "step": 760,
@@ -770,9 +770,9 @@
770
  },
771
  {
772
  "epoch": 3.142857142857143,
773
- "grad_norm": 0.6983201503753662,
774
  "learning_rate": 3.7142857142857146e-06,
775
- "loss": 0.0691,
776
  "max_memory_allocated (GB)": 57.18,
777
  "memory_allocated (GB)": 50.57,
778
  "step": 770,
@@ -780,9 +780,9 @@
780
  },
781
  {
782
  "epoch": 3.183673469387755,
783
- "grad_norm": 1.0466923713684082,
784
  "learning_rate": 3.6326530612244903e-06,
785
- "loss": 0.0644,
786
  "max_memory_allocated (GB)": 57.18,
787
  "memory_allocated (GB)": 50.57,
788
  "step": 780,
@@ -790,9 +790,9 @@
790
  },
791
  {
792
  "epoch": 3.2244897959183674,
793
- "grad_norm": 0.3068871796131134,
794
  "learning_rate": 3.5510204081632655e-06,
795
- "loss": 0.0524,
796
  "max_memory_allocated (GB)": 57.18,
797
  "memory_allocated (GB)": 50.57,
798
  "step": 790,
@@ -800,9 +800,9 @@
800
  },
801
  {
802
  "epoch": 3.2653061224489797,
803
- "grad_norm": 0.40160393714904785,
804
  "learning_rate": 3.469387755102041e-06,
805
- "loss": 0.0434,
806
  "max_memory_allocated (GB)": 57.18,
807
  "memory_allocated (GB)": 50.57,
808
  "step": 800,
@@ -810,9 +810,9 @@
810
  },
811
  {
812
  "epoch": 3.306122448979592,
813
- "grad_norm": 0.880214512348175,
814
  "learning_rate": 3.3877551020408168e-06,
815
- "loss": 0.056,
816
  "max_memory_allocated (GB)": 57.18,
817
  "memory_allocated (GB)": 50.57,
818
  "step": 810,
@@ -820,9 +820,9 @@
820
  },
821
  {
822
  "epoch": 3.3469387755102042,
823
- "grad_norm": 0.9539953470230103,
824
  "learning_rate": 3.3061224489795924e-06,
825
- "loss": 0.0464,
826
  "max_memory_allocated (GB)": 57.18,
827
  "memory_allocated (GB)": 50.57,
828
  "step": 820,
@@ -830,9 +830,9 @@
830
  },
831
  {
832
  "epoch": 3.387755102040816,
833
- "grad_norm": 0.24522298574447632,
834
  "learning_rate": 3.2244897959183672e-06,
835
- "loss": 0.0485,
836
  "max_memory_allocated (GB)": 57.18,
837
  "memory_allocated (GB)": 50.57,
838
  "step": 830,
@@ -840,9 +840,9 @@
840
  },
841
  {
842
  "epoch": 3.4285714285714284,
843
- "grad_norm": 0.4946345388889313,
844
  "learning_rate": 3.142857142857143e-06,
845
- "loss": 0.0527,
846
  "max_memory_allocated (GB)": 57.18,
847
  "memory_allocated (GB)": 50.57,
848
  "step": 840,
@@ -850,9 +850,9 @@
850
  },
851
  {
852
  "epoch": 3.4693877551020407,
853
- "grad_norm": 0.4724675416946411,
854
  "learning_rate": 3.0612244897959185e-06,
855
- "loss": 0.0813,
856
  "max_memory_allocated (GB)": 57.18,
857
  "memory_allocated (GB)": 50.57,
858
  "step": 850,
@@ -860,9 +860,9 @@
860
  },
861
  {
862
  "epoch": 3.510204081632653,
863
- "grad_norm": 0.9907402396202087,
864
  "learning_rate": 2.979591836734694e-06,
865
- "loss": 0.0447,
866
  "max_memory_allocated (GB)": 57.18,
867
  "memory_allocated (GB)": 50.57,
868
  "step": 860,
@@ -870,9 +870,9 @@
870
  },
871
  {
872
  "epoch": 3.5510204081632653,
873
- "grad_norm": 0.19696560502052307,
874
  "learning_rate": 2.8979591836734694e-06,
875
- "loss": 0.0635,
876
  "max_memory_allocated (GB)": 57.18,
877
  "memory_allocated (GB)": 50.57,
878
  "step": 870,
@@ -880,9 +880,9 @@
880
  },
881
  {
882
  "epoch": 3.5918367346938775,
883
- "grad_norm": 0.7972800135612488,
884
  "learning_rate": 2.816326530612245e-06,
885
- "loss": 0.0438,
886
  "max_memory_allocated (GB)": 57.18,
887
  "memory_allocated (GB)": 50.57,
888
  "step": 880,
@@ -890,9 +890,9 @@
890
  },
891
  {
892
  "epoch": 3.63265306122449,
893
- "grad_norm": 0.21193134784698486,
894
  "learning_rate": 2.7346938775510207e-06,
895
- "loss": 0.029,
896
  "max_memory_allocated (GB)": 57.18,
897
  "memory_allocated (GB)": 50.57,
898
  "step": 890,
@@ -900,9 +900,9 @@
900
  },
901
  {
902
  "epoch": 3.673469387755102,
903
- "grad_norm": 0.6128103137016296,
904
  "learning_rate": 2.6530612244897964e-06,
905
- "loss": 0.0514,
906
  "max_memory_allocated (GB)": 57.18,
907
  "memory_allocated (GB)": 50.57,
908
  "step": 900,
@@ -910,9 +910,9 @@
910
  },
911
  {
912
  "epoch": 3.7142857142857144,
913
- "grad_norm": 0.8112168312072754,
914
  "learning_rate": 2.571428571428571e-06,
915
- "loss": 0.061,
916
  "max_memory_allocated (GB)": 57.18,
917
  "memory_allocated (GB)": 50.57,
918
  "step": 910,
@@ -920,9 +920,9 @@
920
  },
921
  {
922
  "epoch": 3.7551020408163263,
923
- "grad_norm": 0.18730562925338745,
924
  "learning_rate": 2.489795918367347e-06,
925
- "loss": 0.0546,
926
  "max_memory_allocated (GB)": 57.18,
927
  "memory_allocated (GB)": 50.57,
928
  "step": 920,
@@ -930,9 +930,9 @@
930
  },
931
  {
932
  "epoch": 3.795918367346939,
933
- "grad_norm": 0.3866801857948303,
934
  "learning_rate": 2.4081632653061225e-06,
935
- "loss": 0.0501,
936
  "max_memory_allocated (GB)": 57.18,
937
  "memory_allocated (GB)": 50.57,
938
  "step": 930,
@@ -940,9 +940,9 @@
940
  },
941
  {
942
  "epoch": 3.836734693877551,
943
- "grad_norm": 0.8816384077072144,
944
  "learning_rate": 2.326530612244898e-06,
945
- "loss": 0.0489,
946
  "max_memory_allocated (GB)": 57.18,
947
  "memory_allocated (GB)": 50.57,
948
  "step": 940,
@@ -950,9 +950,9 @@
950
  },
951
  {
952
  "epoch": 3.877551020408163,
953
- "grad_norm": 0.5572797656059265,
954
  "learning_rate": 2.244897959183674e-06,
955
- "loss": 0.0599,
956
  "max_memory_allocated (GB)": 57.18,
957
  "memory_allocated (GB)": 50.57,
958
  "step": 950,
@@ -960,9 +960,9 @@
960
  },
961
  {
962
  "epoch": 3.9183673469387754,
963
- "grad_norm": 0.38238489627838135,
964
  "learning_rate": 2.1632653061224495e-06,
965
- "loss": 0.0497,
966
  "max_memory_allocated (GB)": 57.18,
967
  "memory_allocated (GB)": 50.57,
968
  "step": 960,
@@ -970,9 +970,9 @@
970
  },
971
  {
972
  "epoch": 3.9591836734693877,
973
- "grad_norm": 0.6144959926605225,
974
  "learning_rate": 2.0816326530612247e-06,
975
- "loss": 0.0741,
976
  "max_memory_allocated (GB)": 57.18,
977
  "memory_allocated (GB)": 50.57,
978
  "step": 970,
@@ -980,265 +980,265 @@
980
  },
981
  {
982
  "epoch": 4.0,
983
- "grad_norm": 0.6087101697921753,
984
  "learning_rate": 2.0000000000000003e-06,
985
- "loss": 0.0703,
986
- "max_memory_allocated (GB)": 57.18,
987
  "memory_allocated (GB)": 50.57,
988
  "step": 980,
989
  "total_memory_available (GB)": 94.62
990
  },
991
  {
992
  "epoch": 4.040816326530612,
993
- "grad_norm": 0.5187469720840454,
994
  "learning_rate": 1.9183673469387756e-06,
995
- "loss": 0.0482,
996
- "max_memory_allocated (GB)": 57.18,
997
  "memory_allocated (GB)": 50.57,
998
  "step": 990,
999
  "total_memory_available (GB)": 94.62
1000
  },
1001
  {
1002
  "epoch": 4.081632653061225,
1003
- "grad_norm": 1.248850703239441,
1004
  "learning_rate": 1.8367346938775512e-06,
1005
- "loss": 0.0631,
1006
- "max_memory_allocated (GB)": 57.18,
1007
  "memory_allocated (GB)": 50.57,
1008
  "step": 1000,
1009
  "total_memory_available (GB)": 94.62
1010
  },
1011
  {
1012
  "epoch": 4.122448979591836,
1013
- "grad_norm": 0.5806276798248291,
1014
  "learning_rate": 1.7551020408163267e-06,
1015
- "loss": 0.0629,
1016
- "max_memory_allocated (GB)": 57.18,
1017
  "memory_allocated (GB)": 50.57,
1018
  "step": 1010,
1019
  "total_memory_available (GB)": 94.62
1020
  },
1021
  {
1022
  "epoch": 4.163265306122449,
1023
- "grad_norm": 0.3565673828125,
1024
  "learning_rate": 1.6734693877551023e-06,
1025
- "loss": 0.0407,
1026
- "max_memory_allocated (GB)": 57.18,
1027
  "memory_allocated (GB)": 50.57,
1028
  "step": 1020,
1029
  "total_memory_available (GB)": 94.62
1030
  },
1031
  {
1032
  "epoch": 4.204081632653061,
1033
- "grad_norm": 0.6948438882827759,
1034
  "learning_rate": 1.5918367346938775e-06,
1035
- "loss": 0.053,
1036
- "max_memory_allocated (GB)": 57.18,
1037
  "memory_allocated (GB)": 50.57,
1038
  "step": 1030,
1039
  "total_memory_available (GB)": 94.62
1040
  },
1041
  {
1042
  "epoch": 4.244897959183674,
1043
- "grad_norm": 0.5245764851570129,
1044
  "learning_rate": 1.5102040816326532e-06,
1045
  "loss": 0.0399,
1046
- "max_memory_allocated (GB)": 57.18,
1047
  "memory_allocated (GB)": 50.57,
1048
  "step": 1040,
1049
  "total_memory_available (GB)": 94.62
1050
  },
1051
  {
1052
  "epoch": 4.285714285714286,
1053
- "grad_norm": 0.7932385802268982,
1054
  "learning_rate": 1.4285714285714286e-06,
1055
- "loss": 0.0502,
1056
- "max_memory_allocated (GB)": 57.18,
1057
  "memory_allocated (GB)": 50.57,
1058
  "step": 1050,
1059
  "total_memory_available (GB)": 94.62
1060
  },
1061
  {
1062
  "epoch": 4.326530612244898,
1063
- "grad_norm": 0.30140048265457153,
1064
  "learning_rate": 1.3469387755102043e-06,
1065
- "loss": 0.046,
1066
- "max_memory_allocated (GB)": 57.18,
1067
  "memory_allocated (GB)": 50.57,
1068
  "step": 1060,
1069
  "total_memory_available (GB)": 94.62
1070
  },
1071
  {
1072
  "epoch": 4.36734693877551,
1073
- "grad_norm": 0.570467472076416,
1074
  "learning_rate": 1.2653061224489795e-06,
1075
- "loss": 0.0487,
1076
- "max_memory_allocated (GB)": 57.18,
1077
  "memory_allocated (GB)": 50.57,
1078
  "step": 1070,
1079
  "total_memory_available (GB)": 94.62
1080
  },
1081
  {
1082
  "epoch": 4.408163265306122,
1083
- "grad_norm": 0.43690067529678345,
1084
  "learning_rate": 1.1836734693877552e-06,
1085
- "loss": 0.0521,
1086
- "max_memory_allocated (GB)": 57.18,
1087
  "memory_allocated (GB)": 50.57,
1088
  "step": 1080,
1089
  "total_memory_available (GB)": 94.62
1090
  },
1091
  {
1092
  "epoch": 4.448979591836735,
1093
- "grad_norm": 0.5298590660095215,
1094
  "learning_rate": 1.1020408163265308e-06,
1095
- "loss": 0.0506,
1096
- "max_memory_allocated (GB)": 57.18,
1097
  "memory_allocated (GB)": 50.57,
1098
  "step": 1090,
1099
  "total_memory_available (GB)": 94.62
1100
  },
1101
  {
1102
  "epoch": 4.489795918367347,
1103
- "grad_norm": 0.2310735136270523,
1104
  "learning_rate": 1.0204081632653063e-06,
1105
- "loss": 0.036,
1106
- "max_memory_allocated (GB)": 57.18,
1107
  "memory_allocated (GB)": 50.57,
1108
  "step": 1100,
1109
  "total_memory_available (GB)": 94.62
1110
  },
1111
  {
1112
  "epoch": 4.530612244897959,
1113
- "grad_norm": 0.13128583133220673,
1114
  "learning_rate": 9.387755102040817e-07,
1115
- "loss": 0.0463,
1116
- "max_memory_allocated (GB)": 57.18,
1117
  "memory_allocated (GB)": 50.57,
1118
  "step": 1110,
1119
  "total_memory_available (GB)": 94.62
1120
  },
1121
  {
1122
  "epoch": 4.571428571428571,
1123
- "grad_norm": 0.7682464122772217,
1124
  "learning_rate": 8.571428571428572e-07,
1125
- "loss": 0.0403,
1126
- "max_memory_allocated (GB)": 57.18,
1127
  "memory_allocated (GB)": 50.57,
1128
  "step": 1120,
1129
  "total_memory_available (GB)": 94.62
1130
  },
1131
  {
1132
  "epoch": 4.612244897959184,
1133
- "grad_norm": 0.6608971953392029,
1134
  "learning_rate": 7.755102040816327e-07,
1135
- "loss": 0.0543,
1136
- "max_memory_allocated (GB)": 57.18,
1137
  "memory_allocated (GB)": 50.57,
1138
  "step": 1130,
1139
  "total_memory_available (GB)": 94.62
1140
  },
1141
  {
1142
  "epoch": 4.653061224489796,
1143
- "grad_norm": 0.8803687691688538,
1144
  "learning_rate": 6.938775510204082e-07,
1145
- "loss": 0.0728,
1146
- "max_memory_allocated (GB)": 57.18,
1147
  "memory_allocated (GB)": 50.57,
1148
  "step": 1140,
1149
  "total_memory_available (GB)": 94.62
1150
  },
1151
  {
1152
  "epoch": 4.6938775510204085,
1153
- "grad_norm": 4.121662139892578,
1154
  "learning_rate": 6.122448979591837e-07,
1155
- "loss": 0.0514,
1156
- "max_memory_allocated (GB)": 57.18,
1157
  "memory_allocated (GB)": 50.57,
1158
  "step": 1150,
1159
  "total_memory_available (GB)": 94.62
1160
  },
1161
  {
1162
  "epoch": 4.73469387755102,
1163
- "grad_norm": 0.7500938773155212,
1164
  "learning_rate": 5.306122448979592e-07,
1165
- "loss": 0.0612,
1166
- "max_memory_allocated (GB)": 57.18,
1167
  "memory_allocated (GB)": 50.57,
1168
  "step": 1160,
1169
  "total_memory_available (GB)": 94.62
1170
  },
1171
  {
1172
  "epoch": 4.775510204081632,
1173
- "grad_norm": 0.6001973748207092,
1174
  "learning_rate": 4.489795918367347e-07,
1175
  "loss": 0.0549,
1176
- "max_memory_allocated (GB)": 57.18,
1177
  "memory_allocated (GB)": 50.57,
1178
  "step": 1170,
1179
  "total_memory_available (GB)": 94.62
1180
  },
1181
  {
1182
  "epoch": 4.816326530612245,
1183
- "grad_norm": 0.7522645592689514,
1184
  "learning_rate": 3.6734693877551025e-07,
1185
- "loss": 0.0445,
1186
- "max_memory_allocated (GB)": 57.18,
1187
  "memory_allocated (GB)": 50.57,
1188
  "step": 1180,
1189
  "total_memory_available (GB)": 94.62
1190
  },
1191
  {
1192
  "epoch": 4.857142857142857,
1193
- "grad_norm": 0.6640497446060181,
1194
  "learning_rate": 2.8571428571428575e-07,
1195
- "loss": 0.0542,
1196
- "max_memory_allocated (GB)": 57.18,
1197
  "memory_allocated (GB)": 50.57,
1198
  "step": 1190,
1199
  "total_memory_available (GB)": 94.62
1200
  },
1201
  {
1202
  "epoch": 4.8979591836734695,
1203
- "grad_norm": 0.8031227588653564,
1204
  "learning_rate": 2.0408163265306121e-07,
1205
  "loss": 0.0728,
1206
- "max_memory_allocated (GB)": 57.18,
1207
  "memory_allocated (GB)": 50.57,
1208
  "step": 1200,
1209
  "total_memory_available (GB)": 94.62
1210
  },
1211
  {
1212
  "epoch": 4.938775510204081,
1213
- "grad_norm": 0.39187708497047424,
1214
  "learning_rate": 1.2244897959183673e-07,
1215
- "loss": 0.065,
1216
- "max_memory_allocated (GB)": 57.18,
1217
  "memory_allocated (GB)": 50.57,
1218
  "step": 1210,
1219
  "total_memory_available (GB)": 94.62
1220
  },
1221
  {
1222
  "epoch": 4.979591836734694,
1223
- "grad_norm": 3.809382915496826,
1224
  "learning_rate": 4.0816326530612253e-08,
1225
- "loss": 0.0417,
1226
- "max_memory_allocated (GB)": 57.18,
1227
  "memory_allocated (GB)": 50.57,
1228
  "step": 1220,
1229
  "total_memory_available (GB)": 94.62
1230
  },
1231
  {
1232
  "epoch": 5.0,
1233
- "max_memory_allocated (GB)": 57.18,
1234
  "memory_allocated (GB)": 50.57,
1235
  "step": 1225,
1236
  "total_flos": 3.0598946525952e+16,
1237
  "total_memory_available (GB)": 94.62,
1238
- "train_loss": 0.06098026679486644,
1239
- "train_runtime": 1192.2443,
1240
- "train_samples_per_second": 46.607,
1241
- "train_steps_per_second": 1.166
1242
  }
1243
  ],
1244
  "logging_steps": 10,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.04081632653061224,
13
+ "grad_norm": 9.589848518371582,
14
  "learning_rate": 9.918367346938776e-06,
15
+ "loss": 0.2612,
16
  "max_memory_allocated (GB)": 57.18,
17
  "memory_allocated (GB)": 50.57,
18
  "step": 10,
 
20
  },
21
  {
22
  "epoch": 0.08163265306122448,
23
+ "grad_norm": 8.167236328125,
24
  "learning_rate": 9.836734693877552e-06,
25
+ "loss": 0.1542,
26
  "max_memory_allocated (GB)": 57.18,
27
  "memory_allocated (GB)": 50.57,
28
  "step": 20,
 
30
  },
31
  {
32
  "epoch": 0.12244897959183673,
33
+ "grad_norm": 5.958656311035156,
34
  "learning_rate": 9.755102040816327e-06,
35
+ "loss": 0.1236,
36
  "max_memory_allocated (GB)": 57.18,
37
  "memory_allocated (GB)": 50.57,
38
  "step": 30,
 
40
  },
41
  {
42
  "epoch": 0.16326530612244897,
43
+ "grad_norm": 4.233438968658447,
44
  "learning_rate": 9.673469387755103e-06,
45
+ "loss": 0.1105,
46
  "max_memory_allocated (GB)": 57.18,
47
  "memory_allocated (GB)": 50.57,
48
  "step": 40,
 
50
  },
51
  {
52
  "epoch": 0.20408163265306123,
53
+ "grad_norm": 4.4948201179504395,
54
  "learning_rate": 9.591836734693878e-06,
55
+ "loss": 0.1195,
56
  "max_memory_allocated (GB)": 57.18,
57
  "memory_allocated (GB)": 50.57,
58
  "step": 50,
 
60
  },
61
  {
62
  "epoch": 0.24489795918367346,
63
+ "grad_norm": 1.827812910079956,
64
  "learning_rate": 9.510204081632653e-06,
65
  "loss": 0.0797,
66
  "max_memory_allocated (GB)": 57.18,
 
70
  },
71
  {
72
  "epoch": 0.2857142857142857,
73
+ "grad_norm": 3.889023780822754,
74
  "learning_rate": 9.42857142857143e-06,
75
+ "loss": 0.0989,
76
  "max_memory_allocated (GB)": 57.18,
77
  "memory_allocated (GB)": 50.57,
78
  "step": 70,
 
80
  },
81
  {
82
  "epoch": 0.32653061224489793,
83
+ "grad_norm": 1.9397954940795898,
84
  "learning_rate": 9.346938775510204e-06,
85
+ "loss": 0.1102,
86
  "max_memory_allocated (GB)": 57.18,
87
  "memory_allocated (GB)": 50.57,
88
  "step": 80,
 
90
  },
91
  {
92
  "epoch": 0.3673469387755102,
93
+ "grad_norm": 3.2782671451568604,
94
  "learning_rate": 9.26530612244898e-06,
95
+ "loss": 0.0824,
96
  "max_memory_allocated (GB)": 57.18,
97
  "memory_allocated (GB)": 50.57,
98
  "step": 90,
 
100
  },
101
  {
102
  "epoch": 0.40816326530612246,
103
+ "grad_norm": 5.711858749389648,
104
  "learning_rate": 9.183673469387756e-06,
105
+ "loss": 0.0762,
106
  "max_memory_allocated (GB)": 57.18,
107
  "memory_allocated (GB)": 50.57,
108
  "step": 100,
 
110
  },
111
  {
112
  "epoch": 0.4489795918367347,
113
+ "grad_norm": 3.395564317703247,
114
  "learning_rate": 9.102040816326532e-06,
115
+ "loss": 0.0726,
116
  "max_memory_allocated (GB)": 57.18,
117
  "memory_allocated (GB)": 50.57,
118
  "step": 110,
 
120
  },
121
  {
122
  "epoch": 0.4897959183673469,
123
+ "grad_norm": 2.2739310264587402,
124
  "learning_rate": 9.020408163265307e-06,
125
+ "loss": 0.0705,
126
  "max_memory_allocated (GB)": 57.18,
127
  "memory_allocated (GB)": 50.57,
128
  "step": 120,
 
130
  },
131
  {
132
  "epoch": 0.5306122448979592,
133
+ "grad_norm": 2.418794870376587,
134
  "learning_rate": 8.938775510204082e-06,
135
+ "loss": 0.0595,
136
  "max_memory_allocated (GB)": 57.18,
137
  "memory_allocated (GB)": 50.57,
138
  "step": 130,
 
140
  },
141
  {
142
  "epoch": 0.5714285714285714,
143
+ "grad_norm": 2.2896311283111572,
144
  "learning_rate": 8.857142857142858e-06,
145
+ "loss": 0.0498,
146
  "max_memory_allocated (GB)": 57.18,
147
  "memory_allocated (GB)": 50.57,
148
  "step": 140,
 
150
  },
151
  {
152
  "epoch": 0.6122448979591837,
153
+ "grad_norm": 1.7899913787841797,
154
  "learning_rate": 8.775510204081633e-06,
155
+ "loss": 0.0629,
156
  "max_memory_allocated (GB)": 57.18,
157
  "memory_allocated (GB)": 50.57,
158
  "step": 150,
 
160
  },
161
  {
162
  "epoch": 0.6530612244897959,
163
+ "grad_norm": 1.9983731508255005,
164
  "learning_rate": 8.69387755102041e-06,
165
+ "loss": 0.071,
166
  "max_memory_allocated (GB)": 57.18,
167
  "memory_allocated (GB)": 50.57,
168
  "step": 160,
 
170
  },
171
  {
172
  "epoch": 0.6938775510204082,
173
+ "grad_norm": 1.7236266136169434,
174
  "learning_rate": 8.612244897959184e-06,
175
+ "loss": 0.0748,
176
  "max_memory_allocated (GB)": 57.18,
177
  "memory_allocated (GB)": 50.57,
178
  "step": 170,
 
180
  },
181
  {
182
  "epoch": 0.7346938775510204,
183
+ "grad_norm": 1.1180106401443481,
184
  "learning_rate": 8.530612244897961e-06,
185
+ "loss": 0.0599,
186
  "max_memory_allocated (GB)": 57.18,
187
  "memory_allocated (GB)": 50.57,
188
  "step": 180,
 
190
  },
191
  {
192
  "epoch": 0.7755102040816326,
193
+ "grad_norm": 2.3057782649993896,
194
  "learning_rate": 8.448979591836736e-06,
195
+ "loss": 0.0588,
196
  "max_memory_allocated (GB)": 57.18,
197
  "memory_allocated (GB)": 50.57,
198
  "step": 190,
 
200
  },
201
  {
202
  "epoch": 0.8163265306122449,
203
+ "grad_norm": 0.8334403038024902,
204
  "learning_rate": 8.36734693877551e-06,
205
+ "loss": 0.0622,
206
  "max_memory_allocated (GB)": 57.18,
207
  "memory_allocated (GB)": 50.57,
208
  "step": 200,
 
210
  },
211
  {
212
  "epoch": 0.8571428571428571,
213
+ "grad_norm": 1.0548275709152222,
214
  "learning_rate": 8.285714285714287e-06,
215
+ "loss": 0.0623,
216
  "max_memory_allocated (GB)": 57.18,
217
  "memory_allocated (GB)": 50.57,
218
  "step": 210,
 
220
  },
221
  {
222
  "epoch": 0.8979591836734694,
223
+ "grad_norm": 1.3381606340408325,
224
  "learning_rate": 8.204081632653062e-06,
225
+ "loss": 0.0605,
226
  "max_memory_allocated (GB)": 57.18,
227
  "memory_allocated (GB)": 50.57,
228
  "step": 220,
 
230
  },
231
  {
232
  "epoch": 0.9387755102040817,
233
+ "grad_norm": 0.809412956237793,
234
  "learning_rate": 8.122448979591837e-06,
235
+ "loss": 0.0566,
236
  "max_memory_allocated (GB)": 57.18,
237
  "memory_allocated (GB)": 50.57,
238
  "step": 230,
 
240
  },
241
  {
242
  "epoch": 0.9795918367346939,
243
+ "grad_norm": 0.7182928323745728,
244
  "learning_rate": 8.040816326530613e-06,
245
+ "loss": 0.0496,
246
  "max_memory_allocated (GB)": 57.18,
247
  "memory_allocated (GB)": 50.57,
248
  "step": 240,
 
250
  },
251
  {
252
  "epoch": 1.0204081632653061,
253
+ "grad_norm": 1.1081018447875977,
254
  "learning_rate": 7.959183673469388e-06,
255
+ "loss": 0.0601,
256
  "max_memory_allocated (GB)": 57.18,
257
  "memory_allocated (GB)": 50.57,
258
  "step": 250,
 
260
  },
261
  {
262
  "epoch": 1.0612244897959184,
263
+ "grad_norm": 3.9478495121002197,
264
  "learning_rate": 7.877551020408164e-06,
265
+ "loss": 0.0541,
266
  "max_memory_allocated (GB)": 57.18,
267
  "memory_allocated (GB)": 50.57,
268
  "step": 260,
 
270
  },
271
  {
272
  "epoch": 1.1020408163265305,
273
+ "grad_norm": 0.8079606294631958,
274
  "learning_rate": 7.79591836734694e-06,
275
+ "loss": 0.063,
276
  "max_memory_allocated (GB)": 57.18,
277
  "memory_allocated (GB)": 50.57,
278
  "step": 270,
 
280
  },
281
  {
282
  "epoch": 1.1428571428571428,
283
+ "grad_norm": 1.4108704328536987,
284
  "learning_rate": 7.714285714285716e-06,
285
+ "loss": 0.0579,
286
  "max_memory_allocated (GB)": 57.18,
287
  "memory_allocated (GB)": 50.57,
288
  "step": 280,
 
290
  },
291
  {
292
  "epoch": 1.183673469387755,
293
+ "grad_norm": 1.544438123703003,
294
  "learning_rate": 7.63265306122449e-06,
295
+ "loss": 0.0518,
296
  "max_memory_allocated (GB)": 57.18,
297
  "memory_allocated (GB)": 50.57,
298
  "step": 290,
 
300
  },
301
  {
302
  "epoch": 1.2244897959183674,
303
+ "grad_norm": 1.0270023345947266,
304
  "learning_rate": 7.551020408163265e-06,
305
+ "loss": 0.0658,
306
  "max_memory_allocated (GB)": 57.18,
307
  "memory_allocated (GB)": 50.57,
308
  "step": 300,
 
310
  },
311
  {
312
  "epoch": 1.2653061224489797,
313
+ "grad_norm": 1.0347421169281006,
314
  "learning_rate": 7.469387755102041e-06,
315
+ "loss": 0.0667,
316
  "max_memory_allocated (GB)": 57.18,
317
  "memory_allocated (GB)": 50.57,
318
  "step": 310,
 
320
  },
321
  {
322
  "epoch": 1.306122448979592,
323
+ "grad_norm": 1.5818060636520386,
324
  "learning_rate": 7.387755102040817e-06,
325
+ "loss": 0.0502,
326
  "max_memory_allocated (GB)": 57.18,
327
  "memory_allocated (GB)": 50.57,
328
  "step": 320,
 
330
  },
331
  {
332
  "epoch": 1.346938775510204,
333
+ "grad_norm": 1.2030609846115112,
334
  "learning_rate": 7.306122448979592e-06,
335
+ "loss": 0.0616,
336
  "max_memory_allocated (GB)": 57.18,
337
  "memory_allocated (GB)": 50.57,
338
  "step": 330,
 
340
  },
341
  {
342
  "epoch": 1.3877551020408163,
343
+ "grad_norm": 1.2504222393035889,
344
  "learning_rate": 7.224489795918368e-06,
345
+ "loss": 0.0543,
346
  "max_memory_allocated (GB)": 57.18,
347
  "memory_allocated (GB)": 50.57,
348
  "step": 340,
 
350
  },
351
  {
352
  "epoch": 1.4285714285714286,
353
+ "grad_norm": 0.8420921564102173,
354
  "learning_rate": 7.1428571428571436e-06,
355
+ "loss": 0.0488,
356
  "max_memory_allocated (GB)": 57.18,
357
  "memory_allocated (GB)": 50.57,
358
  "step": 350,
 
360
  },
361
  {
362
  "epoch": 1.469387755102041,
363
+ "grad_norm": 1.517096757888794,
364
  "learning_rate": 7.061224489795919e-06,
365
+ "loss": 0.0467,
366
  "max_memory_allocated (GB)": 57.18,
367
  "memory_allocated (GB)": 50.57,
368
  "step": 360,
 
370
  },
371
  {
372
  "epoch": 1.510204081632653,
373
+ "grad_norm": 1.4490768909454346,
374
  "learning_rate": 6.979591836734695e-06,
375
+ "loss": 0.0585,
376
  "max_memory_allocated (GB)": 57.18,
377
  "memory_allocated (GB)": 50.57,
378
  "step": 370,
 
380
  },
381
  {
382
  "epoch": 1.5510204081632653,
383
+ "grad_norm": 0.5360353589057922,
384
  "learning_rate": 6.8979591836734705e-06,
385
+ "loss": 0.046,
386
  "max_memory_allocated (GB)": 57.18,
387
  "memory_allocated (GB)": 50.57,
388
  "step": 380,
 
390
  },
391
  {
392
  "epoch": 1.5918367346938775,
393
+ "grad_norm": 0.8193866610527039,
394
  "learning_rate": 6.816326530612245e-06,
395
+ "loss": 0.0657,
396
  "max_memory_allocated (GB)": 57.18,
397
  "memory_allocated (GB)": 50.57,
398
  "step": 390,
 
400
  },
401
  {
402
  "epoch": 1.6326530612244898,
403
+ "grad_norm": 0.5883302092552185,
404
  "learning_rate": 6.734693877551021e-06,
405
  "loss": 0.0609,
406
  "max_memory_allocated (GB)": 57.18,
 
410
  },
411
  {
412
  "epoch": 1.6734693877551021,
413
+ "grad_norm": 0.6720415949821472,
414
  "learning_rate": 6.653061224489797e-06,
415
+ "loss": 0.0603,
416
  "max_memory_allocated (GB)": 57.18,
417
  "memory_allocated (GB)": 50.57,
418
  "step": 410,
 
420
  },
421
  {
422
  "epoch": 1.7142857142857144,
423
+ "grad_norm": 1.368994951248169,
424
  "learning_rate": 6.571428571428572e-06,
425
+ "loss": 0.0528,
426
  "max_memory_allocated (GB)": 57.18,
427
  "memory_allocated (GB)": 50.57,
428
  "step": 420,
 
430
  },
431
  {
432
  "epoch": 1.7551020408163265,
433
+ "grad_norm": 0.25535887479782104,
434
  "learning_rate": 6.489795918367348e-06,
435
+ "loss": 0.0674,
436
  "max_memory_allocated (GB)": 57.18,
437
  "memory_allocated (GB)": 50.57,
438
  "step": 430,
 
440
  },
441
  {
442
  "epoch": 1.7959183673469388,
443
+ "grad_norm": 0.8839388489723206,
444
  "learning_rate": 6.408163265306124e-06,
445
+ "loss": 0.0612,
446
  "max_memory_allocated (GB)": 57.18,
447
  "memory_allocated (GB)": 50.57,
448
  "step": 440,
 
450
  },
451
  {
452
  "epoch": 1.836734693877551,
453
+ "grad_norm": 0.4119959771633148,
454
  "learning_rate": 6.326530612244899e-06,
455
+ "loss": 0.0435,
456
  "max_memory_allocated (GB)": 57.18,
457
  "memory_allocated (GB)": 50.57,
458
  "step": 450,
 
460
  },
461
  {
462
  "epoch": 1.8775510204081631,
463
+ "grad_norm": 0.45877301692962646,
464
  "learning_rate": 6.244897959183675e-06,
465
+ "loss": 0.0543,
466
  "max_memory_allocated (GB)": 57.18,
467
  "memory_allocated (GB)": 50.57,
468
  "step": 460,
 
470
  },
471
  {
472
  "epoch": 1.9183673469387754,
473
+ "grad_norm": 0.9594807624816895,
474
  "learning_rate": 6.163265306122449e-06,
475
+ "loss": 0.0433,
476
  "max_memory_allocated (GB)": 57.18,
477
  "memory_allocated (GB)": 50.57,
478
  "step": 470,
 
480
  },
481
  {
482
  "epoch": 1.9591836734693877,
483
+ "grad_norm": 0.168818861246109,
484
  "learning_rate": 6.0816326530612245e-06,
485
+ "loss": 0.0497,
486
  "max_memory_allocated (GB)": 57.18,
487
  "memory_allocated (GB)": 50.57,
488
  "step": 480,
 
490
  },
491
  {
492
  "epoch": 2.0,
493
+ "grad_norm": 1.1468336582183838,
494
  "learning_rate": 6e-06,
495
+ "loss": 0.0651,
496
  "max_memory_allocated (GB)": 57.18,
497
  "memory_allocated (GB)": 50.57,
498
  "step": 490,
 
500
  },
501
  {
502
  "epoch": 2.0408163265306123,
503
+ "grad_norm": 0.6001573204994202,
504
  "learning_rate": 5.918367346938776e-06,
505
+ "loss": 0.0591,
506
  "max_memory_allocated (GB)": 57.18,
507
  "memory_allocated (GB)": 50.57,
508
  "step": 500,
 
510
  },
511
  {
512
  "epoch": 2.0816326530612246,
513
+ "grad_norm": 0.8006247878074646,
514
  "learning_rate": 5.8367346938775515e-06,
515
+ "loss": 0.0489,
516
  "max_memory_allocated (GB)": 57.18,
517
  "memory_allocated (GB)": 50.57,
518
  "step": 510,
 
520
  },
521
  {
522
  "epoch": 2.122448979591837,
523
+ "grad_norm": 0.6839123964309692,
524
  "learning_rate": 5.755102040816327e-06,
525
+ "loss": 0.0456,
526
  "max_memory_allocated (GB)": 57.18,
527
  "memory_allocated (GB)": 50.57,
528
  "step": 520,
 
530
  },
531
  {
532
  "epoch": 2.163265306122449,
533
+ "grad_norm": 1.020135521888733,
534
  "learning_rate": 5.673469387755103e-06,
535
+ "loss": 0.0627,
536
  "max_memory_allocated (GB)": 57.18,
537
  "memory_allocated (GB)": 50.57,
538
  "step": 530,
 
540
  },
541
  {
542
  "epoch": 2.204081632653061,
543
+ "grad_norm": 0.7156575322151184,
544
  "learning_rate": 5.591836734693878e-06,
545
+ "loss": 0.0584,
546
  "max_memory_allocated (GB)": 57.18,
547
  "memory_allocated (GB)": 50.57,
548
  "step": 540,
 
550
  },
551
  {
552
  "epoch": 2.2448979591836733,
553
+ "grad_norm": 1.195730447769165,
554
  "learning_rate": 5.510204081632653e-06,
555
+ "loss": 0.0628,
556
  "max_memory_allocated (GB)": 57.18,
557
  "memory_allocated (GB)": 50.57,
558
  "step": 550,
 
560
  },
561
  {
562
  "epoch": 2.2857142857142856,
563
+ "grad_norm": 0.8824738264083862,
564
  "learning_rate": 5.428571428571429e-06,
565
+ "loss": 0.0572,
566
  "max_memory_allocated (GB)": 57.18,
567
  "memory_allocated (GB)": 50.57,
568
  "step": 560,
 
570
  },
571
  {
572
  "epoch": 2.326530612244898,
573
+ "grad_norm": 1.3354676961898804,
574
  "learning_rate": 5.3469387755102045e-06,
575
+ "loss": 0.0585,
576
  "max_memory_allocated (GB)": 57.18,
577
  "memory_allocated (GB)": 50.57,
578
  "step": 570,
 
580
  },
581
  {
582
  "epoch": 2.36734693877551,
583
+ "grad_norm": 1.9097795486450195,
584
  "learning_rate": 5.26530612244898e-06,
585
+ "loss": 0.0668,
586
  "max_memory_allocated (GB)": 57.18,
587
  "memory_allocated (GB)": 50.57,
588
  "step": 580,
 
590
  },
591
  {
592
  "epoch": 2.4081632653061225,
593
+ "grad_norm": 0.8386860489845276,
594
  "learning_rate": 5.183673469387756e-06,
595
+ "loss": 0.0542,
596
  "max_memory_allocated (GB)": 57.18,
597
  "memory_allocated (GB)": 50.57,
598
  "step": 590,
 
600
  },
601
  {
602
  "epoch": 2.4489795918367347,
603
+ "grad_norm": 1.2374165058135986,
604
  "learning_rate": 5.1020408163265315e-06,
605
+ "loss": 0.0436,
606
  "max_memory_allocated (GB)": 57.18,
607
  "memory_allocated (GB)": 50.57,
608
  "step": 600,
 
610
  },
611
  {
612
  "epoch": 2.489795918367347,
613
+ "grad_norm": 0.3719439208507538,
614
  "learning_rate": 5.020408163265307e-06,
615
+ "loss": 0.0408,
616
  "max_memory_allocated (GB)": 57.18,
617
  "memory_allocated (GB)": 50.57,
618
  "step": 610,
 
620
  },
621
  {
622
  "epoch": 2.5306122448979593,
623
+ "grad_norm": 0.905327558517456,
624
  "learning_rate": 4.938775510204082e-06,
625
+ "loss": 0.0429,
626
  "max_memory_allocated (GB)": 57.18,
627
  "memory_allocated (GB)": 50.57,
628
  "step": 620,
 
630
  },
631
  {
632
  "epoch": 2.571428571428571,
633
+ "grad_norm": 0.628597617149353,
634
  "learning_rate": 4.857142857142858e-06,
635
+ "loss": 0.0812,
636
  "max_memory_allocated (GB)": 57.18,
637
  "memory_allocated (GB)": 50.57,
638
  "step": 630,
 
640
  },
641
  {
642
  "epoch": 2.612244897959184,
643
+ "grad_norm": 1.3098090887069702,
644
  "learning_rate": 4.775510204081633e-06,
645
+ "loss": 0.0495,
646
  "max_memory_allocated (GB)": 57.18,
647
  "memory_allocated (GB)": 50.57,
648
  "step": 640,
 
650
  },
651
  {
652
  "epoch": 2.6530612244897958,
653
+ "grad_norm": 0.5635781288146973,
654
  "learning_rate": 4.693877551020409e-06,
655
+ "loss": 0.0466,
656
  "max_memory_allocated (GB)": 57.18,
657
  "memory_allocated (GB)": 50.57,
658
  "step": 650,
 
660
  },
661
  {
662
  "epoch": 2.693877551020408,
663
+ "grad_norm": 0.6197735071182251,
664
  "learning_rate": 4.612244897959184e-06,
665
+ "loss": 0.0474,
666
  "max_memory_allocated (GB)": 57.18,
667
  "memory_allocated (GB)": 50.57,
668
  "step": 660,
 
670
  },
671
  {
672
  "epoch": 2.7346938775510203,
673
+ "grad_norm": 0.6390748620033264,
674
  "learning_rate": 4.530612244897959e-06,
675
+ "loss": 0.0863,
676
  "max_memory_allocated (GB)": 57.18,
677
  "memory_allocated (GB)": 50.57,
678
  "step": 670,
 
680
  },
681
  {
682
  "epoch": 2.7755102040816326,
683
+ "grad_norm": 1.6307971477508545,
684
  "learning_rate": 4.448979591836735e-06,
685
+ "loss": 0.0489,
686
  "max_memory_allocated (GB)": 57.18,
687
  "memory_allocated (GB)": 50.57,
688
  "step": 680,
 
690
  },
691
  {
692
  "epoch": 2.816326530612245,
693
+ "grad_norm": 0.3477366864681244,
694
  "learning_rate": 4.367346938775511e-06,
695
+ "loss": 0.0448,
696
  "max_memory_allocated (GB)": 57.18,
697
  "memory_allocated (GB)": 50.57,
698
  "step": 690,
 
700
  },
701
  {
702
  "epoch": 2.857142857142857,
703
+ "grad_norm": 0.41136085987091064,
704
  "learning_rate": 4.2857142857142855e-06,
705
+ "loss": 0.0431,
706
  "max_memory_allocated (GB)": 57.18,
707
  "memory_allocated (GB)": 50.57,
708
  "step": 700,
 
710
  },
711
  {
712
  "epoch": 2.8979591836734695,
713
+ "grad_norm": 1.1029525995254517,
714
  "learning_rate": 4.204081632653061e-06,
715
+ "loss": 0.0551,
716
  "max_memory_allocated (GB)": 57.18,
717
  "memory_allocated (GB)": 50.57,
718
  "step": 710,
 
720
  },
721
  {
722
  "epoch": 2.938775510204082,
723
+ "grad_norm": 0.8994241952896118,
724
  "learning_rate": 4.122448979591837e-06,
725
+ "loss": 0.0581,
726
  "max_memory_allocated (GB)": 57.18,
727
  "memory_allocated (GB)": 50.57,
728
  "step": 720,
 
730
  },
731
  {
732
  "epoch": 2.979591836734694,
733
+ "grad_norm": 0.1889757364988327,
734
  "learning_rate": 4.040816326530612e-06,
735
+ "loss": 0.034,
736
  "max_memory_allocated (GB)": 57.18,
737
  "memory_allocated (GB)": 50.57,
738
  "step": 730,
 
740
  },
741
  {
742
  "epoch": 3.020408163265306,
743
+ "grad_norm": 1.7815334796905518,
744
  "learning_rate": 3.959183673469388e-06,
745
  "loss": 0.0536,
746
  "max_memory_allocated (GB)": 57.18,
 
750
  },
751
  {
752
  "epoch": 3.061224489795918,
753
+ "grad_norm": 0.4372510612010956,
754
  "learning_rate": 3.877551020408164e-06,
755
+ "loss": 0.0617,
756
  "max_memory_allocated (GB)": 57.18,
757
  "memory_allocated (GB)": 50.57,
758
  "step": 750,
 
760
  },
761
  {
762
  "epoch": 3.1020408163265305,
763
+ "grad_norm": 5.120749473571777,
764
  "learning_rate": 3.795918367346939e-06,
765
+ "loss": 0.0518,
766
  "max_memory_allocated (GB)": 57.18,
767
  "memory_allocated (GB)": 50.57,
768
  "step": 760,
 
770
  },
771
  {
772
  "epoch": 3.142857142857143,
773
+ "grad_norm": 6.453648090362549,
774
  "learning_rate": 3.7142857142857146e-06,
775
+ "loss": 0.069,
776
  "max_memory_allocated (GB)": 57.18,
777
  "memory_allocated (GB)": 50.57,
778
  "step": 770,
 
780
  },
781
  {
782
  "epoch": 3.183673469387755,
783
+ "grad_norm": 0.6512885093688965,
784
  "learning_rate": 3.6326530612244903e-06,
785
+ "loss": 0.0649,
786
  "max_memory_allocated (GB)": 57.18,
787
  "memory_allocated (GB)": 50.57,
788
  "step": 780,
 
790
  },
791
  {
792
  "epoch": 3.2244897959183674,
793
+ "grad_norm": 0.3266737759113312,
794
  "learning_rate": 3.5510204081632655e-06,
795
+ "loss": 0.0523,
796
  "max_memory_allocated (GB)": 57.18,
797
  "memory_allocated (GB)": 50.57,
798
  "step": 790,
 
800
  },
801
  {
802
  "epoch": 3.2653061224489797,
803
+ "grad_norm": 0.4506176710128784,
804
  "learning_rate": 3.469387755102041e-06,
805
+ "loss": 0.0432,
806
  "max_memory_allocated (GB)": 57.18,
807
  "memory_allocated (GB)": 50.57,
808
  "step": 800,
 
810
  },
811
  {
812
  "epoch": 3.306122448979592,
813
+ "grad_norm": 0.8929914236068726,
814
  "learning_rate": 3.3877551020408168e-06,
815
+ "loss": 0.0554,
816
  "max_memory_allocated (GB)": 57.18,
817
  "memory_allocated (GB)": 50.57,
818
  "step": 810,
 
820
  },
821
  {
822
  "epoch": 3.3469387755102042,
823
+ "grad_norm": 0.7046924233436584,
824
  "learning_rate": 3.3061224489795924e-06,
825
+ "loss": 0.0453,
826
  "max_memory_allocated (GB)": 57.18,
827
  "memory_allocated (GB)": 50.57,
828
  "step": 820,
 
830
  },
831
  {
832
  "epoch": 3.387755102040816,
833
+ "grad_norm": 0.29230576753616333,
834
  "learning_rate": 3.2244897959183672e-06,
835
+ "loss": 0.0491,
836
  "max_memory_allocated (GB)": 57.18,
837
  "memory_allocated (GB)": 50.57,
838
  "step": 830,
 
840
  },
841
  {
842
  "epoch": 3.4285714285714284,
843
+ "grad_norm": 0.4533096253871918,
844
  "learning_rate": 3.142857142857143e-06,
845
+ "loss": 0.0529,
846
  "max_memory_allocated (GB)": 57.18,
847
  "memory_allocated (GB)": 50.57,
848
  "step": 840,
 
850
  },
851
  {
852
  "epoch": 3.4693877551020407,
853
+ "grad_norm": 0.5383632183074951,
854
  "learning_rate": 3.0612244897959185e-06,
855
+ "loss": 0.0823,
856
  "max_memory_allocated (GB)": 57.18,
857
  "memory_allocated (GB)": 50.57,
858
  "step": 850,
 
860
  },
861
  {
862
  "epoch": 3.510204081632653,
863
+ "grad_norm": 2.8597779273986816,
864
  "learning_rate": 2.979591836734694e-06,
865
+ "loss": 0.0456,
866
  "max_memory_allocated (GB)": 57.18,
867
  "memory_allocated (GB)": 50.57,
868
  "step": 860,
 
870
  },
871
  {
872
  "epoch": 3.5510204081632653,
873
+ "grad_norm": 0.26686975359916687,
874
  "learning_rate": 2.8979591836734694e-06,
875
+ "loss": 0.064,
876
  "max_memory_allocated (GB)": 57.18,
877
  "memory_allocated (GB)": 50.57,
878
  "step": 870,
 
880
  },
881
  {
882
  "epoch": 3.5918367346938775,
883
+ "grad_norm": 0.7789614796638489,
884
  "learning_rate": 2.816326530612245e-06,
885
+ "loss": 0.0441,
886
  "max_memory_allocated (GB)": 57.18,
887
  "memory_allocated (GB)": 50.57,
888
  "step": 880,
 
890
  },
891
  {
892
  "epoch": 3.63265306122449,
893
+ "grad_norm": 0.21532948315143585,
894
  "learning_rate": 2.7346938775510207e-06,
895
+ "loss": 0.0291,
896
  "max_memory_allocated (GB)": 57.18,
897
  "memory_allocated (GB)": 50.57,
898
  "step": 890,
 
900
  },
901
  {
902
  "epoch": 3.673469387755102,
903
+ "grad_norm": 0.741765558719635,
904
  "learning_rate": 2.6530612244897964e-06,
905
+ "loss": 0.0512,
906
  "max_memory_allocated (GB)": 57.18,
907
  "memory_allocated (GB)": 50.57,
908
  "step": 900,
 
910
  },
911
  {
912
  "epoch": 3.7142857142857144,
913
+ "grad_norm": 0.6416855454444885,
914
  "learning_rate": 2.571428571428571e-06,
915
+ "loss": 0.0606,
916
  "max_memory_allocated (GB)": 57.18,
917
  "memory_allocated (GB)": 50.57,
918
  "step": 910,
 
920
  },
921
  {
922
  "epoch": 3.7551020408163263,
923
+ "grad_norm": 0.14841973781585693,
924
  "learning_rate": 2.489795918367347e-06,
925
+ "loss": 0.0542,
926
  "max_memory_allocated (GB)": 57.18,
927
  "memory_allocated (GB)": 50.57,
928
  "step": 920,
 
930
  },
931
  {
932
  "epoch": 3.795918367346939,
933
+ "grad_norm": 0.4417996108531952,
934
  "learning_rate": 2.4081632653061225e-06,
935
+ "loss": 0.0498,
936
  "max_memory_allocated (GB)": 57.18,
937
  "memory_allocated (GB)": 50.57,
938
  "step": 930,
 
940
  },
941
  {
942
  "epoch": 3.836734693877551,
943
+ "grad_norm": 0.9759775400161743,
944
  "learning_rate": 2.326530612244898e-06,
945
+ "loss": 0.0491,
946
  "max_memory_allocated (GB)": 57.18,
947
  "memory_allocated (GB)": 50.57,
948
  "step": 940,
 
950
  },
951
  {
952
  "epoch": 3.877551020408163,
953
+ "grad_norm": 1.020371913909912,
954
  "learning_rate": 2.244897959183674e-06,
955
+ "loss": 0.0597,
956
  "max_memory_allocated (GB)": 57.18,
957
  "memory_allocated (GB)": 50.57,
958
  "step": 950,
 
960
  },
961
  {
962
  "epoch": 3.9183673469387754,
963
+ "grad_norm": 0.3064863085746765,
964
  "learning_rate": 2.1632653061224495e-06,
965
+ "loss": 0.0499,
966
  "max_memory_allocated (GB)": 57.18,
967
  "memory_allocated (GB)": 50.57,
968
  "step": 960,
 
970
  },
971
  {
972
  "epoch": 3.9591836734693877,
973
+ "grad_norm": 0.7580925226211548,
974
  "learning_rate": 2.0816326530612247e-06,
975
+ "loss": 0.0742,
976
  "max_memory_allocated (GB)": 57.18,
977
  "memory_allocated (GB)": 50.57,
978
  "step": 970,
 
980
  },
981
  {
982
  "epoch": 4.0,
983
+ "grad_norm": 0.6833075881004333,
984
  "learning_rate": 2.0000000000000003e-06,
985
+ "loss": 0.0708,
986
+ "max_memory_allocated (GB)": 63.75,
987
  "memory_allocated (GB)": 50.57,
988
  "step": 980,
989
  "total_memory_available (GB)": 94.62
990
  },
991
  {
992
  "epoch": 4.040816326530612,
993
+ "grad_norm": 0.5641142725944519,
994
  "learning_rate": 1.9183673469387756e-06,
995
+ "loss": 0.0481,
996
+ "max_memory_allocated (GB)": 63.75,
997
  "memory_allocated (GB)": 50.57,
998
  "step": 990,
999
  "total_memory_available (GB)": 94.62
1000
  },
1001
  {
1002
  "epoch": 4.081632653061225,
1003
+ "grad_norm": 0.8568029403686523,
1004
  "learning_rate": 1.8367346938775512e-06,
1005
+ "loss": 0.0626,
1006
+ "max_memory_allocated (GB)": 63.75,
1007
  "memory_allocated (GB)": 50.57,
1008
  "step": 1000,
1009
  "total_memory_available (GB)": 94.62
1010
  },
1011
  {
1012
  "epoch": 4.122448979591836,
1013
+ "grad_norm": 0.5912718772888184,
1014
  "learning_rate": 1.7551020408163267e-06,
1015
+ "loss": 0.0628,
1016
+ "max_memory_allocated (GB)": 63.75,
1017
  "memory_allocated (GB)": 50.57,
1018
  "step": 1010,
1019
  "total_memory_available (GB)": 94.62
1020
  },
1021
  {
1022
  "epoch": 4.163265306122449,
1023
+ "grad_norm": 0.3173392713069916,
1024
  "learning_rate": 1.6734693877551023e-06,
1025
+ "loss": 0.0402,
1026
+ "max_memory_allocated (GB)": 63.75,
1027
  "memory_allocated (GB)": 50.57,
1028
  "step": 1020,
1029
  "total_memory_available (GB)": 94.62
1030
  },
1031
  {
1032
  "epoch": 4.204081632653061,
1033
+ "grad_norm": 0.8902315497398376,
1034
  "learning_rate": 1.5918367346938775e-06,
1035
+ "loss": 0.0536,
1036
+ "max_memory_allocated (GB)": 63.75,
1037
  "memory_allocated (GB)": 50.57,
1038
  "step": 1030,
1039
  "total_memory_available (GB)": 94.62
1040
  },
1041
  {
1042
  "epoch": 4.244897959183674,
1043
+ "grad_norm": 0.5009722113609314,
1044
  "learning_rate": 1.5102040816326532e-06,
1045
  "loss": 0.0399,
1046
+ "max_memory_allocated (GB)": 63.75,
1047
  "memory_allocated (GB)": 50.57,
1048
  "step": 1040,
1049
  "total_memory_available (GB)": 94.62
1050
  },
1051
  {
1052
  "epoch": 4.285714285714286,
1053
+ "grad_norm": 1.8656221628189087,
1054
  "learning_rate": 1.4285714285714286e-06,
1055
+ "loss": 0.0499,
1056
+ "max_memory_allocated (GB)": 63.75,
1057
  "memory_allocated (GB)": 50.57,
1058
  "step": 1050,
1059
  "total_memory_available (GB)": 94.62
1060
  },
1061
  {
1062
  "epoch": 4.326530612244898,
1063
+ "grad_norm": 0.4257819950580597,
1064
  "learning_rate": 1.3469387755102043e-06,
1065
+ "loss": 0.0459,
1066
+ "max_memory_allocated (GB)": 63.75,
1067
  "memory_allocated (GB)": 50.57,
1068
  "step": 1060,
1069
  "total_memory_available (GB)": 94.62
1070
  },
1071
  {
1072
  "epoch": 4.36734693877551,
1073
+ "grad_norm": 0.5823583006858826,
1074
  "learning_rate": 1.2653061224489795e-06,
1075
+ "loss": 0.0488,
1076
+ "max_memory_allocated (GB)": 63.75,
1077
  "memory_allocated (GB)": 50.57,
1078
  "step": 1070,
1079
  "total_memory_available (GB)": 94.62
1080
  },
1081
  {
1082
  "epoch": 4.408163265306122,
1083
+ "grad_norm": 0.40693071484565735,
1084
  "learning_rate": 1.1836734693877552e-06,
1085
+ "loss": 0.0525,
1086
+ "max_memory_allocated (GB)": 63.75,
1087
  "memory_allocated (GB)": 50.57,
1088
  "step": 1080,
1089
  "total_memory_available (GB)": 94.62
1090
  },
1091
  {
1092
  "epoch": 4.448979591836735,
1093
+ "grad_norm": 1.8890392780303955,
1094
  "learning_rate": 1.1020408163265308e-06,
1095
+ "loss": 0.0512,
1096
+ "max_memory_allocated (GB)": 63.75,
1097
  "memory_allocated (GB)": 50.57,
1098
  "step": 1090,
1099
  "total_memory_available (GB)": 94.62
1100
  },
1101
  {
1102
  "epoch": 4.489795918367347,
1103
+ "grad_norm": 0.3593562841415405,
1104
  "learning_rate": 1.0204081632653063e-06,
1105
+ "loss": 0.0364,
1106
+ "max_memory_allocated (GB)": 63.75,
1107
  "memory_allocated (GB)": 50.57,
1108
  "step": 1100,
1109
  "total_memory_available (GB)": 94.62
1110
  },
1111
  {
1112
  "epoch": 4.530612244897959,
1113
+ "grad_norm": 0.1553877741098404,
1114
  "learning_rate": 9.387755102040817e-07,
1115
+ "loss": 0.0465,
1116
+ "max_memory_allocated (GB)": 63.75,
1117
  "memory_allocated (GB)": 50.57,
1118
  "step": 1110,
1119
  "total_memory_available (GB)": 94.62
1120
  },
1121
  {
1122
  "epoch": 4.571428571428571,
1123
+ "grad_norm": 0.6775248050689697,
1124
  "learning_rate": 8.571428571428572e-07,
1125
+ "loss": 0.0406,
1126
+ "max_memory_allocated (GB)": 63.75,
1127
  "memory_allocated (GB)": 50.57,
1128
  "step": 1120,
1129
  "total_memory_available (GB)": 94.62
1130
  },
1131
  {
1132
  "epoch": 4.612244897959184,
1133
+ "grad_norm": 0.5735678672790527,
1134
  "learning_rate": 7.755102040816327e-07,
1135
+ "loss": 0.0539,
1136
+ "max_memory_allocated (GB)": 63.75,
1137
  "memory_allocated (GB)": 50.57,
1138
  "step": 1130,
1139
  "total_memory_available (GB)": 94.62
1140
  },
1141
  {
1142
  "epoch": 4.653061224489796,
1143
+ "grad_norm": 0.7891528606414795,
1144
  "learning_rate": 6.938775510204082e-07,
1145
+ "loss": 0.0732,
1146
+ "max_memory_allocated (GB)": 63.75,
1147
  "memory_allocated (GB)": 50.57,
1148
  "step": 1140,
1149
  "total_memory_available (GB)": 94.62
1150
  },
1151
  {
1152
  "epoch": 4.6938775510204085,
1153
+ "grad_norm": 0.7845800518989563,
1154
  "learning_rate": 6.122448979591837e-07,
1155
+ "loss": 0.0515,
1156
+ "max_memory_allocated (GB)": 63.75,
1157
  "memory_allocated (GB)": 50.57,
1158
  "step": 1150,
1159
  "total_memory_available (GB)": 94.62
1160
  },
1161
  {
1162
  "epoch": 4.73469387755102,
1163
+ "grad_norm": 1.0361818075180054,
1164
  "learning_rate": 5.306122448979592e-07,
1165
+ "loss": 0.0608,
1166
+ "max_memory_allocated (GB)": 63.75,
1167
  "memory_allocated (GB)": 50.57,
1168
  "step": 1160,
1169
  "total_memory_available (GB)": 94.62
1170
  },
1171
  {
1172
  "epoch": 4.775510204081632,
1173
+ "grad_norm": 0.42603600025177,
1174
  "learning_rate": 4.489795918367347e-07,
1175
  "loss": 0.0549,
1176
+ "max_memory_allocated (GB)": 63.75,
1177
  "memory_allocated (GB)": 50.57,
1178
  "step": 1170,
1179
  "total_memory_available (GB)": 94.62
1180
  },
1181
  {
1182
  "epoch": 4.816326530612245,
1183
+ "grad_norm": 0.6727630496025085,
1184
  "learning_rate": 3.6734693877551025e-07,
1185
+ "loss": 0.0441,
1186
+ "max_memory_allocated (GB)": 63.75,
1187
  "memory_allocated (GB)": 50.57,
1188
  "step": 1180,
1189
  "total_memory_available (GB)": 94.62
1190
  },
1191
  {
1192
  "epoch": 4.857142857142857,
1193
+ "grad_norm": 0.8499141335487366,
1194
  "learning_rate": 2.8571428571428575e-07,
1195
+ "loss": 0.0544,
1196
+ "max_memory_allocated (GB)": 63.75,
1197
  "memory_allocated (GB)": 50.57,
1198
  "step": 1190,
1199
  "total_memory_available (GB)": 94.62
1200
  },
1201
  {
1202
  "epoch": 4.8979591836734695,
1203
+ "grad_norm": 0.7604736685752869,
1204
  "learning_rate": 2.0408163265306121e-07,
1205
  "loss": 0.0728,
1206
+ "max_memory_allocated (GB)": 63.75,
1207
  "memory_allocated (GB)": 50.57,
1208
  "step": 1200,
1209
  "total_memory_available (GB)": 94.62
1210
  },
1211
  {
1212
  "epoch": 4.938775510204081,
1213
+ "grad_norm": 1.0298157930374146,
1214
  "learning_rate": 1.2244897959183673e-07,
1215
+ "loss": 0.0654,
1216
+ "max_memory_allocated (GB)": 63.75,
1217
  "memory_allocated (GB)": 50.57,
1218
  "step": 1210,
1219
  "total_memory_available (GB)": 94.62
1220
  },
1221
  {
1222
  "epoch": 4.979591836734694,
1223
+ "grad_norm": 1.1890877485275269,
1224
  "learning_rate": 4.0816326530612253e-08,
1225
+ "loss": 0.0413,
1226
+ "max_memory_allocated (GB)": 63.75,
1227
  "memory_allocated (GB)": 50.57,
1228
  "step": 1220,
1229
  "total_memory_available (GB)": 94.62
1230
  },
1231
  {
1232
  "epoch": 5.0,
1233
+ "max_memory_allocated (GB)": 63.75,
1234
  "memory_allocated (GB)": 50.57,
1235
  "step": 1225,
1236
  "total_flos": 3.0598946525952e+16,
1237
  "total_memory_available (GB)": 94.62,
1238
+ "train_loss": 0.06080986156755564,
1239
+ "train_runtime": 1168.6251,
1240
+ "train_samples_per_second": 48.37,
1241
+ "train_steps_per_second": 1.21
1242
  }
1243
  ],
1244
  "logging_steps": 10,
validation_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "epoch": 5.0,
3
- "eval_loss": 0.12305960804224014,
4
- "eval_runtime": 31.7562,
5
- "eval_samples_per_second": 32.832,
6
- "eval_steps_per_second": 2.107,
7
- "max_memory_allocated (GB)": 57.18,
8
  "memory_allocated (GB)": 51.27,
9
  "total_memory_available (GB)": 94.62
10
  }
 
1
  {
2
  "epoch": 5.0,
3
+ "eval_loss": 0.13143940269947052,
4
+ "eval_runtime": 46.706,
5
+ "eval_samples_per_second": 27.822,
6
+ "eval_steps_per_second": 1.786,
7
+ "max_memory_allocated (GB)": 63.75,
8
  "memory_allocated (GB)": 51.27,
9
  "total_memory_available (GB)": 94.62
10
  }