chansung committed
Commit 9ca2da2
1 Parent(s): 3bc4329

Model save

README.md CHANGED
@@ -4,7 +4,6 @@ library_name: peft
 tags:
 - trl
 - sft
-- alignment-handbook
 - generated_from_trainer
 base_model: google/gemma-2b
 datasets:
@@ -21,7 +20,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 This model is a fine-tuned version of [google/gemma-2b](https://huggingface.co/google/gemma-2b) on the generator dataset.
 It achieves the following results on the evaluation set:
-- Loss: 2.6852
+- Loss: 2.7931
 
 ## Model description
 
@@ -41,14 +40,14 @@ More information needed
 
 The following hyperparameters were used during training:
 - learning_rate: 0.0002
-- train_batch_size: 16
-- eval_batch_size: 16
+- train_batch_size: 8
+- eval_batch_size: 8
 - seed: 42
 - distributed_type: multi-GPU
 - num_devices: 3
 - gradient_accumulation_steps: 2
-- total_train_batch_size: 96
-- total_eval_batch_size: 48
+- total_train_batch_size: 48
+- total_eval_batch_size: 24
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.1
@@ -58,16 +57,16 @@ The following hyperparameters were used during training:
 
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:-----:|:----:|:---------------:|
-| 1.2474 | 1.0 | 146 | 2.5237 |
-| 1.1269 | 2.0 | 292 | 2.4805 |
-| 1.0909 | 3.0 | 438 | 2.4893 |
-| 1.0354 | 4.0 | 584 | 2.5017 |
-| 1.0016 | 5.0 | 730 | 2.5295 |
-| 0.9823 | 6.0 | 876 | 2.5500 |
-| 0.955 | 7.0 | 1022 | 2.5866 |
-| 0.9214 | 8.0 | 1168 | 2.6224 |
-| 0.913 | 9.0 | 1314 | 2.6512 |
-| 0.889 | 10.0 | 1460 | 2.6852 |
+| 1.1808 | 1.0 | 146 | 2.4876 |
+| 1.0819 | 2.0 | 292 | 2.4820 |
+| 1.035 | 3.0 | 438 | 2.4995 |
+| 0.9796 | 4.0 | 584 | 2.5387 |
+| 0.9366 | 5.0 | 730 | 2.6038 |
+| 0.9051 | 6.0 | 876 | 2.6521 |
+| 0.8676 | 7.0 | 1022 | 2.7249 |
+| 0.8291 | 8.0 | 1168 | 2.7667 |
+| 0.8286 | 9.0 | 1314 | 2.7899 |
+| 0.8185 | 10.0 | 1460 | 2.7931 |
 
 
 ### Framework versions
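For reference, the total batch sizes in both versions of the card above are derived values: with train_batch_size/eval_batch_size taken per device, the training total is per-device batch × num_devices × gradient_accumulation_steps, and the evaluation total is per-device batch × num_devices. A minimal sketch of that arithmetic (illustrative only, not part of the commit):

```python
# Illustrative sketch of how the card's "total_*_batch_size" values follow
# from the other hyperparameters listed in the diff above.
def total_train_batch_size(per_device: int, num_devices: int, grad_accum: int) -> int:
    # Effective optimizer batch = per-device batch x devices x gradient accumulation.
    return per_device * num_devices * grad_accum

def total_eval_batch_size(per_device: int, num_devices: int) -> int:
    # Evaluation does no gradient accumulation, so only the device count multiplies in.
    return per_device * num_devices

# Old card: 16 per device -> 16 * 3 * 2 = 96 train, 16 * 3 = 48 eval.
assert total_train_batch_size(16, 3, 2) == 96 and total_eval_batch_size(16, 3) == 48
# New card: 8 per device -> 8 * 3 * 2 = 48 train, 8 * 3 = 24 eval.
assert total_train_batch_size(8, 3, 2) == 48 and total_eval_batch_size(8, 3) == 24
```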
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b7177a5d231842fc98b177d3bcfe6276948248bfce196e34c35c5d4f45b3d67a
+oid sha256:d0adbe9f8822dd528c487486d169b1cdc0db793f6c22d58013d09a1a98fa248f
 size 78480320
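The adapter_model.safetensors entry above is a Git LFS pointer file rather than the weights themselves: this commit changes only the content hash (oid), while the payload size stays at 78480320 bytes. A minimal sketch of reading such a pointer, assuming the three-line layout shown in the diff (this helper is hypothetical, not part of the repository):

```python
# Hypothetical helper: parse the three-line Git LFS pointer shown in the diff above.
def parse_lfs_pointer(text: str) -> dict:
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")  # e.g. "size 78480320" -> ("size", "78480320")
        fields[key] = value
    return fields

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:d0adbe9f8822dd528c487486d169b1cdc0db793f6c22d58013d09a1a98fa248f
size 78480320"""

info = parse_lfs_pointer(pointer)
assert info["size"] == "78480320"  # adapter payload size is unchanged by this commit
```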
all_results.json CHANGED
@@ -1,9 +1,9 @@
 {
-    "epoch": 10.273972602739725,
-    "total_flos": 8.853977907740017e+17,
-    "train_loss": 0.0,
-    "train_runtime": 3.0171,
+    "epoch": 10.0,
+    "total_flos": 8.702314001108828e+17,
+    "train_loss": 0.9981180969982931,
+    "train_runtime": 8741.0304,
     "train_samples": 64610,
-    "train_samples_per_second": 23207.393,
-    "train_steps_per_second": 483.902
+    "train_samples_per_second": 8.01,
+    "train_steps_per_second": 0.167
 }
runs/Jun10_15-40-50_user-HP-Z8-Fury-G5-Workstation-Desktop-PC/events.out.tfevents.1718001669.user-HP-Z8-Fury-G5-Workstation-Desktop-PC.6920.0 CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0c4ee272b638040bcce47df09360631ae68d01899306c9a6e72b3d05da89168a
-size 67223
+oid sha256:f8da717ead12a274f7d00aab0a96423a7266dcdb7b22fc03364f0e6a82add57b
+size 70380
train_results.json CHANGED
@@ -1,9 +1,9 @@
 {
-    "epoch": 10.273972602739725,
-    "total_flos": 8.853977907740017e+17,
-    "train_loss": 0.0,
-    "train_runtime": 3.0171,
+    "epoch": 10.0,
+    "total_flos": 8.702314001108828e+17,
+    "train_loss": 0.9981180969982931,
+    "train_runtime": 8741.0304,
     "train_samples": 64610,
-    "train_samples_per_second": 23207.393,
-    "train_steps_per_second": 483.902
+    "train_samples_per_second": 8.01,
+    "train_steps_per_second": 0.167
 }
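As a rough sanity check, the new throughput figures are consistent with the run length: 1460 optimizer steps (10 epochs × 146 steps, per the README results table) over a runtime of 8741.0304 s gives the reported 0.167 steps per second, and multiplying by the total train batch size of 48 lands close to the reported 8.01 samples per second. A minimal sketch of that arithmetic, using only values that appear in the diffs above:

```python
# Sanity-check sketch (illustrative only) tying together numbers from the diffs above.
train_runtime = 8741.0304        # seconds, from train_results.json
global_step = 1460               # 10 epochs x 146 steps, from the README results table
total_train_batch_size = 48      # from the updated model card

steps_per_second = global_step / train_runtime
samples_per_second = steps_per_second * total_train_batch_size

print(round(steps_per_second, 3))   # 0.167 -> matches "train_steps_per_second"
print(round(samples_per_second, 2)) # ~8.02 -> close to the reported 8.01
```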
trainer_state.json CHANGED
@@ -1,2208 +1,2152 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 10.273972602739725,
5
  "eval_steps": 500,
6
- "global_step": 1500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.00684931506849315,
13
- "grad_norm": 3.5625,
14
- "learning_rate": 9.132420091324201e-07,
15
  "loss": 3.0017,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.03424657534246575,
20
- "grad_norm": 2.9375,
21
- "learning_rate": 4.566210045662101e-06,
22
- "loss": 3.0725,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.0684931506849315,
27
- "grad_norm": 3.078125,
28
- "learning_rate": 9.132420091324201e-06,
29
- "loss": 3.0374,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.10273972602739725,
34
- "grad_norm": 2.515625,
35
- "learning_rate": 1.3698630136986302e-05,
36
- "loss": 3.0044,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.136986301369863,
41
- "grad_norm": 2.3125,
42
- "learning_rate": 1.8264840182648402e-05,
43
- "loss": 2.9373,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.17123287671232876,
48
- "grad_norm": 4.90625,
49
- "learning_rate": 2.2831050228310503e-05,
50
- "loss": 2.7849,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.2054794520547945,
55
- "grad_norm": 17.0,
56
- "learning_rate": 2.7397260273972603e-05,
57
- "loss": 2.6263,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.23972602739726026,
62
- "grad_norm": 1.0859375,
63
- "learning_rate": 3.1963470319634704e-05,
64
- "loss": 2.4603,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.273972602739726,
69
- "grad_norm": 1.75,
70
- "learning_rate": 3.6529680365296805e-05,
71
- "loss": 2.3423,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.3082191780821918,
76
- "grad_norm": 3.0,
77
- "learning_rate": 4.1095890410958905e-05,
78
- "loss": 2.2364,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.3424657534246575,
83
- "grad_norm": 1.0546875,
84
- "learning_rate": 4.5662100456621006e-05,
85
- "loss": 2.0795,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.3767123287671233,
90
- "grad_norm": 1.734375,
91
- "learning_rate": 5.0228310502283106e-05,
92
- "loss": 1.9497,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.410958904109589,
97
- "grad_norm": 1.25,
98
- "learning_rate": 5.479452054794521e-05,
99
- "loss": 1.8556,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.4452054794520548,
104
- "grad_norm": 0.640625,
105
- "learning_rate": 5.936073059360731e-05,
106
- "loss": 1.759,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.4794520547945205,
111
- "grad_norm": 0.97265625,
112
- "learning_rate": 6.392694063926941e-05,
113
- "loss": 1.6773,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.5136986301369864,
118
- "grad_norm": 1.9296875,
119
- "learning_rate": 6.84931506849315e-05,
120
- "loss": 1.6105,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.547945205479452,
125
- "grad_norm": 0.51171875,
126
- "learning_rate": 7.305936073059361e-05,
127
- "loss": 1.5517,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.5821917808219178,
132
- "grad_norm": 0.45703125,
133
- "learning_rate": 7.76255707762557e-05,
134
- "loss": 1.4895,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.6164383561643836,
139
- "grad_norm": 0.326171875,
140
- "learning_rate": 8.219178082191781e-05,
141
- "loss": 1.466,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.6506849315068494,
146
- "grad_norm": 0.283203125,
147
- "learning_rate": 8.67579908675799e-05,
148
- "loss": 1.4237,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.684931506849315,
153
- "grad_norm": 0.333984375,
154
- "learning_rate": 9.132420091324201e-05,
155
- "loss": 1.3836,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.7191780821917808,
160
- "grad_norm": 0.578125,
161
- "learning_rate": 9.58904109589041e-05,
162
- "loss": 1.3655,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.7534246575342466,
167
- "grad_norm": 0.484375,
168
- "learning_rate": 0.00010045662100456621,
169
- "loss": 1.3369,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.7876712328767124,
174
- "grad_norm": 0.3671875,
175
- "learning_rate": 0.00010502283105022832,
176
- "loss": 1.3149,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.821917808219178,
181
- "grad_norm": 0.9765625,
182
- "learning_rate": 0.00010958904109589041,
183
- "loss": 1.3051,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.8561643835616438,
188
- "grad_norm": 0.74609375,
189
- "learning_rate": 0.00011415525114155252,
190
- "loss": 1.2835,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.8904109589041096,
195
- "grad_norm": 0.271484375,
196
- "learning_rate": 0.00011872146118721462,
197
- "loss": 1.2805,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.9246575342465754,
202
- "grad_norm": 0.82421875,
203
- "learning_rate": 0.0001232876712328767,
204
- "loss": 1.2617,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.958904109589041,
209
- "grad_norm": 0.498046875,
210
- "learning_rate": 0.00012785388127853882,
211
- "loss": 1.2659,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.9931506849315068,
216
- "grad_norm": 0.28125,
217
- "learning_rate": 0.00013242009132420092,
218
- "loss": 1.2474,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 1.0,
223
- "eval_loss": 2.523677110671997,
224
- "eval_runtime": 0.5573,
225
- "eval_samples_per_second": 17.944,
226
- "eval_steps_per_second": 1.794,
227
  "step": 146
228
  },
229
  {
230
  "epoch": 1.0273972602739727,
231
- "grad_norm": 0.58984375,
232
- "learning_rate": 0.000136986301369863,
233
- "loss": 1.2351,
234
  "step": 150
235
  },
236
  {
237
  "epoch": 1.0616438356164384,
238
- "grad_norm": 0.5234375,
239
- "learning_rate": 0.0001415525114155251,
240
- "loss": 1.2256,
241
  "step": 155
242
  },
243
  {
244
  "epoch": 1.095890410958904,
245
- "grad_norm": 0.55859375,
246
- "learning_rate": 0.00014611872146118722,
247
- "loss": 1.2203,
248
  "step": 160
249
  },
250
  {
251
  "epoch": 1.13013698630137,
252
- "grad_norm": 0.35546875,
253
- "learning_rate": 0.00015068493150684933,
254
- "loss": 1.1994,
255
  "step": 165
256
  },
257
  {
258
  "epoch": 1.1643835616438356,
259
- "grad_norm": 0.345703125,
260
- "learning_rate": 0.0001552511415525114,
261
- "loss": 1.2069,
262
  "step": 170
263
  },
264
  {
265
  "epoch": 1.1986301369863013,
266
- "grad_norm": 0.412109375,
267
- "learning_rate": 0.00015981735159817351,
268
- "loss": 1.1912,
269
  "step": 175
270
  },
271
  {
272
  "epoch": 1.2328767123287672,
273
- "grad_norm": 0.365234375,
274
- "learning_rate": 0.00016438356164383562,
275
- "loss": 1.1879,
276
  "step": 180
277
  },
278
  {
279
  "epoch": 1.2671232876712328,
280
- "grad_norm": 0.42578125,
281
- "learning_rate": 0.00016894977168949773,
282
- "loss": 1.1983,
283
  "step": 185
284
  },
285
  {
286
  "epoch": 1.3013698630136985,
287
- "grad_norm": 0.63671875,
288
- "learning_rate": 0.0001735159817351598,
289
- "loss": 1.1872,
290
  "step": 190
291
  },
292
  {
293
  "epoch": 1.3356164383561644,
294
- "grad_norm": 0.376953125,
295
- "learning_rate": 0.00017808219178082192,
296
- "loss": 1.1806,
297
  "step": 195
298
  },
299
  {
300
  "epoch": 1.36986301369863,
301
- "grad_norm": 1.1640625,
302
- "learning_rate": 0.00018264840182648402,
303
- "loss": 1.1849,
304
  "step": 200
305
  },
306
  {
307
  "epoch": 1.404109589041096,
308
- "grad_norm": 1.046875,
309
- "learning_rate": 0.00018721461187214613,
310
- "loss": 1.1782,
311
  "step": 205
312
  },
313
  {
314
  "epoch": 1.4383561643835616,
315
- "grad_norm": 0.373046875,
316
- "learning_rate": 0.0001917808219178082,
317
- "loss": 1.1727,
318
  "step": 210
319
  },
320
  {
321
  "epoch": 1.4726027397260273,
322
- "grad_norm": 0.482421875,
323
- "learning_rate": 0.00019634703196347032,
324
- "loss": 1.1725,
325
  "step": 215
326
  },
327
  {
328
  "epoch": 1.5068493150684932,
329
- "grad_norm": 0.80859375,
330
- "learning_rate": 0.00019999987297289245,
331
- "loss": 1.1611,
332
  "step": 220
333
  },
334
  {
335
  "epoch": 1.541095890410959,
336
- "grad_norm": 0.56640625,
337
- "learning_rate": 0.00019999542705801296,
338
- "loss": 1.1642,
339
  "step": 225
340
  },
341
  {
342
  "epoch": 1.5753424657534247,
343
- "grad_norm": 0.361328125,
344
- "learning_rate": 0.00019998463011046926,
345
- "loss": 1.1608,
346
  "step": 230
347
  },
348
  {
349
  "epoch": 1.6095890410958904,
350
- "grad_norm": 0.76953125,
351
- "learning_rate": 0.00019996748281601038,
352
- "loss": 1.1563,
353
  "step": 235
354
  },
355
  {
356
  "epoch": 1.643835616438356,
357
- "grad_norm": 0.388671875,
358
- "learning_rate": 0.00019994398626371643,
359
- "loss": 1.1457,
360
  "step": 240
361
  },
362
  {
363
  "epoch": 1.678082191780822,
364
- "grad_norm": 0.45703125,
365
- "learning_rate": 0.0001999141419459293,
366
- "loss": 1.1609,
367
  "step": 245
368
  },
369
  {
370
  "epoch": 1.7123287671232876,
371
- "grad_norm": 0.70703125,
372
- "learning_rate": 0.00019987795175815807,
373
- "loss": 1.1479,
374
  "step": 250
375
  },
376
  {
377
  "epoch": 1.7465753424657535,
378
- "grad_norm": 0.451171875,
379
- "learning_rate": 0.0001998354179989585,
380
- "loss": 1.148,
381
  "step": 255
382
  },
383
  {
384
  "epoch": 1.7808219178082192,
385
- "grad_norm": 0.421875,
386
- "learning_rate": 0.0001997865433697871,
387
- "loss": 1.1513,
388
  "step": 260
389
  },
390
  {
391
  "epoch": 1.8150684931506849,
392
- "grad_norm": 0.64453125,
393
- "learning_rate": 0.00019973133097482947,
394
- "loss": 1.1327,
395
  "step": 265
396
  },
397
  {
398
  "epoch": 1.8493150684931505,
399
- "grad_norm": 0.326171875,
400
- "learning_rate": 0.00019966978432080316,
401
- "loss": 1.1424,
402
  "step": 270
403
  },
404
  {
405
  "epoch": 1.8835616438356164,
406
- "grad_norm": 0.4375,
407
- "learning_rate": 0.00019960190731673505,
408
- "loss": 1.1387,
409
  "step": 275
410
  },
411
  {
412
  "epoch": 1.9178082191780823,
413
- "grad_norm": 0.34765625,
414
- "learning_rate": 0.00019952770427371304,
415
- "loss": 1.1258,
416
  "step": 280
417
  },
418
  {
419
  "epoch": 1.952054794520548,
420
- "grad_norm": 0.447265625,
421
- "learning_rate": 0.00019944717990461207,
422
- "loss": 1.1226,
423
  "step": 285
424
  },
425
  {
426
  "epoch": 1.9863013698630136,
427
- "grad_norm": 0.427734375,
428
- "learning_rate": 0.00019936033932379504,
429
- "loss": 1.1269,
430
  "step": 290
431
  },
432
  {
433
  "epoch": 2.0,
434
- "eval_loss": 2.4804677963256836,
435
- "eval_runtime": 0.5614,
436
- "eval_samples_per_second": 17.814,
437
- "eval_steps_per_second": 1.781,
438
  "step": 292
439
  },
440
  {
441
  "epoch": 2.0205479452054793,
442
- "grad_norm": 0.4609375,
443
- "learning_rate": 0.00019926718804678785,
444
- "loss": 1.1225,
445
  "step": 295
446
  },
447
  {
448
  "epoch": 2.0547945205479454,
449
- "grad_norm": 0.435546875,
450
- "learning_rate": 0.000199167731989929,
451
- "loss": 1.1022,
452
  "step": 300
453
  },
454
  {
455
  "epoch": 2.089041095890411,
456
- "grad_norm": 0.4140625,
457
- "learning_rate": 0.00019906197746999408,
458
- "loss": 1.1012,
459
  "step": 305
460
  },
461
  {
462
  "epoch": 2.1232876712328768,
463
- "grad_norm": 0.3515625,
464
- "learning_rate": 0.00019894993120379435,
465
- "loss": 1.0928,
466
  "step": 310
467
  },
468
  {
469
  "epoch": 2.1575342465753424,
470
- "grad_norm": 0.43359375,
471
- "learning_rate": 0.00019883160030775016,
472
- "loss": 1.1032,
473
  "step": 315
474
  },
475
  {
476
  "epoch": 2.191780821917808,
477
- "grad_norm": 0.96484375,
478
- "learning_rate": 0.00019870699229743911,
479
- "loss": 1.0966,
480
  "step": 320
481
  },
482
  {
483
  "epoch": 2.2260273972602738,
484
- "grad_norm": 0.81640625,
485
- "learning_rate": 0.0001985761150871185,
486
- "loss": 1.0952,
487
  "step": 325
488
  },
489
  {
490
  "epoch": 2.26027397260274,
491
- "grad_norm": 0.462890625,
492
- "learning_rate": 0.00019843897698922284,
493
- "loss": 1.0936,
494
  "step": 330
495
  },
496
  {
497
  "epoch": 2.2945205479452055,
498
- "grad_norm": 0.3203125,
499
- "learning_rate": 0.00019829558671383585,
500
- "loss": 1.0938,
501
  "step": 335
502
  },
503
  {
504
  "epoch": 2.328767123287671,
505
- "grad_norm": 0.494140625,
506
- "learning_rate": 0.00019814595336813725,
507
- "loss": 1.0856,
508
  "step": 340
509
  },
510
  {
511
  "epoch": 2.363013698630137,
512
- "grad_norm": 0.353515625,
513
- "learning_rate": 0.0001979900864558242,
514
- "loss": 1.0851,
515
  "step": 345
516
  },
517
  {
518
  "epoch": 2.3972602739726026,
519
- "grad_norm": 0.3359375,
520
- "learning_rate": 0.00019782799587650805,
521
- "loss": 1.1018,
522
  "step": 350
523
  },
524
  {
525
  "epoch": 2.4315068493150687,
526
- "grad_norm": 0.39453125,
527
- "learning_rate": 0.00019765969192508508,
528
- "loss": 1.0882,
529
  "step": 355
530
  },
531
  {
532
  "epoch": 2.4657534246575343,
533
- "grad_norm": 0.341796875,
534
- "learning_rate": 0.00019748518529108316,
535
- "loss": 1.0932,
536
  "step": 360
537
  },
538
  {
539
  "epoch": 2.5,
540
- "grad_norm": 0.404296875,
541
- "learning_rate": 0.00019730448705798239,
542
- "loss": 1.0945,
543
  "step": 365
544
  },
545
  {
546
  "epoch": 2.5342465753424657,
547
- "grad_norm": 0.35546875,
548
- "learning_rate": 0.00019711760870251143,
549
- "loss": 1.0881,
550
  "step": 370
551
  },
552
  {
553
  "epoch": 2.5684931506849313,
554
- "grad_norm": 0.40234375,
555
- "learning_rate": 0.00019692456209391846,
556
- "loss": 1.0802,
557
  "step": 375
558
  },
559
  {
560
  "epoch": 2.602739726027397,
561
- "grad_norm": 0.52734375,
562
- "learning_rate": 0.0001967253594932173,
563
- "loss": 1.0822,
564
  "step": 380
565
  },
566
  {
567
  "epoch": 2.636986301369863,
568
- "grad_norm": 0.337890625,
569
- "learning_rate": 0.00019652001355240878,
570
- "loss": 1.0907,
571
  "step": 385
572
  },
573
  {
574
  "epoch": 2.671232876712329,
575
- "grad_norm": 0.373046875,
576
- "learning_rate": 0.00019630853731367713,
577
- "loss": 1.0868,
578
  "step": 390
579
  },
580
  {
581
  "epoch": 2.7054794520547945,
582
- "grad_norm": 0.40234375,
583
- "learning_rate": 0.0001960909442085615,
584
- "loss": 1.086,
585
  "step": 395
586
  },
587
  {
588
  "epoch": 2.73972602739726,
589
- "grad_norm": 0.384765625,
590
- "learning_rate": 0.00019586724805710306,
591
- "loss": 1.0746,
592
  "step": 400
593
  },
594
  {
595
  "epoch": 2.7739726027397262,
596
- "grad_norm": 0.353515625,
597
- "learning_rate": 0.0001956374630669672,
598
- "loss": 1.0832,
599
  "step": 405
600
  },
601
  {
602
  "epoch": 2.808219178082192,
603
- "grad_norm": 0.34765625,
604
- "learning_rate": 0.00019540160383254107,
605
- "loss": 1.0753,
606
  "step": 410
607
  },
608
  {
609
  "epoch": 2.8424657534246576,
610
- "grad_norm": 0.328125,
611
- "learning_rate": 0.00019515968533400673,
612
- "loss": 1.0844,
613
  "step": 415
614
  },
615
  {
616
  "epoch": 2.8767123287671232,
617
- "grad_norm": 0.34765625,
618
- "learning_rate": 0.00019491172293638968,
619
- "loss": 1.083,
620
  "step": 420
621
  },
622
  {
623
  "epoch": 2.910958904109589,
624
- "grad_norm": 0.369140625,
625
- "learning_rate": 0.00019465773238858298,
626
- "loss": 1.0757,
627
  "step": 425
628
  },
629
  {
630
  "epoch": 2.9452054794520546,
631
- "grad_norm": 0.56640625,
632
- "learning_rate": 0.00019439772982234697,
633
- "loss": 1.075,
634
  "step": 430
635
  },
636
  {
637
  "epoch": 2.9794520547945207,
638
- "grad_norm": 3.71875,
639
- "learning_rate": 0.00019413173175128473,
640
- "loss": 1.0909,
641
  "step": 435
642
  },
643
  {
644
  "epoch": 3.0,
645
- "eval_loss": 2.4892916679382324,
646
- "eval_runtime": 0.5522,
647
- "eval_samples_per_second": 18.108,
648
- "eval_steps_per_second": 1.811,
649
  "step": 438
650
  },
651
  {
652
  "epoch": 3.0136986301369864,
653
- "grad_norm": 1.3046875,
654
- "learning_rate": 0.0001938597550697932,
655
- "loss": 1.0635,
656
  "step": 440
657
  },
658
  {
659
  "epoch": 3.047945205479452,
660
- "grad_norm": 0.3984375,
661
- "learning_rate": 0.00019358181705199015,
662
- "loss": 1.0518,
663
  "step": 445
664
  },
665
  {
666
  "epoch": 3.0821917808219177,
667
- "grad_norm": 0.369140625,
668
- "learning_rate": 0.00019329793535061723,
669
- "loss": 1.0509,
670
  "step": 450
671
  },
672
  {
673
  "epoch": 3.1164383561643834,
674
- "grad_norm": 0.412109375,
675
- "learning_rate": 0.00019300812799591846,
676
- "loss": 1.0529,
677
  "step": 455
678
  },
679
  {
680
  "epoch": 3.1506849315068495,
681
- "grad_norm": 0.66015625,
682
- "learning_rate": 0.00019271241339449536,
683
- "loss": 1.0416,
684
  "step": 460
685
  },
686
  {
687
  "epoch": 3.184931506849315,
688
- "grad_norm": 0.89453125,
689
- "learning_rate": 0.00019241081032813772,
690
- "loss": 1.0488,
691
  "step": 465
692
  },
693
  {
694
  "epoch": 3.219178082191781,
695
- "grad_norm": 0.55078125,
696
- "learning_rate": 0.00019210333795263075,
697
- "loss": 1.0402,
698
  "step": 470
699
  },
700
  {
701
  "epoch": 3.2534246575342465,
702
- "grad_norm": 0.73046875,
703
- "learning_rate": 0.00019179001579653853,
704
- "loss": 1.0568,
705
  "step": 475
706
  },
707
  {
708
  "epoch": 3.287671232876712,
709
- "grad_norm": 1.0390625,
710
- "learning_rate": 0.0001914708637599636,
711
- "loss": 1.0487,
712
  "step": 480
713
  },
714
  {
715
  "epoch": 3.3219178082191783,
716
- "grad_norm": 0.400390625,
717
- "learning_rate": 0.00019114590211328288,
718
- "loss": 1.0468,
719
  "step": 485
720
  },
721
  {
722
  "epoch": 3.356164383561644,
723
- "grad_norm": 0.439453125,
724
- "learning_rate": 0.0001908151514958606,
725
- "loss": 1.0538,
726
  "step": 490
727
  },
728
  {
729
  "epoch": 3.3904109589041096,
730
- "grad_norm": 0.37109375,
731
- "learning_rate": 0.00019047863291473717,
732
- "loss": 1.0441,
733
  "step": 495
734
  },
735
  {
736
  "epoch": 3.4246575342465753,
737
- "grad_norm": 0.34765625,
738
- "learning_rate": 0.00019013636774329495,
739
- "loss": 1.0521,
740
  "step": 500
741
  },
742
  {
743
  "epoch": 3.458904109589041,
744
- "grad_norm": 0.4140625,
745
- "learning_rate": 0.00018978837771990085,
746
- "loss": 1.0405,
747
  "step": 505
748
  },
749
  {
750
  "epoch": 3.493150684931507,
751
- "grad_norm": 0.4375,
752
- "learning_rate": 0.0001894346849465257,
753
- "loss": 1.0439,
754
  "step": 510
755
  },
756
  {
757
  "epoch": 3.5273972602739727,
758
- "grad_norm": 0.349609375,
759
- "learning_rate": 0.00018907531188734026,
760
- "loss": 1.0525,
761
  "step": 515
762
  },
763
  {
764
  "epoch": 3.5616438356164384,
765
- "grad_norm": 0.47265625,
766
- "learning_rate": 0.00018871028136728874,
767
- "loss": 1.0493,
768
  "step": 520
769
  },
770
  {
771
  "epoch": 3.595890410958904,
772
- "grad_norm": 0.35546875,
773
- "learning_rate": 0.00018833961657063885,
774
- "loss": 1.0405,
775
  "step": 525
776
  },
777
  {
778
  "epoch": 3.6301369863013697,
779
- "grad_norm": 0.50390625,
780
- "learning_rate": 0.0001879633410395095,
781
- "loss": 1.0452,
782
  "step": 530
783
  },
784
  {
785
  "epoch": 3.6643835616438354,
786
- "grad_norm": 0.34765625,
787
- "learning_rate": 0.00018758147867237548,
788
- "loss": 1.0515,
789
  "step": 535
790
  },
791
  {
792
  "epoch": 3.6986301369863015,
793
- "grad_norm": 0.421875,
794
- "learning_rate": 0.00018719405372254948,
795
- "loss": 1.0453,
796
  "step": 540
797
  },
798
  {
799
  "epoch": 3.732876712328767,
800
- "grad_norm": 0.3359375,
801
- "learning_rate": 0.00018680109079664188,
802
- "loss": 1.0356,
803
  "step": 545
804
  },
805
  {
806
  "epoch": 3.767123287671233,
807
- "grad_norm": 0.333984375,
808
- "learning_rate": 0.0001864026148529978,
809
- "loss": 1.0355,
810
  "step": 550
811
  },
812
  {
813
  "epoch": 3.8013698630136985,
814
- "grad_norm": 0.427734375,
815
- "learning_rate": 0.00018599865120011192,
816
- "loss": 1.0452,
817
  "step": 555
818
  },
819
  {
820
  "epoch": 3.8356164383561646,
821
- "grad_norm": 0.34375,
822
- "learning_rate": 0.00018558922549502107,
823
- "loss": 1.0258,
824
  "step": 560
825
  },
826
  {
827
  "epoch": 3.8698630136986303,
828
- "grad_norm": 0.412109375,
829
- "learning_rate": 0.0001851743637416747,
830
- "loss": 1.0423,
831
  "step": 565
832
  },
833
  {
834
  "epoch": 3.904109589041096,
835
- "grad_norm": 0.31640625,
836
- "learning_rate": 0.00018475409228928312,
837
- "loss": 1.0238,
838
  "step": 570
839
  },
840
  {
841
  "epoch": 3.9383561643835616,
842
- "grad_norm": 0.400390625,
843
- "learning_rate": 0.00018432843783064429,
844
- "loss": 1.041,
845
  "step": 575
846
  },
847
  {
848
  "epoch": 3.9726027397260273,
849
- "grad_norm": 0.412109375,
850
- "learning_rate": 0.00018389742740044813,
851
- "loss": 1.0354,
852
  "step": 580
853
  },
854
  {
855
  "epoch": 4.0,
856
- "eval_loss": 2.5017333030700684,
857
- "eval_runtime": 0.5568,
858
- "eval_samples_per_second": 17.961,
859
- "eval_steps_per_second": 1.796,
860
  "step": 584
861
  },
862
  {
863
  "epoch": 4.006849315068493,
864
- "grad_norm": 0.52734375,
865
- "learning_rate": 0.00018346108837355972,
866
- "loss": 1.0411,
867
  "step": 585
868
  },
869
  {
870
  "epoch": 4.041095890410959,
871
- "grad_norm": 0.41796875,
872
- "learning_rate": 0.00018301944846328049,
873
- "loss": 0.9963,
874
  "step": 590
875
  },
876
  {
877
  "epoch": 4.075342465753424,
878
- "grad_norm": 0.36328125,
879
- "learning_rate": 0.0001825725357195881,
880
- "loss": 1.0137,
881
  "step": 595
882
  },
883
  {
884
  "epoch": 4.109589041095891,
885
- "grad_norm": 0.48046875,
886
- "learning_rate": 0.00018212037852735486,
887
- "loss": 1.006,
888
  "step": 600
889
  },
890
  {
891
  "epoch": 4.1438356164383565,
892
- "grad_norm": 0.4140625,
893
- "learning_rate": 0.0001816630056045451,
894
- "loss": 1.0075,
895
  "step": 605
896
  },
897
  {
898
  "epoch": 4.178082191780822,
899
- "grad_norm": 0.353515625,
900
- "learning_rate": 0.0001812004460003909,
901
- "loss": 0.9975,
902
  "step": 610
903
  },
904
  {
905
  "epoch": 4.212328767123288,
906
- "grad_norm": 0.365234375,
907
- "learning_rate": 0.00018073272909354727,
908
- "loss": 1.0171,
909
  "step": 615
910
  },
911
  {
912
  "epoch": 4.2465753424657535,
913
- "grad_norm": 0.51171875,
914
- "learning_rate": 0.0001802598845902262,
915
- "loss": 0.9953,
916
  "step": 620
917
  },
918
  {
919
  "epoch": 4.280821917808219,
920
- "grad_norm": 0.38671875,
921
- "learning_rate": 0.00017978194252230985,
922
- "loss": 1.008,
923
  "step": 625
924
  },
925
  {
926
  "epoch": 4.315068493150685,
927
- "grad_norm": 0.359375,
928
- "learning_rate": 0.00017929893324544332,
929
- "loss": 0.9993,
930
  "step": 630
931
  },
932
  {
933
  "epoch": 4.3493150684931505,
934
- "grad_norm": 0.56640625,
935
- "learning_rate": 0.0001788108874371063,
936
- "loss": 1.0119,
937
  "step": 635
938
  },
939
  {
940
  "epoch": 4.383561643835616,
941
- "grad_norm": 0.33203125,
942
- "learning_rate": 0.00017831783609466504,
943
- "loss": 1.0047,
944
  "step": 640
945
  },
946
  {
947
  "epoch": 4.417808219178082,
948
- "grad_norm": 0.341796875,
949
- "learning_rate": 0.00017781981053340337,
950
- "loss": 1.0143,
951
  "step": 645
952
  },
953
  {
954
  "epoch": 4.4520547945205475,
955
- "grad_norm": 0.345703125,
956
- "learning_rate": 0.00017731684238453385,
957
- "loss": 1.0023,
958
  "step": 650
959
  },
960
  {
961
  "epoch": 4.486301369863014,
962
- "grad_norm": 0.37890625,
963
- "learning_rate": 0.0001768089635931887,
964
- "loss": 1.0125,
965
  "step": 655
966
  },
967
  {
968
  "epoch": 4.52054794520548,
969
- "grad_norm": 0.609375,
970
- "learning_rate": 0.00017629620641639103,
971
- "loss": 1.0074,
972
  "step": 660
973
  },
974
  {
975
  "epoch": 4.554794520547945,
976
- "grad_norm": 0.36328125,
977
- "learning_rate": 0.00017577860342100579,
978
- "loss": 1.0124,
979
  "step": 665
980
  },
981
  {
982
  "epoch": 4.589041095890411,
983
- "grad_norm": 0.65625,
984
- "learning_rate": 0.0001752561874816717,
985
- "loss": 1.015,
986
  "step": 670
987
  },
988
  {
989
  "epoch": 4.623287671232877,
990
- "grad_norm": 0.38671875,
991
- "learning_rate": 0.00017472899177871297,
992
- "loss": 1.0066,
993
  "step": 675
994
  },
995
  {
996
  "epoch": 4.657534246575342,
997
- "grad_norm": 0.32421875,
998
- "learning_rate": 0.00017419704979603214,
999
- "loss": 1.0182,
1000
  "step": 680
1001
  },
1002
  {
1003
  "epoch": 4.691780821917808,
1004
- "grad_norm": 0.34375,
1005
- "learning_rate": 0.00017366039531898326,
1006
- "loss": 1.0139,
1007
  "step": 685
1008
  },
1009
  {
1010
  "epoch": 4.726027397260274,
1011
- "grad_norm": 0.349609375,
1012
- "learning_rate": 0.00017311906243222614,
1013
- "loss": 1.0162,
1014
  "step": 690
1015
  },
1016
  {
1017
  "epoch": 4.760273972602739,
1018
- "grad_norm": 0.3359375,
1019
- "learning_rate": 0.0001725730855175615,
1020
- "loss": 1.019,
1021
  "step": 695
1022
  },
1023
  {
1024
  "epoch": 4.794520547945205,
1025
- "grad_norm": 0.431640625,
1026
- "learning_rate": 0.00017202249925174723,
1027
- "loss": 1.0051,
1028
  "step": 700
1029
  },
1030
  {
1031
  "epoch": 4.828767123287671,
1032
- "grad_norm": 0.4140625,
1033
- "learning_rate": 0.00017146733860429612,
1034
- "loss": 1.0174,
1035
  "step": 705
1036
  },
1037
  {
1038
  "epoch": 4.863013698630137,
1039
- "grad_norm": 0.408203125,
1040
- "learning_rate": 0.0001709076388352546,
1041
- "loss": 1.0065,
1042
  "step": 710
1043
  },
1044
  {
1045
  "epoch": 4.897260273972603,
1046
  "grad_norm": 0.359375,
1047
- "learning_rate": 0.00017034343549296346,
1048
- "loss": 1.0262,
1049
  "step": 715
1050
  },
1051
  {
1052
  "epoch": 4.931506849315069,
1053
- "grad_norm": 0.44140625,
1054
- "learning_rate": 0.00016977476441179992,
1055
- "loss": 1.0023,
1056
  "step": 720
1057
  },
1058
  {
1059
  "epoch": 4.965753424657534,
1060
- "grad_norm": 0.357421875,
1061
- "learning_rate": 0.0001692016617099018,
1062
- "loss": 1.0048,
1063
  "step": 725
1064
  },
1065
  {
1066
  "epoch": 5.0,
1067
- "grad_norm": 0.431640625,
1068
- "learning_rate": 0.0001686241637868734,
1069
- "loss": 1.0016,
1070
  "step": 730
1071
  },
1072
  {
1073
  "epoch": 5.0,
1074
- "eval_loss": 2.5294971466064453,
1075
- "eval_runtime": 0.5501,
1076
- "eval_samples_per_second": 18.178,
1077
- "eval_steps_per_second": 1.818,
1078
  "step": 730
1079
  },
1080
  {
1081
  "epoch": 5.034246575342466,
1082
- "grad_norm": 0.380859375,
1083
- "learning_rate": 0.0001680423073214737,
1084
- "loss": 0.9822,
1085
  "step": 735
1086
  },
1087
  {
1088
  "epoch": 5.068493150684931,
1089
- "grad_norm": 0.369140625,
1090
- "learning_rate": 0.00016745612926928694,
1091
- "loss": 0.9842,
1092
  "step": 740
1093
  },
1094
  {
1095
  "epoch": 5.102739726027397,
1096
- "grad_norm": 0.38671875,
1097
- "learning_rate": 0.0001668656668603751,
1098
- "loss": 0.9717,
1099
  "step": 745
1100
  },
1101
  {
1102
  "epoch": 5.136986301369863,
1103
- "grad_norm": 0.375,
1104
- "learning_rate": 0.00016627095759691362,
1105
- "loss": 0.9685,
1106
  "step": 750
1107
  },
1108
  {
1109
  "epoch": 5.171232876712328,
1110
- "grad_norm": 0.353515625,
1111
- "learning_rate": 0.0001656720392508094,
1112
- "loss": 0.9744,
1113
  "step": 755
1114
  },
1115
  {
1116
  "epoch": 5.205479452054795,
1117
- "grad_norm": 0.376953125,
1118
- "learning_rate": 0.00016506894986130171,
1119
- "loss": 0.9736,
1120
  "step": 760
1121
  },
1122
  {
1123
  "epoch": 5.239726027397261,
1124
- "grad_norm": 0.486328125,
1125
- "learning_rate": 0.00016446172773254629,
1126
- "loss": 0.972,
1127
  "step": 765
1128
  },
1129
  {
1130
  "epoch": 5.273972602739726,
1131
- "grad_norm": 0.470703125,
1132
- "learning_rate": 0.00016385041143118255,
1133
- "loss": 0.9813,
1134
  "step": 770
1135
  },
1136
  {
1137
  "epoch": 5.308219178082192,
1138
- "grad_norm": 0.5546875,
1139
- "learning_rate": 0.000163235039783884,
1140
- "loss": 0.9855,
1141
  "step": 775
1142
  },
1143
  {
1144
  "epoch": 5.342465753424658,
1145
- "grad_norm": 0.462890625,
1146
- "learning_rate": 0.0001626156518748922,
1147
- "loss": 0.9765,
1148
  "step": 780
1149
  },
1150
  {
1151
  "epoch": 5.376712328767123,
1152
- "grad_norm": 0.59375,
1153
- "learning_rate": 0.00016199228704353455,
1154
- "loss": 0.9876,
1155
  "step": 785
1156
  },
1157
  {
1158
  "epoch": 5.410958904109589,
1159
- "grad_norm": 0.53125,
1160
- "learning_rate": 0.00016136498488172568,
1161
- "loss": 0.9772,
1162
  "step": 790
1163
  },
1164
  {
1165
  "epoch": 5.445205479452055,
1166
- "grad_norm": 0.3984375,
1167
- "learning_rate": 0.0001607337852314527,
1168
- "loss": 0.9861,
1169
  "step": 795
1170
  },
1171
  {
1172
  "epoch": 5.47945205479452,
1173
- "grad_norm": 0.3671875,
1174
- "learning_rate": 0.00016009872818224485,
1175
- "loss": 0.9879,
1176
  "step": 800
1177
  },
1178
  {
1179
  "epoch": 5.513698630136986,
1180
- "grad_norm": 0.357421875,
1181
- "learning_rate": 0.00015945985406862721,
1182
- "loss": 0.9821,
1183
  "step": 805
1184
  },
1185
  {
1186
  "epoch": 5.5479452054794525,
1187
- "grad_norm": 0.4375,
1188
- "learning_rate": 0.00015881720346755905,
1189
- "loss": 0.9748,
1190
  "step": 810
1191
  },
1192
  {
1193
  "epoch": 5.582191780821918,
1194
- "grad_norm": 0.376953125,
1195
- "learning_rate": 0.00015817081719585643,
1196
- "loss": 0.9726,
1197
  "step": 815
1198
  },
1199
  {
1200
  "epoch": 5.616438356164384,
1201
- "grad_norm": 0.37890625,
1202
- "learning_rate": 0.00015752073630759998,
1203
- "loss": 0.9918,
1204
  "step": 820
1205
  },
1206
  {
1207
  "epoch": 5.6506849315068495,
1208
- "grad_norm": 0.419921875,
1209
- "learning_rate": 0.00015686700209152738,
1210
- "loss": 0.9775,
1211
  "step": 825
1212
  },
1213
  {
1214
  "epoch": 5.684931506849315,
1215
- "grad_norm": 0.33203125,
1216
- "learning_rate": 0.00015620965606841098,
1217
- "loss": 0.9734,
1218
  "step": 830
1219
  },
1220
  {
1221
  "epoch": 5.719178082191781,
1222
- "grad_norm": 0.37890625,
1223
- "learning_rate": 0.0001555487399884206,
1224
- "loss": 0.9753,
1225
  "step": 835
1226
  },
1227
  {
1228
  "epoch": 5.7534246575342465,
1229
- "grad_norm": 0.39453125,
1230
- "learning_rate": 0.00015488429582847192,
1231
- "loss": 0.9701,
1232
  "step": 840
1233
  },
1234
  {
1235
  "epoch": 5.787671232876712,
1236
- "grad_norm": 0.357421875,
1237
- "learning_rate": 0.0001542163657895605,
1238
- "loss": 0.9726,
1239
  "step": 845
1240
  },
1241
  {
1242
  "epoch": 5.821917808219178,
1243
- "grad_norm": 0.4375,
1244
- "learning_rate": 0.00015354499229408114,
1245
- "loss": 0.9755,
1246
  "step": 850
1247
  },
1248
  {
1249
  "epoch": 5.8561643835616435,
1250
- "grad_norm": 0.50390625,
1251
- "learning_rate": 0.0001528702179831338,
1252
- "loss": 0.9733,
1253
  "step": 855
1254
  },
1255
  {
1256
  "epoch": 5.890410958904109,
1257
- "grad_norm": 0.419921875,
1258
- "learning_rate": 0.00015219208571381525,
1259
- "loss": 0.9795,
1260
  "step": 860
1261
  },
1262
  {
1263
  "epoch": 5.924657534246576,
1264
- "grad_norm": 0.466796875,
1265
- "learning_rate": 0.00015151063855649698,
1266
- "loss": 0.9906,
1267
  "step": 865
1268
  },
1269
  {
1270
  "epoch": 5.958904109589041,
1271
- "grad_norm": 0.35546875,
1272
- "learning_rate": 0.00015082591979208976,
1273
- "loss": 0.983,
1274
  "step": 870
1275
  },
1276
  {
1277
  "epoch": 5.993150684931507,
1278
- "grad_norm": 0.51953125,
1279
- "learning_rate": 0.00015013797290929466,
1280
- "loss": 0.9823,
1281
  "step": 875
1282
  },
1283
  {
1284
  "epoch": 6.0,
1285
- "eval_loss": 2.5500409603118896,
1286
- "eval_runtime": 0.5455,
1287
- "eval_samples_per_second": 18.332,
1288
- "eval_steps_per_second": 1.833,
1289
  "step": 876
1290
  },
1291
  {
1292
  "epoch": 6.027397260273973,
1293
- "grad_norm": 0.380859375,
1294
- "learning_rate": 0.00014944684160184108,
1295
- "loss": 0.9588,
1296
  "step": 880
1297
  },
1298
  {
1299
  "epoch": 6.061643835616438,
1300
- "grad_norm": 0.435546875,
1301
- "learning_rate": 0.00014875256976571135,
1302
- "loss": 0.9449,
1303
  "step": 885
1304
  },
1305
  {
1306
  "epoch": 6.095890410958904,
1307
- "grad_norm": 0.41796875,
1308
- "learning_rate": 0.00014805520149635307,
1309
- "loss": 0.9336,
1310
  "step": 890
1311
  },
1312
  {
1313
  "epoch": 6.13013698630137,
1314
- "grad_norm": 0.388671875,
1315
- "learning_rate": 0.00014735478108587828,
1316
- "loss": 0.9428,
1317
  "step": 895
1318
  },
1319
  {
1320
  "epoch": 6.164383561643835,
1321
- "grad_norm": 0.578125,
1322
- "learning_rate": 0.00014665135302025035,
1323
- "loss": 0.9457,
1324
  "step": 900
1325
  },
1326
  {
1327
  "epoch": 6.198630136986301,
1328
- "grad_norm": 0.375,
1329
- "learning_rate": 0.00014594496197645852,
1330
- "loss": 0.9425,
1331
  "step": 905
1332
  },
1333
  {
1334
  "epoch": 6.232876712328767,
1335
- "grad_norm": 0.361328125,
1336
- "learning_rate": 0.0001452356528196804,
1337
- "loss": 0.9492,
1338
  "step": 910
1339
  },
1340
  {
1341
  "epoch": 6.267123287671233,
1342
- "grad_norm": 0.34375,
1343
- "learning_rate": 0.00014452347060043237,
1344
- "loss": 0.9542,
1345
  "step": 915
1346
  },
1347
  {
1348
  "epoch": 6.301369863013699,
1349
- "grad_norm": 0.375,
1350
- "learning_rate": 0.00014380846055170828,
1351
- "loss": 0.9488,
1352
  "step": 920
1353
  },
1354
  {
1355
  "epoch": 6.335616438356165,
1356
- "grad_norm": 0.56640625,
1357
- "learning_rate": 0.00014309066808610655,
1358
- "loss": 0.9532,
1359
  "step": 925
1360
  },
1361
  {
1362
  "epoch": 6.36986301369863,
1363
- "grad_norm": 0.451171875,
1364
- "learning_rate": 0.0001423701387929459,
1365
- "loss": 0.954,
1366
  "step": 930
1367
  },
1368
  {
1369
  "epoch": 6.404109589041096,
1370
- "grad_norm": 0.361328125,
1371
- "learning_rate": 0.00014164691843536982,
1372
- "loss": 0.9513,
1373
  "step": 935
1374
  },
1375
  {
1376
  "epoch": 6.438356164383562,
1377
- "grad_norm": 0.4375,
1378
- "learning_rate": 0.00014092105294744,
1379
- "loss": 0.954,
1380
  "step": 940
1381
  },
1382
  {
1383
  "epoch": 6.472602739726027,
1384
- "grad_norm": 0.404296875,
1385
- "learning_rate": 0.00014019258843121893,
1386
- "loss": 0.9549,
1387
  "step": 945
1388
  },
1389
  {
1390
  "epoch": 6.506849315068493,
1391
- "grad_norm": 0.38671875,
1392
- "learning_rate": 0.0001394615711538417,
1393
- "loss": 0.9509,
1394
  "step": 950
1395
  },
1396
  {
1397
  "epoch": 6.541095890410959,
1398
- "grad_norm": 0.376953125,
1399
- "learning_rate": 0.00013872804754457759,
1400
- "loss": 0.9556,
1401
  "step": 955
1402
  },
1403
  {
1404
  "epoch": 6.575342465753424,
1405
- "grad_norm": 0.400390625,
1406
- "learning_rate": 0.00013799206419188103,
1407
- "loss": 0.9596,
1408
  "step": 960
1409
  },
1410
  {
1411
  "epoch": 6.609589041095891,
1412
- "grad_norm": 0.37890625,
1413
- "learning_rate": 0.00013725366784043288,
1414
- "loss": 0.9532,
1415
  "step": 965
1416
  },
1417
  {
1418
  "epoch": 6.6438356164383565,
1419
- "grad_norm": 0.361328125,
1420
- "learning_rate": 0.00013651290538817113,
1421
- "loss": 0.9547,
1422
  "step": 970
1423
  },
1424
  {
1425
  "epoch": 6.678082191780822,
1426
- "grad_norm": 0.392578125,
1427
- "learning_rate": 0.0001357698238833126,
1428
- "loss": 0.9619,
1429
  "step": 975
1430
  },
1431
  {
1432
  "epoch": 6.712328767123288,
1433
- "grad_norm": 0.38671875,
1434
- "learning_rate": 0.00013502447052136455,
1435
- "loss": 0.9457,
1436
  "step": 980
1437
  },
1438
  {
1439
  "epoch": 6.7465753424657535,
1440
- "grad_norm": 0.384765625,
1441
- "learning_rate": 0.00013427689264212738,
1442
- "loss": 0.9595,
1443
  "step": 985
1444
  },
1445
  {
1446
  "epoch": 6.780821917808219,
1447
- "grad_norm": 0.3984375,
1448
- "learning_rate": 0.00013352713772668765,
1449
- "loss": 0.9501,
1450
  "step": 990
1451
  },
1452
  {
1453
  "epoch": 6.815068493150685,
1454
- "grad_norm": 0.404296875,
1455
- "learning_rate": 0.0001327752533944025,
1456
- "loss": 0.9542,
1457
  "step": 995
1458
  },
1459
  {
1460
  "epoch": 6.8493150684931505,
1461
- "grad_norm": 0.5546875,
1462
- "learning_rate": 0.00013202128739987532,
1463
- "loss": 0.957,
1464
  "step": 1000
1465
  },
1466
  {
1467
  "epoch": 6.883561643835616,
1468
- "grad_norm": 0.388671875,
1469
- "learning_rate": 0.00013126528762992247,
1470
- "loss": 0.9597,
1471
  "step": 1005
1472
  },
1473
  {
1474
  "epoch": 6.917808219178082,
1475
- "grad_norm": 0.4140625,
1476
- "learning_rate": 0.0001305073021005321,
1477
- "loss": 0.9525,
1478
  "step": 1010
1479
  },
1480
  {
1481
  "epoch": 6.9520547945205475,
1482
- "grad_norm": 0.400390625,
1483
- "learning_rate": 0.0001297473789538142,
1484
- "loss": 0.9554,
1485
  "step": 1015
1486
  },
1487
  {
1488
  "epoch": 6.986301369863014,
1489
- "grad_norm": 0.37890625,
1490
- "learning_rate": 0.00012898556645494325,
1491
- "loss": 0.955,
1492
  "step": 1020
1493
  },
1494
  {
1495
  "epoch": 7.0,
1496
- "eval_loss": 2.5866098403930664,
1497
- "eval_runtime": 0.5603,
1498
- "eval_samples_per_second": 17.847,
1499
- "eval_steps_per_second": 1.785,
1500
  "step": 1022
1501
  },
1502
  {
1503
  "epoch": 7.02054794520548,
1504
- "grad_norm": 0.380859375,
1505
- "learning_rate": 0.0001282219129890925,
1506
- "loss": 0.9357,
1507
  "step": 1025
1508
  },
1509
  {
1510
  "epoch": 7.054794520547945,
1511
- "grad_norm": 0.373046875,
1512
- "learning_rate": 0.00012745646705836097,
1513
- "loss": 0.9228,
1514
  "step": 1030
1515
  },
1516
  {
1517
  "epoch": 7.089041095890411,
1518
- "grad_norm": 0.5390625,
1519
- "learning_rate": 0.0001266892772786929,
1520
- "loss": 0.9121,
1521
  "step": 1035
1522
  },
1523
  {
1524
  "epoch": 7.123287671232877,
1525
- "grad_norm": 0.37109375,
1526
- "learning_rate": 0.0001259203923767901,
1527
- "loss": 0.9181,
1528
  "step": 1040
1529
  },
1530
  {
1531
  "epoch": 7.157534246575342,
1532
- "grad_norm": 0.37109375,
1533
- "learning_rate": 0.00012514986118701695,
1534
- "loss": 0.9176,
1535
  "step": 1045
1536
  },
1537
  {
1538
  "epoch": 7.191780821917808,
1539
- "grad_norm": 0.3984375,
1540
- "learning_rate": 0.00012437773264829897,
1541
- "loss": 0.9241,
1542
  "step": 1050
1543
  },
1544
  {
1545
  "epoch": 7.226027397260274,
1546
- "grad_norm": 0.376953125,
1547
- "learning_rate": 0.00012360405580101448,
1548
- "loss": 0.9287,
1549
  "step": 1055
1550
  },
1551
  {
1552
  "epoch": 7.260273972602739,
1553
- "grad_norm": 0.375,
1554
- "learning_rate": 0.00012282887978387976,
1555
- "loss": 0.9347,
1556
  "step": 1060
1557
  },
1558
  {
1559
  "epoch": 7.294520547945205,
1560
- "grad_norm": 0.3984375,
1561
- "learning_rate": 0.00012205225383082843,
1562
- "loss": 0.9275,
1563
  "step": 1065
1564
  },
1565
  {
1566
  "epoch": 7.328767123287671,
1567
- "grad_norm": 0.404296875,
1568
- "learning_rate": 0.000121274227267884,
1569
- "loss": 0.923,
1570
  "step": 1070
1571
  },
1572
  {
1573
  "epoch": 7.363013698630137,
1574
- "grad_norm": 0.388671875,
1575
- "learning_rate": 0.00012049484951002739,
1576
- "loss": 0.9332,
1577
  "step": 1075
1578
  },
1579
  {
1580
  "epoch": 7.397260273972603,
1581
- "grad_norm": 0.37890625,
1582
- "learning_rate": 0.00011971417005805818,
1583
- "loss": 0.9238,
1584
  "step": 1080
1585
  },
1586
  {
1587
  "epoch": 7.431506849315069,
1588
- "grad_norm": 0.37109375,
1589
- "learning_rate": 0.00011893223849545084,
1590
- "loss": 0.9278,
1591
  "step": 1085
1592
  },
1593
  {
1594
  "epoch": 7.465753424657534,
1595
- "grad_norm": 0.388671875,
1596
- "learning_rate": 0.00011814910448520536,
1597
- "loss": 0.9268,
1598
  "step": 1090
1599
  },
1600
  {
1601
  "epoch": 7.5,
1602
- "grad_norm": 0.404296875,
1603
- "learning_rate": 0.00011736481776669306,
1604
- "loss": 0.931,
1605
  "step": 1095
1606
  },
1607
  {
1608
  "epoch": 7.534246575342466,
1609
- "grad_norm": 0.390625,
1610
- "learning_rate": 0.00011657942815249754,
1611
- "loss": 0.9283,
1612
  "step": 1100
1613
  },
1614
  {
1615
  "epoch": 7.568493150684931,
1616
- "grad_norm": 0.369140625,
1617
- "learning_rate": 0.00011579298552525084,
1618
- "loss": 0.9246,
1619
  "step": 1105
1620
  },
1621
  {
1622
  "epoch": 7.602739726027397,
1623
- "grad_norm": 0.390625,
1624
- "learning_rate": 0.00011500553983446527,
1625
- "loss": 0.9293,
1626
  "step": 1110
1627
  },
1628
  {
1629
  "epoch": 7.636986301369863,
1630
- "grad_norm": 0.365234375,
1631
- "learning_rate": 0.00011421714109336097,
1632
- "loss": 0.9335,
1633
  "step": 1115
1634
  },
1635
  {
1636
  "epoch": 7.671232876712329,
1637
- "grad_norm": 0.453125,
1638
- "learning_rate": 0.00011342783937568926,
1639
- "loss": 0.9359,
1640
  "step": 1120
1641
  },
1642
  {
1643
  "epoch": 7.705479452054795,
1644
- "grad_norm": 0.416015625,
1645
- "learning_rate": 0.00011263768481255264,
1646
- "loss": 0.9295,
1647
  "step": 1125
1648
  },
1649
  {
1650
  "epoch": 7.739726027397261,
1651
- "grad_norm": 0.380859375,
1652
- "learning_rate": 0.00011184672758922034,
1653
- "loss": 0.9404,
1654
  "step": 1130
1655
  },
1656
  {
1657
  "epoch": 7.773972602739726,
1658
- "grad_norm": 0.396484375,
1659
- "learning_rate": 0.00011105501794194131,
1660
- "loss": 0.9289,
1661
  "step": 1135
1662
  },
1663
  {
1664
  "epoch": 7.808219178082192,
1665
- "grad_norm": 0.39453125,
1666
- "learning_rate": 0.00011026260615475333,
1667
- "loss": 0.9409,
1668
  "step": 1140
1669
  },
1670
  {
1671
  "epoch": 7.842465753424658,
1672
- "grad_norm": 0.396484375,
1673
- "learning_rate": 0.00010946954255628928,
1674
- "loss": 0.9355,
1675
  "step": 1145
1676
  },
1677
  {
1678
  "epoch": 7.876712328767123,
1679
- "grad_norm": 0.443359375,
1680
- "learning_rate": 0.00010867587751658079,
1681
- "loss": 0.9257,
1682
  "step": 1150
1683
  },
1684
  {
1685
  "epoch": 7.910958904109589,
1686
- "grad_norm": 0.365234375,
1687
- "learning_rate": 0.00010788166144385888,
1688
- "loss": 0.924,
1689
  "step": 1155
1690
  },
1691
  {
1692
  "epoch": 7.945205479452055,
1693
- "grad_norm": 0.427734375,
1694
- "learning_rate": 0.0001070869447813525,
1695
- "loss": 0.9202,
1696
  "step": 1160
1697
  },
1698
  {
1699
  "epoch": 7.97945205479452,
1700
- "grad_norm": 0.3515625,
1701
- "learning_rate": 0.0001062917780040847,
1702
- "loss": 0.9214,
1703
  "step": 1165
1704
  },
1705
  {
1706
  "epoch": 8.0,
1707
- "eval_loss": 2.6224260330200195,
1708
- "eval_runtime": 0.5566,
1709
- "eval_samples_per_second": 17.965,
1710
- "eval_steps_per_second": 1.797,
1711
  "step": 1168
1712
  },
1713
  {
1714
  "epoch": 8.013698630136986,
1715
- "grad_norm": 0.388671875,
1716
- "learning_rate": 0.0001054962116156667,
1717
- "loss": 0.9133,
1718
  "step": 1170
1719
  },
1720
  {
1721
  "epoch": 8.047945205479452,
1722
- "grad_norm": 0.41796875,
1723
- "learning_rate": 0.00010470029614509041,
1724
- "loss": 0.8952,
1725
  "step": 1175
1726
  },
1727
  {
1728
  "epoch": 8.082191780821917,
1729
- "grad_norm": 0.3984375,
1730
- "learning_rate": 0.00010390408214351892,
1731
- "loss": 0.8963,
1732
  "step": 1180
1733
  },
1734
  {
1735
  "epoch": 8.116438356164384,
1736
- "grad_norm": 0.388671875,
1737
- "learning_rate": 0.0001031076201810762,
1738
- "loss": 0.8996,
1739
  "step": 1185
1740
  },
1741
  {
1742
  "epoch": 8.150684931506849,
1743
- "grad_norm": 0.38671875,
1744
- "learning_rate": 0.00010231096084363483,
1745
- "loss": 0.8898,
1746
  "step": 1190
1747
  },
1748
  {
1749
  "epoch": 8.184931506849315,
1750
- "grad_norm": 0.390625,
1751
- "learning_rate": 0.00010151415472960342,
1752
- "loss": 0.9138,
1753
  "step": 1195
1754
  },
1755
  {
1756
  "epoch": 8.219178082191782,
1757
- "grad_norm": 0.388671875,
1758
- "learning_rate": 0.00010071725244671282,
1759
- "loss": 0.9023,
1760
  "step": 1200
1761
  },
1762
  {
1763
  "epoch": 8.253424657534246,
1764
- "grad_norm": 0.388671875,
1765
- "learning_rate": 9.992030460880181e-05,
1766
- "loss": 0.8929,
1767
  "step": 1205
1768
  },
1769
  {
1770
  "epoch": 8.287671232876713,
1771
- "grad_norm": 0.392578125,
1772
- "learning_rate": 9.91233618326026e-05,
1773
- "loss": 0.9089,
1774
  "step": 1210
1775
  },
1776
  {
1777
  "epoch": 8.321917808219178,
1778
- "grad_norm": 0.41015625,
1779
- "learning_rate": 9.83264747345259e-05,
1780
- "loss": 0.9037,
1781
  "step": 1215
1782
  },
1783
  {
1784
  "epoch": 8.356164383561644,
1785
- "grad_norm": 0.369140625,
1786
- "learning_rate": 9.752969392744606e-05,
1787
- "loss": 0.9062,
1788
  "step": 1220
1789
  },
1790
  {
1791
  "epoch": 8.39041095890411,
1792
- "grad_norm": 0.40234375,
1793
- "learning_rate": 9.673307001748661e-05,
1794
- "loss": 0.8982,
1795
  "step": 1225
1796
  },
1797
  {
1798
  "epoch": 8.424657534246576,
1799
- "grad_norm": 0.392578125,
1800
- "learning_rate": 9.593665360080599e-05,
1801
- "loss": 0.8994,
1802
  "step": 1230
1803
  },
1804
  {
1805
  "epoch": 8.45890410958904,
1806
- "grad_norm": 0.4140625,
1807
- "learning_rate": 9.514049526038418e-05,
1808
- "loss": 0.9045,
1809
  "step": 1235
1810
  },
1811
  {
1812
  "epoch": 8.493150684931507,
1813
- "grad_norm": 0.400390625,
1814
- "learning_rate": 9.43446455628097e-05,
1815
- "loss": 0.9062,
1816
  "step": 1240
1817
  },
1818
  {
1819
  "epoch": 8.527397260273972,
1820
- "grad_norm": 0.427734375,
1821
- "learning_rate": 9.354915505506839e-05,
1822
- "loss": 0.9056,
1823
  "step": 1245
1824
  },
1825
  {
1826
  "epoch": 8.561643835616438,
1827
- "grad_norm": 0.3828125,
1828
- "learning_rate": 9.27540742613326e-05,
1829
- "loss": 0.9078,
1830
  "step": 1250
1831
  },
1832
  {
1833
  "epoch": 8.595890410958905,
1834
- "grad_norm": 0.431640625,
1835
- "learning_rate": 9.195945367975256e-05,
1836
- "loss": 0.8994,
1837
  "step": 1255
1838
  },
1839
  {
1840
  "epoch": 8.63013698630137,
1841
- "grad_norm": 0.404296875,
1842
- "learning_rate": 9.116534377924883e-05,
1843
- "loss": 0.9088,
1844
  "step": 1260
1845
  },
1846
  {
1847
  "epoch": 8.664383561643836,
1848
- "grad_norm": 0.44921875,
1849
- "learning_rate": 9.037179499630703e-05,
1850
- "loss": 0.9035,
1851
  "step": 1265
1852
  },
1853
  {
1854
  "epoch": 8.698630136986301,
1855
- "grad_norm": 0.40625,
1856
- "learning_rate": 8.957885773177438e-05,
1857
- "loss": 0.9178,
1858
  "step": 1270
1859
  },
1860
  {
1861
  "epoch": 8.732876712328768,
1862
- "grad_norm": 0.51953125,
1863
- "learning_rate": 8.878658234765858e-05,
1864
- "loss": 0.9062,
1865
  "step": 1275
1866
  },
1867
  {
1868
  "epoch": 8.767123287671232,
1869
- "grad_norm": 0.486328125,
1870
- "learning_rate": 8.799501916392912e-05,
1871
- "loss": 0.9157,
1872
  "step": 1280
1873
  },
1874
  {
1875
  "epoch": 8.801369863013699,
1876
- "grad_norm": 0.392578125,
1877
- "learning_rate": 8.720421845532151e-05,
1878
- "loss": 0.912,
1879
  "step": 1285
1880
  },
1881
  {
1882
  "epoch": 8.835616438356164,
1883
- "grad_norm": 0.37109375,
1884
- "learning_rate": 8.641423044814374e-05,
1885
- "loss": 0.9085,
1886
  "step": 1290
1887
  },
1888
  {
1889
  "epoch": 8.86986301369863,
1890
- "grad_norm": 0.396484375,
1891
- "learning_rate": 8.562510531708677e-05,
1892
- "loss": 0.9158,
1893
  "step": 1295
1894
  },
1895
  {
1896
  "epoch": 8.904109589041095,
1897
- "grad_norm": 0.384765625,
1898
- "learning_rate": 8.48368931820373e-05,
1899
- "loss": 0.909,
1900
  "step": 1300
1901
  },
1902
  {
1903
  "epoch": 8.938356164383562,
1904
- "grad_norm": 0.39453125,
1905
- "learning_rate": 8.404964410489485e-05,
1906
- "loss": 0.9121,
1907
  "step": 1305
1908
  },
1909
  {
1910
  "epoch": 8.972602739726028,
1911
- "grad_norm": 0.39453125,
1912
- "learning_rate": 8.32634080863919e-05,
1913
- "loss": 0.913,
1914
  "step": 1310
1915
  },
1916
  {
1917
  "epoch": 9.0,
1918
- "eval_loss": 2.6512458324432373,
1919
- "eval_runtime": 0.5534,
1920
- "eval_samples_per_second": 18.07,
1921
- "eval_steps_per_second": 1.807,
1922
  "step": 1314
1923
  },
1924
  {
1925
  "epoch": 9.006849315068493,
1926
- "grad_norm": 0.408203125,
1927
- "learning_rate": 8.247823506291844e-05,
1928
- "loss": 0.9034,
1929
  "step": 1315
1930
  },
1931
  {
1932
  "epoch": 9.04109589041096,
1933
- "grad_norm": 0.404296875,
1934
- "learning_rate": 8.169417490335007e-05,
1935
- "loss": 0.8821,
1936
  "step": 1320
1937
  },
1938
  {
1939
  "epoch": 9.075342465753424,
1940
- "grad_norm": 0.416015625,
1941
- "learning_rate": 8.091127740588094e-05,
1942
- "loss": 0.8702,
1943
  "step": 1325
1944
  },
1945
  {
1946
  "epoch": 9.10958904109589,
1947
- "grad_norm": 0.39453125,
1948
- "learning_rate": 8.012959229486061e-05,
1949
- "loss": 0.8755,
1950
  "step": 1330
1951
  },
1952
  {
1953
  "epoch": 9.143835616438356,
1954
- "grad_norm": 0.43359375,
1955
- "learning_rate": 7.934916921763628e-05,
1956
- "loss": 0.8783,
1957
  "step": 1335
1958
  },
1959
  {
1960
  "epoch": 9.178082191780822,
1961
- "grad_norm": 0.421875,
1962
- "learning_rate": 7.857005774139907e-05,
1963
- "loss": 0.8794,
1964
  "step": 1340
1965
  },
1966
  {
1967
  "epoch": 9.212328767123287,
1968
- "grad_norm": 0.400390625,
1969
- "learning_rate": 7.779230735003628e-05,
1970
- "loss": 0.8844,
1971
  "step": 1345
1972
  },
1973
  {
1974
  "epoch": 9.246575342465754,
1975
- "grad_norm": 0.3984375,
1976
- "learning_rate": 7.701596744098818e-05,
1977
- "loss": 0.8775,
1978
  "step": 1350
1979
  },
1980
  {
1981
  "epoch": 9.280821917808218,
1982
- "grad_norm": 0.404296875,
1983
- "learning_rate": 7.624108732211081e-05,
1984
- "loss": 0.8705,
1985
  "step": 1355
1986
  },
1987
  {
1988
  "epoch": 9.315068493150685,
1989
- "grad_norm": 0.408203125,
1990
- "learning_rate": 7.54677162085442e-05,
1991
- "loss": 0.8897,
1992
  "step": 1360
1993
  },
1994
  {
1995
  "epoch": 9.349315068493151,
1996
- "grad_norm": 0.3984375,
1997
- "learning_rate": 7.469590321958662e-05,
1998
- "loss": 0.882,
1999
  "step": 1365
2000
  },
2001
  {
2002
  "epoch": 9.383561643835616,
2003
- "grad_norm": 0.43359375,
2004
- "learning_rate": 7.392569737557474e-05,
2005
- "loss": 0.8879,
2006
  "step": 1370
2007
  },
2008
  {
2009
  "epoch": 9.417808219178083,
2010
- "grad_norm": 0.416015625,
2011
- "learning_rate": 7.31571475947703e-05,
2012
- "loss": 0.8827,
2013
  "step": 1375
2014
  },
2015
  {
2016
  "epoch": 9.452054794520548,
2017
- "grad_norm": 0.412109375,
2018
- "learning_rate": 7.239030269025311e-05,
2019
- "loss": 0.8805,
2020
  "step": 1380
2021
  },
2022
  {
2023
  "epoch": 9.486301369863014,
2024
- "grad_norm": 0.408203125,
2025
- "learning_rate": 7.162521136682085e-05,
2026
- "loss": 0.8966,
2027
  "step": 1385
2028
  },
2029
  {
2030
  "epoch": 9.520547945205479,
2031
- "grad_norm": 0.431640625,
2032
- "learning_rate": 7.08619222178954e-05,
2033
- "loss": 0.8895,
2034
  "step": 1390
2035
  },
2036
  {
2037
  "epoch": 9.554794520547945,
2038
- "grad_norm": 0.423828125,
2039
- "learning_rate": 7.010048372243698e-05,
2040
- "loss": 0.8907,
2041
  "step": 1395
2042
  },
2043
  {
2044
  "epoch": 9.58904109589041,
2045
- "grad_norm": 0.42578125,
2046
- "learning_rate": 6.934094424186459e-05,
2047
- "loss": 0.8876,
2048
  "step": 1400
2049
  },
2050
  {
2051
  "epoch": 9.623287671232877,
2052
- "grad_norm": 0.39453125,
2053
- "learning_rate": 6.858335201698485e-05,
2054
- "loss": 0.8936,
2055
  "step": 1405
2056
  },
2057
  {
2058
  "epoch": 9.657534246575342,
2059
- "grad_norm": 0.451171875,
2060
- "learning_rate": 6.782775516492771e-05,
2061
- "loss": 0.8804,
2062
  "step": 1410
2063
  },
2064
  {
2065
  "epoch": 9.691780821917808,
2066
- "grad_norm": 0.40234375,
2067
- "learning_rate": 6.70742016760907e-05,
2068
- "loss": 0.8907,
2069
  "step": 1415
2070
  },
2071
  {
2072
  "epoch": 9.726027397260275,
2073
- "grad_norm": 0.4453125,
2074
- "learning_rate": 6.632273941109064e-05,
2075
- "loss": 0.8756,
2076
  "step": 1420
2077
  },
2078
  {
2079
  "epoch": 9.76027397260274,
2080
- "grad_norm": 0.40625,
2081
- "learning_rate": 6.5573416097724e-05,
2082
- "loss": 0.8963,
2083
  "step": 1425
2084
  },
2085
  {
2086
  "epoch": 9.794520547945206,
2087
- "grad_norm": 0.412109375,
2088
- "learning_rate": 6.482627932793553e-05,
2089
- "loss": 0.8998,
2090
  "step": 1430
2091
  },
2092
  {
2093
  "epoch": 9.82876712328767,
2094
- "grad_norm": 0.419921875,
2095
- "learning_rate": 6.408137655479554e-05,
2096
- "loss": 0.9024,
2097
  "step": 1435
2098
  },
2099
  {
2100
  "epoch": 9.863013698630137,
2101
- "grad_norm": 0.421875,
2102
- "learning_rate": 6.333875508948593e-05,
2103
- "loss": 0.8921,
2104
  "step": 1440
2105
  },
2106
  {
2107
  "epoch": 9.897260273972602,
2108
- "grad_norm": 0.45703125,
2109
- "learning_rate": 6.259846209829551e-05,
2110
- "loss": 0.904,
2111
  "step": 1445
2112
  },
2113
  {
2114
  "epoch": 9.931506849315069,
2115
- "grad_norm": 0.4140625,
2116
- "learning_rate": 6.186054459962399e-05,
2117
- "loss": 0.8899,
2118
  "step": 1450
2119
  },
2120
  {
2121
  "epoch": 9.965753424657533,
2122
- "grad_norm": 0.40625,
2123
- "learning_rate": 6.112504946099604e-05,
2124
- "loss": 0.8875,
2125
  "step": 1455
2126
  },
2127
  {
2128
  "epoch": 10.0,
2129
- "grad_norm": 0.431640625,
2130
- "learning_rate": 6.039202339608432e-05,
2131
- "loss": 0.889,
2132
  "step": 1460
2133
  },
2134
  {
2135
  "epoch": 10.0,
2136
- "eval_loss": 2.6852145195007324,
2137
- "eval_runtime": 0.5511,
2138
- "eval_samples_per_second": 18.146,
2139
- "eval_steps_per_second": 1.815,
2140
  "step": 1460
2141
  },
2142
  {
2143
- "epoch": 10.034246575342467,
2144
- "grad_norm": 0.40625,
2145
- "learning_rate": 5.966151296174268e-05,
2146
- "loss": 0.8664,
2147
- "step": 1465
2148
- },
2149
- {
2150
- "epoch": 10.068493150684931,
2151
- "grad_norm": 0.431640625,
2152
- "learning_rate": 5.8933564555049105e-05,
2153
- "loss": 0.8677,
2154
- "step": 1470
2155
- },
2156
- {
2157
- "epoch": 10.102739726027398,
2158
- "grad_norm": 0.41796875,
2159
- "learning_rate": 5.820822441035899e-05,
2160
- "loss": 0.866,
2161
- "step": 1475
2162
- },
2163
- {
2164
- "epoch": 10.136986301369863,
2165
- "grad_norm": 0.40625,
2166
- "learning_rate": 5.7485538596368496e-05,
2167
- "loss": 0.8664,
2168
- "step": 1480
2169
- },
2170
- {
2171
- "epoch": 10.17123287671233,
2172
- "grad_norm": 0.41015625,
2173
- "learning_rate": 5.6765553013188766e-05,
2174
- "loss": 0.8645,
2175
- "step": 1485
2176
- },
2177
- {
2178
- "epoch": 10.205479452054794,
2179
- "grad_norm": 0.400390625,
2180
- "learning_rate": 5.6048313389430484e-05,
2181
- "loss": 0.8624,
2182
- "step": 1490
2183
- },
2184
- {
2185
- "epoch": 10.23972602739726,
2186
- "grad_norm": 0.408203125,
2187
- "learning_rate": 5.533386527929962e-05,
2188
- "loss": 0.874,
2189
- "step": 1495
2190
- },
2191
- {
2192
- "epoch": 10.273972602739725,
2193
- "grad_norm": 0.40625,
2194
- "learning_rate": 5.462225405970401e-05,
2195
- "loss": 0.8708,
2196
- "step": 1500
2197
- },
2198
- {
2199
- "epoch": 10.273972602739725,
2200
- "step": 1500,
2201
- "total_flos": 8.853977907740017e+17,
2202
- "train_loss": 0.0,
2203
- "train_runtime": 3.0171,
2204
- "train_samples_per_second": 23207.393,
2205
- "train_steps_per_second": 483.902
2206
  }
2207
  ],
2208
  "logging_steps": 5,
@@ -2222,7 +2166,7 @@
2222
  "attributes": {}
2223
  }
2224
  },
2225
- "total_flos": 8.853977907740017e+17,
2226
  "train_batch_size": 8,
2227
  "trial_name": null,
2228
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 10.0,
5
  "eval_steps": 500,
6
+ "global_step": 1460,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.00684931506849315,
13
+ "grad_norm": 4.9375,
14
+ "learning_rate": 1.3698630136986302e-06,
15
  "loss": 3.0017,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.03424657534246575,
20
+ "grad_norm": 4.5625,
21
+ "learning_rate": 6.849315068493151e-06,
22
+ "loss": 3.0717,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.0684931506849315,
27
+ "grad_norm": 5.40625,
28
+ "learning_rate": 1.3698630136986302e-05,
29
+ "loss": 3.002,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.10273972602739725,
34
+ "grad_norm": 2.640625,
35
+ "learning_rate": 2.0547945205479453e-05,
36
+ "loss": 2.8518,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.136986301369863,
41
+ "grad_norm": 1.734375,
42
+ "learning_rate": 2.7397260273972603e-05,
43
+ "loss": 2.6474,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.17123287671232876,
48
+ "grad_norm": 1.59375,
49
+ "learning_rate": 3.424657534246575e-05,
50
+ "loss": 2.4285,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.2054794520547945,
55
+ "grad_norm": 75.0,
56
+ "learning_rate": 4.1095890410958905e-05,
57
+ "loss": 2.2533,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.23972602739726026,
62
+ "grad_norm": 1.84375,
63
+ "learning_rate": 4.794520547945205e-05,
64
+ "loss": 2.0584,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.273972602739726,
69
+ "grad_norm": 0.80859375,
70
+ "learning_rate": 5.479452054794521e-05,
71
+ "loss": 1.9038,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.3082191780821918,
76
+ "grad_norm": 0.65625,
77
+ "learning_rate": 6.164383561643835e-05,
78
+ "loss": 1.797,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.3424657534246575,
83
+ "grad_norm": 0.5859375,
84
+ "learning_rate": 6.84931506849315e-05,
85
+ "loss": 1.6455,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.3767123287671233,
90
+ "grad_norm": 0.40625,
91
+ "learning_rate": 7.534246575342466e-05,
92
+ "loss": 1.5458,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.410958904109589,
97
+ "grad_norm": 0.345703125,
98
+ "learning_rate": 8.219178082191781e-05,
99
+ "loss": 1.4771,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.4452054794520548,
104
+ "grad_norm": 0.38671875,
105
+ "learning_rate": 8.904109589041096e-05,
106
+ "loss": 1.4206,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.4794520547945205,
111
+ "grad_norm": 0.546875,
112
+ "learning_rate": 9.58904109589041e-05,
113
+ "loss": 1.3804,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.5136986301369864,
118
+ "grad_norm": 0.353515625,
119
+ "learning_rate": 0.00010273972602739728,
120
+ "loss": 1.3471,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.547945205479452,
125
+ "grad_norm": 0.259765625,
126
+ "learning_rate": 0.00010958904109589041,
127
+ "loss": 1.3165,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.5821917808219178,
132
+ "grad_norm": 0.53515625,
133
+ "learning_rate": 0.00011643835616438356,
134
+ "loss": 1.2853,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.6164383561643836,
139
+ "grad_norm": 0.3828125,
140
+ "learning_rate": 0.0001232876712328767,
141
+ "loss": 1.2863,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.6506849315068494,
146
+ "grad_norm": 0.8828125,
147
+ "learning_rate": 0.00013013698630136988,
148
+ "loss": 1.267,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.684931506849315,
153
+ "grad_norm": 0.74609375,
154
+ "learning_rate": 0.000136986301369863,
155
+ "loss": 1.2466,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.7191780821917808,
160
+ "grad_norm": 1.0,
161
+ "learning_rate": 0.00014383561643835618,
162
+ "loss": 1.2444,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.7534246575342466,
167
+ "grad_norm": 0.3671875,
168
+ "learning_rate": 0.00015068493150684933,
169
+ "loss": 1.2283,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.7876712328767124,
174
+ "grad_norm": 0.3125,
175
+ "learning_rate": 0.00015753424657534247,
176
+ "loss": 1.2153,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.821917808219178,
181
+ "grad_norm": 0.3984375,
182
+ "learning_rate": 0.00016438356164383562,
183
+ "loss": 1.2106,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.8561643835616438,
188
+ "grad_norm": 0.330078125,
189
+ "learning_rate": 0.00017123287671232877,
190
+ "loss": 1.1939,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.8904109589041096,
195
+ "grad_norm": 0.57421875,
196
+ "learning_rate": 0.00017808219178082192,
197
+ "loss": 1.1965,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.9246575342465754,
202
+ "grad_norm": 0.515625,
203
+ "learning_rate": 0.0001849315068493151,
204
+ "loss": 1.1839,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.958904109589041,
209
+ "grad_norm": 1.4453125,
210
+ "learning_rate": 0.0001917808219178082,
211
+ "loss": 1.1983,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.9931506849315068,
216
+ "grad_norm": 0.470703125,
217
+ "learning_rate": 0.00019863013698630139,
218
+ "loss": 1.1808,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 1.0,
223
+ "eval_loss": 2.487597942352295,
224
+ "eval_runtime": 0.541,
225
+ "eval_samples_per_second": 18.483,
226
+ "eval_steps_per_second": 1.848,
227
  "step": 146
228
  },
229
  {
230
  "epoch": 1.0273972602739727,
231
+ "grad_norm": 0.396484375,
232
+ "learning_rate": 0.00019999542705801296,
233
+ "loss": 1.1618,
234
  "step": 150
235
  },
236
  {
237
  "epoch": 1.0616438356164384,
238
+ "grad_norm": 0.8125,
239
+ "learning_rate": 0.00019997685019798912,
240
+ "loss": 1.1531,
241
  "step": 155
242
  },
243
  {
244
  "epoch": 1.095890410958904,
245
+ "grad_norm": 0.390625,
246
+ "learning_rate": 0.00019994398626371643,
247
+ "loss": 1.1517,
248
  "step": 160
249
  },
250
  {
251
  "epoch": 1.13013698630137,
252
+ "grad_norm": 0.333984375,
253
+ "learning_rate": 0.00019989683995157677,
254
+ "loss": 1.1322,
255
  "step": 165
256
  },
257
  {
258
  "epoch": 1.1643835616438356,
259
+ "grad_norm": 0.318359375,
260
+ "learning_rate": 0.0001998354179989585,
261
+ "loss": 1.1413,
262
  "step": 170
263
  },
264
  {
265
  "epoch": 1.1986301369863013,
266
+ "grad_norm": 0.37109375,
267
+ "learning_rate": 0.00019975972918329356,
268
+ "loss": 1.1275,
269
  "step": 175
270
  },
271
  {
272
  "epoch": 1.2328767123287672,
273
+ "grad_norm": 0.54296875,
274
+ "learning_rate": 0.00019966978432080316,
275
+ "loss": 1.1261,
276
  "step": 180
277
  },
278
  {
279
  "epoch": 1.2671232876712328,
280
+ "grad_norm": 0.43359375,
281
+ "learning_rate": 0.00019956559626495212,
282
+ "loss": 1.1384,
283
  "step": 185
284
  },
285
  {
286
  "epoch": 1.3013698630136985,
287
+ "grad_norm": 2.4375,
288
+ "learning_rate": 0.00019944717990461207,
289
+ "loss": 1.1376,
290
  "step": 190
291
  },
292
  {
293
  "epoch": 1.3356164383561644,
294
+ "grad_norm": 0.423828125,
295
+ "learning_rate": 0.00019931455216193382,
296
+ "loss": 1.1376,
297
  "step": 195
298
  },
299
  {
300
  "epoch": 1.36986301369863,
301
+ "grad_norm": 0.55078125,
302
+ "learning_rate": 0.000199167731989929,
303
+ "loss": 1.14,
304
  "step": 200
305
  },
306
  {
307
  "epoch": 1.404109589041096,
308
+ "grad_norm": 0.44140625,
309
+ "learning_rate": 0.00019900674036976173,
310
+ "loss": 1.1281,
311
  "step": 205
312
  },
313
  {
314
  "epoch": 1.4383561643835616,
315
+ "grad_norm": 0.63671875,
316
+ "learning_rate": 0.00019883160030775016,
317
+ "loss": 1.1221,
318
  "step": 210
319
  },
320
  {
321
  "epoch": 1.4726027397260273,
322
+ "grad_norm": 0.36328125,
323
+ "learning_rate": 0.00019864233683207906,
324
+ "loss": 1.1217,
325
  "step": 215
326
  },
327
  {
328
  "epoch": 1.5068493150684932,
329
+ "grad_norm": 0.412109375,
330
+ "learning_rate": 0.00019843897698922284,
331
+ "loss": 1.1086,
332
  "step": 220
333
  },
334
  {
335
  "epoch": 1.541095890410959,
336
+ "grad_norm": 0.357421875,
337
+ "learning_rate": 0.00019822154984008088,
338
+ "loss": 1.1132,
339
  "step": 225
340
  },
341
  {
342
  "epoch": 1.5753424657534247,
343
+ "grad_norm": 0.314453125,
344
+ "learning_rate": 0.0001979900864558242,
345
+ "loss": 1.1094,
346
  "step": 230
347
  },
348
  {
349
  "epoch": 1.6095890410958904,
350
+ "grad_norm": 0.48046875,
351
+ "learning_rate": 0.00019774461991345577,
352
+ "loss": 1.1048,
353
  "step": 235
354
  },
355
  {
356
  "epoch": 1.643835616438356,
357
+ "grad_norm": 0.32421875,
358
+ "learning_rate": 0.00019748518529108316,
359
+ "loss": 1.0937,
360
  "step": 240
361
  },
362
  {
363
  "epoch": 1.678082191780822,
364
+ "grad_norm": 0.431640625,
365
+ "learning_rate": 0.00019721181966290613,
366
+ "loss": 1.1099,
367
  "step": 245
368
  },
369
  {
370
  "epoch": 1.7123287671232876,
371
+ "grad_norm": 0.35546875,
372
+ "learning_rate": 0.00019692456209391846,
373
+ "loss": 1.0998,
374
  "step": 250
375
  },
376
  {
377
  "epoch": 1.7465753424657535,
378
+ "grad_norm": 0.33984375,
379
+ "learning_rate": 0.0001966234536343253,
380
+ "loss": 1.0985,
381
  "step": 255
382
  },
383
  {
384
  "epoch": 1.7808219178082192,
385
+ "grad_norm": 0.455078125,
386
+ "learning_rate": 0.00019630853731367713,
387
+ "loss": 1.1036,
388
  "step": 260
389
  },
390
  {
391
  "epoch": 1.8150684931506849,
392
+ "grad_norm": 0.52734375,
393
+ "learning_rate": 0.00019597985813472052,
394
+ "loss": 1.0853,
395
  "step": 265
396
  },
397
  {
398
  "epoch": 1.8493150684931505,
399
+ "grad_norm": 0.30078125,
400
+ "learning_rate": 0.0001956374630669672,
401
+ "loss": 1.0958,
402
  "step": 270
403
  },
404
  {
405
  "epoch": 1.8835616438356164,
406
+ "grad_norm": 0.62890625,
407
+ "learning_rate": 0.00019528140103998177,
408
+ "loss": 1.0911,
409
  "step": 275
410
  },
411
  {
412
  "epoch": 1.9178082191780823,
413
+ "grad_norm": 0.5234375,
414
+ "learning_rate": 0.00019491172293638968,
415
+ "loss": 1.0812,
416
  "step": 280
417
  },
418
  {
419
  "epoch": 1.952054794520548,
420
+ "grad_norm": 0.44921875,
421
+ "learning_rate": 0.0001945284815846057,
422
+ "loss": 1.0777,
423
  "step": 285
424
  },
425
  {
426
  "epoch": 1.9863013698630136,
427
+ "grad_norm": 0.53515625,
428
+ "learning_rate": 0.00019413173175128473,
429
+ "loss": 1.0819,
430
  "step": 290
431
  },
432
  {
433
  "epoch": 2.0,
434
+ "eval_loss": 2.4820380210876465,
435
+ "eval_runtime": 0.5459,
436
+ "eval_samples_per_second": 18.32,
437
+ "eval_steps_per_second": 1.832,
438
  "step": 292
439
  },
440
  {
441
  "epoch": 2.0205479452054793,
442
+ "grad_norm": 0.412109375,
443
+ "learning_rate": 0.00019372153013349523,
444
+ "loss": 1.0712,
445
  "step": 295
446
  },
447
  {
448
  "epoch": 2.0547945205479454,
449
+ "grad_norm": 0.302734375,
450
+ "learning_rate": 0.00019329793535061723,
451
+ "loss": 1.0467,
452
  "step": 300
453
  },
454
  {
455
  "epoch": 2.089041095890411,
456
+ "grad_norm": 0.34765625,
457
+ "learning_rate": 0.0001928610079359652,
458
+ "loss": 1.0444,
459
  "step": 305
460
  },
461
  {
462
  "epoch": 2.1232876712328768,
463
+ "grad_norm": 0.376953125,
464
+ "learning_rate": 0.00019241081032813772,
465
+ "loss": 1.0393,
466
  "step": 310
467
  },
468
  {
469
  "epoch": 2.1575342465753424,
470
+ "grad_norm": 0.33984375,
471
+ "learning_rate": 0.00019194740686209464,
472
+ "loss": 1.0475,
473
  "step": 315
474
  },
475
  {
476
  "epoch": 2.191780821917808,
477
+ "grad_norm": 1.2578125,
478
+ "learning_rate": 0.0001914708637599636,
479
+ "loss": 1.0427,
480
  "step": 320
481
  },
482
  {
483
  "epoch": 2.2260273972602738,
484
+ "grad_norm": 1.5546875,
485
+ "learning_rate": 0.00019098124912157632,
486
+ "loss": 1.0486,
487
  "step": 325
488
  },
489
  {
490
  "epoch": 2.26027397260274,
491
+ "grad_norm": 0.458984375,
492
+ "learning_rate": 0.00019047863291473717,
493
+ "loss": 1.0412,
494
  "step": 330
495
  },
496
  {
497
  "epoch": 2.2945205479452055,
498
+ "grad_norm": 0.62109375,
499
+ "learning_rate": 0.00018996308696522433,
500
+ "loss": 1.0414,
501
  "step": 335
502
  },
503
  {
504
  "epoch": 2.328767123287671,
505
+ "grad_norm": 0.2890625,
506
+ "learning_rate": 0.0001894346849465257,
507
+ "loss": 1.033,
508
  "step": 340
509
  },
510
  {
511
  "epoch": 2.363013698630137,
512
+ "grad_norm": 0.328125,
513
+ "learning_rate": 0.00018889350236931055,
514
+ "loss": 1.0325,
515
  "step": 345
516
  },
517
  {
518
  "epoch": 2.3972602739726026,
519
+ "grad_norm": 0.333984375,
520
+ "learning_rate": 0.00018833961657063885,
521
+ "loss": 1.0499,
522
  "step": 350
523
  },
524
  {
525
  "epoch": 2.4315068493150687,
526
+ "grad_norm": 0.578125,
527
+ "learning_rate": 0.0001877731067029096,
528
+ "loss": 1.0361,
529
  "step": 355
530
  },
531
  {
532
  "epoch": 2.4657534246575343,
533
+ "grad_norm": 0.4921875,
534
+ "learning_rate": 0.00018719405372254948,
535
+ "loss": 1.0412,
536
  "step": 360
537
  },
538
  {
539
  "epoch": 2.5,
540
+ "grad_norm": 0.298828125,
541
+ "learning_rate": 0.00018660254037844388,
542
+ "loss": 1.0435,
543
  "step": 365
544
  },
545
  {
546
  "epoch": 2.5342465753424657,
547
+ "grad_norm": 0.4296875,
548
+ "learning_rate": 0.00018599865120011192,
549
+ "loss": 1.0369,
550
  "step": 370
551
  },
552
  {
553
  "epoch": 2.5684931506849313,
554
+ "grad_norm": 0.31640625,
555
+ "learning_rate": 0.00018538247248562674,
556
+ "loss": 1.0298,
557
  "step": 375
558
  },
559
  {
560
  "epoch": 2.602739726027397,
561
+ "grad_norm": 0.3125,
562
+ "learning_rate": 0.00018475409228928312,
563
+ "loss": 1.0325,
564
  "step": 380
565
  },
566
  {
567
  "epoch": 2.636986301369863,
568
+ "grad_norm": 0.41015625,
569
+ "learning_rate": 0.0001841136004090144,
570
+ "loss": 1.0411,
571
  "step": 385
572
  },
573
  {
574
  "epoch": 2.671232876712329,
575
+ "grad_norm": 0.310546875,
576
+ "learning_rate": 0.00018346108837355972,
577
+ "loss": 1.0372,
578
  "step": 390
579
  },
580
  {
581
  "epoch": 2.7054794520547945,
582
+ "grad_norm": 0.384765625,
583
+ "learning_rate": 0.00018279664942938447,
584
+ "loss": 1.0364,
585
  "step": 395
586
  },
587
  {
588
  "epoch": 2.73972602739726,
589
+ "grad_norm": 0.279296875,
590
+ "learning_rate": 0.00018212037852735486,
591
+ "loss": 1.0237,
592
  "step": 400
593
  },
594
  {
595
  "epoch": 2.7739726027397262,
596
+ "grad_norm": 0.375,
597
+ "learning_rate": 0.0001814323723091692,
598
+ "loss": 1.0341,
599
  "step": 405
600
  },
601
  {
602
  "epoch": 2.808219178082192,
603
+ "grad_norm": 0.486328125,
604
+ "learning_rate": 0.00018073272909354727,
605
+ "loss": 1.0256,
606
  "step": 410
607
  },
608
  {
609
  "epoch": 2.8424657534246576,
610
+ "grad_norm": 0.287109375,
611
+ "learning_rate": 0.00018002154886218033,
612
+ "loss": 1.0347,
613
  "step": 415
614
  },
615
  {
616
  "epoch": 2.8767123287671232,
617
+ "grad_norm": 0.427734375,
618
+ "learning_rate": 0.00017929893324544332,
619
+ "loss": 1.0357,
620
  "step": 420
621
  },
622
  {
623
  "epoch": 2.910958904109589,
624
+ "grad_norm": 0.408203125,
625
+ "learning_rate": 0.00017856498550787144,
626
+ "loss": 1.029,
627
  "step": 425
628
  },
629
  {
630
  "epoch": 2.9452054794520546,
631
+ "grad_norm": 0.375,
632
+ "learning_rate": 0.00017781981053340337,
633
+ "loss": 1.0263,
634
  "step": 430
635
  },
636
  {
637
  "epoch": 2.9794520547945207,
638
+ "grad_norm": 0.474609375,
639
+ "learning_rate": 0.00017706351481039284,
640
+ "loss": 1.035,
641
  "step": 435
642
  },
643
  {
644
  "epoch": 3.0,
645
+ "eval_loss": 2.4995357990264893,
646
+ "eval_runtime": 0.5513,
647
+ "eval_samples_per_second": 18.139,
648
+ "eval_steps_per_second": 1.814,
649
  "step": 438
650
  },
651
  {
652
  "epoch": 3.0136986301369864,
653
+ "grad_norm": 0.302734375,
654
+ "learning_rate": 0.00017629620641639103,
655
+ "loss": 1.006,
656
  "step": 440
657
  },
658
  {
659
  "epoch": 3.047945205479452,
660
+ "grad_norm": 0.349609375,
661
+ "learning_rate": 0.00017551799500270198,
662
+ "loss": 0.9868,
663
  "step": 445
664
  },
665
  {
666
  "epoch": 3.0821917808219177,
667
+ "grad_norm": 0.37109375,
668
+ "learning_rate": 0.00017472899177871297,
669
+ "loss": 0.9878,
670
  "step": 450
671
  },
672
  {
673
  "epoch": 3.1164383561643834,
674
+ "grad_norm": 0.35546875,
675
+ "learning_rate": 0.00017392930949600217,
676
+ "loss": 0.9897,
677
  "step": 455
678
  },
679
  {
680
  "epoch": 3.1506849315068495,
681
+ "grad_norm": 0.50390625,
682
+ "learning_rate": 0.00017311906243222614,
683
+ "loss": 0.9801,
684
  "step": 460
685
  },
686
  {
687
  "epoch": 3.184931506849315,
688
+ "grad_norm": 0.322265625,
689
+ "learning_rate": 0.00017229836637478902,
690
+ "loss": 0.9837,
691
  "step": 465
692
  },
693
  {
694
  "epoch": 3.219178082191781,
695
+ "grad_norm": 0.38671875,
696
+ "learning_rate": 0.00017146733860429612,
697
+ "loss": 0.9774,
698
  "step": 470
699
  },
700
  {
701
  "epoch": 3.2534246575342465,
702
+ "grad_norm": 0.59765625,
703
+ "learning_rate": 0.00017062609787779403,
704
+ "loss": 0.9918,
705
  "step": 475
706
  },
707
  {
708
  "epoch": 3.287671232876712,
709
+ "grad_norm": 0.310546875,
710
+ "learning_rate": 0.00016977476441179992,
711
+ "loss": 0.9844,
712
  "step": 480
713
  },
714
  {
715
  "epoch": 3.3219178082191783,
716
+ "grad_norm": 0.337890625,
717
+ "learning_rate": 0.0001689134598651219,
718
+ "loss": 0.9841,
719
  "step": 485
720
  },
721
  {
722
  "epoch": 3.356164383561644,
723
+ "grad_norm": 0.30078125,
724
+ "learning_rate": 0.0001680423073214737,
725
+ "loss": 0.993,
726
  "step": 490
727
  },
728
  {
729
  "epoch": 3.3904109589041096,
730
+ "grad_norm": 0.349609375,
731
+ "learning_rate": 0.00016716143127188548,
732
+ "loss": 0.9842,
733
  "step": 495
734
  },
735
  {
736
  "epoch": 3.4246575342465753,
737
+ "grad_norm": 0.37109375,
738
+ "learning_rate": 0.00016627095759691362,
739
+ "loss": 0.9925,
740
  "step": 500
741
  },
742
  {
743
  "epoch": 3.458904109589041,
744
+ "grad_norm": 0.474609375,
745
+ "learning_rate": 0.0001653710135486518,
746
+ "loss": 0.9822,
747
  "step": 505
748
  },
749
  {
750
  "epoch": 3.493150684931507,
751
+ "grad_norm": 0.3046875,
752
+ "learning_rate": 0.00016446172773254629,
753
+ "loss": 0.985,
754
  "step": 510
755
  },
756
  {
757
  "epoch": 3.5273972602739727,
758
+ "grad_norm": 0.37109375,
759
+ "learning_rate": 0.00016354323008901776,
760
+ "loss": 0.9937,
761
  "step": 515
762
  },
763
  {
764
  "epoch": 3.5616438356164384,
765
+ "grad_norm": 0.314453125,
766
+ "learning_rate": 0.0001626156518748922,
767
+ "loss": 0.9889,
768
  "step": 520
769
  },
770
  {
771
  "epoch": 3.595890410958904,
772
+ "grad_norm": 0.3046875,
773
+ "learning_rate": 0.00016167912564464383,
774
+ "loss": 0.9816,
775
  "step": 525
776
  },
777
  {
778
  "epoch": 3.6301369863013697,
779
+ "grad_norm": 0.314453125,
780
+ "learning_rate": 0.0001607337852314527,
781
+ "loss": 0.9873,
782
  "step": 530
783
  },
784
  {
785
  "epoch": 3.6643835616438354,
786
+ "grad_norm": 0.3046875,
787
+ "learning_rate": 0.0001597797657280792,
788
+ "loss": 0.9935,
789
  "step": 535
790
  },
791
  {
792
  "epoch": 3.6986301369863015,
793
+ "grad_norm": 0.287109375,
794
+ "learning_rate": 0.00015881720346755905,
795
+ "loss": 0.9863,
796
  "step": 540
797
  },
798
  {
799
  "epoch": 3.732876712328767,
800
+ "grad_norm": 0.34375,
801
+ "learning_rate": 0.00015784623600372042,
802
+ "loss": 0.9782,
803
  "step": 545
804
  },
805
  {
806
  "epoch": 3.767123287671233,
807
+ "grad_norm": 0.375,
808
+ "learning_rate": 0.00015686700209152738,
809
+ "loss": 0.9779,
810
  "step": 550
811
  },
812
  {
813
  "epoch": 3.8013698630136985,
814
+ "grad_norm": 0.3515625,
815
+ "learning_rate": 0.00015587964166725095,
816
+ "loss": 0.9883,
817
  "step": 555
818
  },
819
  {
820
  "epoch": 3.8356164383561646,
821
+ "grad_norm": 0.33203125,
822
+ "learning_rate": 0.00015488429582847192,
823
+ "loss": 0.968,
824
  "step": 560
825
  },
826
  {
827
  "epoch": 3.8698630136986303,
828
+ "grad_norm": 0.478515625,
829
+ "learning_rate": 0.00015388110681391725,
830
+ "loss": 0.9858,
831
  "step": 565
832
  },
833
  {
834
  "epoch": 3.904109589041096,
835
+ "grad_norm": 0.294921875,
836
+ "learning_rate": 0.0001528702179831338,
837
+ "loss": 0.9668,
838
  "step": 570
839
  },
840
  {
841
  "epoch": 3.9383561643835616,
842
+ "grad_norm": 0.3203125,
843
+ "learning_rate": 0.00015185177379600152,
844
+ "loss": 0.9853,
845
  "step": 575
846
  },
847
  {
848
  "epoch": 3.9726027397260273,
849
+ "grad_norm": 0.302734375,
850
+ "learning_rate": 0.00015082591979208976,
851
+ "loss": 0.9796,
852
  "step": 580
853
  },
854
  {
855
  "epoch": 4.0,
856
+ "eval_loss": 2.5387372970581055,
857
+ "eval_runtime": 0.5485,
858
+ "eval_samples_per_second": 18.233,
859
+ "eval_steps_per_second": 1.823,
860
  "step": 584
861
  },
862
  {
863
  "epoch": 4.006849315068493,
864
+ "grad_norm": 0.369140625,
865
+ "learning_rate": 0.000149792802569859,
866
+ "loss": 0.981,
867
  "step": 585
868
  },
869
  {
870
  "epoch": 4.041095890410959,
871
+ "grad_norm": 0.31640625,
872
+ "learning_rate": 0.00014875256976571135,
873
+ "loss": 0.9259,
874
  "step": 590
875
  },
876
  {
877
  "epoch": 4.075342465753424,
878
+ "grad_norm": 0.302734375,
879
+ "learning_rate": 0.0001477053700328929,
880
+ "loss": 0.9421,
881
  "step": 595
882
  },
883
  {
884
  "epoch": 4.109589041095891,
885
+ "grad_norm": 0.310546875,
886
+ "learning_rate": 0.00014665135302025035,
887
+ "loss": 0.9348,
888
  "step": 600
889
  },
890
  {
891
  "epoch": 4.1438356164383565,
892
+ "grad_norm": 0.322265625,
893
+ "learning_rate": 0.00014559066935084588,
894
+ "loss": 0.9353,
895
  "step": 605
896
  },
897
  {
898
  "epoch": 4.178082191780822,
899
+ "grad_norm": 0.314453125,
900
+ "learning_rate": 0.00014452347060043237,
901
+ "loss": 0.9267,
902
  "step": 610
903
  },
904
  {
905
  "epoch": 4.212328767123288,
906
+ "grad_norm": 0.359375,
907
+ "learning_rate": 0.00014344990927579268,
908
+ "loss": 0.9451,
909
  "step": 615
910
  },
911
  {
912
  "epoch": 4.2465753424657535,
913
+ "grad_norm": 0.486328125,
914
+ "learning_rate": 0.0001423701387929459,
915
+ "loss": 0.9245,
916
  "step": 620
917
  },
918
  {
919
  "epoch": 4.280821917808219,
920
+ "grad_norm": 0.4765625,
921
+ "learning_rate": 0.0001412843134552235,
922
+ "loss": 0.9371,
923
  "step": 625
924
  },
925
  {
926
  "epoch": 4.315068493150685,
927
+ "grad_norm": 0.38671875,
928
+ "learning_rate": 0.00014019258843121893,
929
+ "loss": 0.9309,
930
  "step": 630
931
  },
932
  {
933
  "epoch": 4.3493150684931505,
934
+ "grad_norm": 0.4453125,
935
+ "learning_rate": 0.0001390951197326134,
936
+ "loss": 0.9426,
937
  "step": 635
938
  },
939
  {
940
  "epoch": 4.383561643835616,
941
+ "grad_norm": 0.474609375,
942
+ "learning_rate": 0.00013799206419188103,
943
+ "loss": 0.9359,
944
  "step": 640
945
  },
946
  {
947
  "epoch": 4.417808219178082,
948
+ "grad_norm": 0.396484375,
949
+ "learning_rate": 0.00013688357943987732,
950
+ "loss": 0.945,
951
  "step": 645
952
  },
953
  {
954
  "epoch": 4.4520547945205475,
955
+ "grad_norm": 0.36328125,
956
+ "learning_rate": 0.0001357698238833126,
957
+ "loss": 0.9341,
958
  "step": 650
959
  },
960
  {
961
  "epoch": 4.486301369863014,
962
+ "grad_norm": 0.361328125,
963
+ "learning_rate": 0.0001346509566821153,
964
+ "loss": 0.9454,
965
  "step": 655
966
  },
967
  {
968
  "epoch": 4.52054794520548,
969
+ "grad_norm": 0.341796875,
970
+ "learning_rate": 0.00013352713772668765,
971
+ "loss": 0.9414,
972
  "step": 660
973
  },
974
  {
975
  "epoch": 4.554794520547945,
976
+ "grad_norm": 0.431640625,
977
+ "learning_rate": 0.00013239852761505626,
978
+ "loss": 0.9429,
979
  "step": 665
980
  },
981
  {
982
  "epoch": 4.589041095890411,
983
+ "grad_norm": 0.40625,
984
+ "learning_rate": 0.00013126528762992247,
985
+ "loss": 0.947,
986
  "step": 670
987
  },
988
  {
989
  "epoch": 4.623287671232877,
990
+ "grad_norm": 0.3125,
991
+ "learning_rate": 0.00013012757971561415,
992
+ "loss": 0.9387,
993
  "step": 675
994
  },
995
  {
996
  "epoch": 4.657534246575342,
997
+ "grad_norm": 0.33984375,
998
+ "learning_rate": 0.00012898556645494325,
999
+ "loss": 0.9485,
1000
  "step": 680
1001
  },
1002
  {
1003
  "epoch": 4.691780821917808,
1004
+ "grad_norm": 0.326171875,
1005
+ "learning_rate": 0.0001278394110459724,
1006
+ "loss": 0.9457,
1007
  "step": 685
1008
  },
1009
  {
1010
  "epoch": 4.726027397260274,
1011
+ "grad_norm": 0.341796875,
1012
+ "learning_rate": 0.0001266892772786929,
1013
+ "loss": 0.9463,
1014
  "step": 690
1015
  },
1016
  {
1017
  "epoch": 4.760273972602739,
1018
+ "grad_norm": 0.31640625,
1019
+ "learning_rate": 0.0001255353295116187,
1020
+ "loss": 0.9518,
1021
  "step": 695
1022
  },
1023
  {
1024
  "epoch": 4.794520547945205,
1025
+ "grad_norm": 0.349609375,
1026
+ "learning_rate": 0.00012437773264829897,
1027
+ "loss": 0.9382,
1028
  "step": 700
1029
  },
1030
  {
1031
  "epoch": 4.828767123287671,
1032
+ "grad_norm": 0.34765625,
1033
+ "learning_rate": 0.00012321665211375256,
1034
+ "loss": 0.9485,
1035
  "step": 705
1036
  },
1037
  {
1038
  "epoch": 4.863013698630137,
1039
+ "grad_norm": 0.3203125,
1040
+ "learning_rate": 0.00012205225383082843,
1041
+ "loss": 0.9393,
1042
  "step": 710
1043
  },
1044
  {
1045
  "epoch": 4.897260273972603,
1046
  "grad_norm": 0.359375,
1047
+ "learning_rate": 0.00012088470419649432,
1048
+ "loss": 0.9568,
1049
  "step": 715
1050
  },
1051
  {
1052
  "epoch": 4.931506849315069,
1053
+ "grad_norm": 0.337890625,
1054
+ "learning_rate": 0.00011971417005805818,
1055
+ "loss": 0.9352,
1056
  "step": 720
1057
  },
1058
  {
1059
  "epoch": 4.965753424657534,
1060
+ "grad_norm": 0.353515625,
1061
+ "learning_rate": 0.0001185408186893251,
1062
+ "loss": 0.9383,
1063
  "step": 725
1064
  },
1065
  {
1066
  "epoch": 5.0,
1067
+ "grad_norm": 0.322265625,
1068
+ "learning_rate": 0.00011736481776669306,
1069
+ "loss": 0.9366,
1070
  "step": 730
1071
  },
1072
  {
1073
  "epoch": 5.0,
1074
+ "eval_loss": 2.6037707328796387,
1075
+ "eval_runtime": 0.5451,
1076
+ "eval_samples_per_second": 18.346,
1077
+ "eval_steps_per_second": 1.835,
1078
  "step": 730
1079
  },
1080
  {
1081
  "epoch": 5.034246575342466,
1082
+ "grad_norm": 0.349609375,
1083
+ "learning_rate": 0.00011618633534519141,
1084
+ "loss": 0.9013,
1085
  "step": 735
1086
  },
1087
  {
1088
  "epoch": 5.068493150684931,
1089
+ "grad_norm": 0.31640625,
1090
+ "learning_rate": 0.00011500553983446527,
1091
+ "loss": 0.9034,
1092
  "step": 740
1093
  },
1094
  {
1095
  "epoch": 5.102739726027397,
1096
+ "grad_norm": 0.34765625,
1097
+ "learning_rate": 0.00011382259997470899,
1098
+ "loss": 0.8925,
1099
  "step": 745
1100
  },
1101
  {
1102
  "epoch": 5.136986301369863,
1103
+ "grad_norm": 0.345703125,
1104
+ "learning_rate": 0.00011263768481255264,
1105
+ "loss": 0.8901,
1106
  "step": 750
1107
  },
1108
  {
1109
  "epoch": 5.171232876712328,
1110
+ "grad_norm": 0.333984375,
1111
+ "learning_rate": 0.00011145096367690444,
1112
+ "loss": 0.8945,
1113
  "step": 755
1114
  },
1115
  {
1116
  "epoch": 5.205479452054795,
1117
+ "grad_norm": 0.42578125,
1118
+ "learning_rate": 0.00011026260615475333,
1119
+ "loss": 0.8961,
1120
  "step": 760
1121
  },
1122
  {
1123
  "epoch": 5.239726027397261,
1124
+ "grad_norm": 0.4765625,
1125
+ "learning_rate": 0.00010907278206693395,
1126
+ "loss": 0.8911,
1127
  "step": 765
1128
  },
1129
  {
1130
  "epoch": 5.273972602739726,
1131
+ "grad_norm": 0.3828125,
1132
+ "learning_rate": 0.00010788166144385888,
1133
+ "loss": 0.9018,
1134
  "step": 770
1135
  },
1136
  {
1137
  "epoch": 5.308219178082192,
1138
+ "grad_norm": 0.51953125,
1139
+ "learning_rate": 0.00010668941450122055,
1140
+ "loss": 0.9046,
1141
  "step": 775
1142
  },
1143
  {
1144
  "epoch": 5.342465753424658,
1145
+ "grad_norm": 0.54296875,
1146
+ "learning_rate": 0.0001054962116156667,
1147
+ "loss": 0.8964,
1148
  "step": 780
1149
  },
1150
  {
1151
  "epoch": 5.376712328767123,
1152
+ "grad_norm": 0.50390625,
1153
+ "learning_rate": 0.00010430222330045304,
1154
+ "loss": 0.9071,
1155
  "step": 785
1156
  },
1157
  {
1158
  "epoch": 5.410958904109589,
1159
+ "grad_norm": 0.4921875,
1160
+ "learning_rate": 0.0001031076201810762,
1161
+ "loss": 0.8981,
1162
  "step": 790
1163
  },
1164
  {
1165
  "epoch": 5.445205479452055,
1166
+ "grad_norm": 0.3359375,
1167
+ "learning_rate": 0.00010191257297089052,
1168
+ "loss": 0.9054,
1169
  "step": 795
1170
  },
1171
  {
1172
  "epoch": 5.47945205479452,
1173
+ "grad_norm": 0.337890625,
1174
+ "learning_rate": 0.00010071725244671282,
1175
+ "loss": 0.9061,
1176
  "step": 800
1177
  },
1178
  {
1179
  "epoch": 5.513698630136986,
1180
+ "grad_norm": 0.4140625,
1181
+ "learning_rate": 9.952182942441733e-05,
1182
+ "loss": 0.905,
1183
  "step": 805
1184
  },
1185
  {
1186
  "epoch": 5.5479452054794525,
1187
+ "grad_norm": 0.33203125,
1188
+ "learning_rate": 9.83264747345259e-05,
1189
+ "loss": 0.8981,
1190
  "step": 810
1191
  },
1192
  {
1193
  "epoch": 5.582191780821918,
1194
+ "grad_norm": 0.416015625,
1195
+ "learning_rate": 9.713135919779515e-05,
1196
+ "loss": 0.8956,
1197
  "step": 815
1198
  },
1199
  {
1200
  "epoch": 5.616438356164384,
1201
+ "grad_norm": 0.41015625,
1202
+ "learning_rate": 9.593665360080599e-05,
1203
+ "loss": 0.9116,
1204
  "step": 820
1205
  },
1206
  {
1207
  "epoch": 5.6506849315068495,
1208
+ "grad_norm": 0.357421875,
1209
+ "learning_rate": 9.474252867155732e-05,
1210
+ "loss": 0.9002,
1211
  "step": 825
1212
  },
1213
  {
1214
  "epoch": 5.684931506849315,
1215
+ "grad_norm": 0.322265625,
1216
+ "learning_rate": 9.354915505506839e-05,
1217
+ "loss": 0.8971,
1218
  "step": 830
1219
  },
1220
  {
1221
  "epoch": 5.719178082191781,
1222
+ "grad_norm": 0.3359375,
1223
+ "learning_rate": 9.235670328899293e-05,
1224
+ "loss": 0.8988,
1225
  "step": 835
1226
  },
1227
  {
1228
  "epoch": 5.7534246575342465,
1229
+ "grad_norm": 0.3359375,
1230
+ "learning_rate": 9.116534377924883e-05,
1231
+ "loss": 0.8922,
1232
  "step": 840
1233
  },
1234
  {
1235
  "epoch": 5.787671232876712,
1236
+ "grad_norm": 0.32421875,
1237
+ "learning_rate": 8.997524677566627e-05,
1238
+ "loss": 0.8953,
1239
  "step": 845
1240
  },
1241
  {
1242
  "epoch": 5.821917808219178,
1243
+ "grad_norm": 0.326171875,
1244
+ "learning_rate": 8.878658234765858e-05,
1245
+ "loss": 0.8986,
1246
  "step": 850
1247
  },
1248
  {
1249
  "epoch": 5.8561643835616435,
1250
+ "grad_norm": 0.322265625,
1251
+ "learning_rate": 8.759952035991844e-05,
1252
+ "loss": 0.8949,
1253
  "step": 855
1254
  },
1255
  {
1256
  "epoch": 5.890410958904109,
1257
+ "grad_norm": 0.33203125,
1258
+ "learning_rate": 8.641423044814374e-05,
1259
+ "loss": 0.9013,
1260
  "step": 860
1261
  },
1262
  {
1263
  "epoch": 5.924657534246576,
1264
+ "grad_norm": 0.33203125,
1265
+ "learning_rate": 8.5230881994796e-05,
1266
+ "loss": 0.9133,
1267
  "step": 865
1268
  },
1269
  {
1270
  "epoch": 5.958904109589041,
1271
+ "grad_norm": 0.33984375,
1272
+ "learning_rate": 8.404964410489485e-05,
1273
+ "loss": 0.9042,
1274
  "step": 870
1275
  },
1276
  {
1277
  "epoch": 5.993150684931507,
1278
+ "grad_norm": 0.33203125,
1279
+ "learning_rate": 8.287068558185225e-05,
1280
+ "loss": 0.9051,
1281
  "step": 875
1282
  },
1283
  {
1284
  "epoch": 6.0,
1285
+ "eval_loss": 2.6520962715148926,
1286
+ "eval_runtime": 0.5512,
1287
+ "eval_samples_per_second": 18.142,
1288
+ "eval_steps_per_second": 1.814,
1289
  "step": 876
1290
  },
1291
  {
1292
  "epoch": 6.027397260273973,
1293
+ "grad_norm": 0.33203125,
1294
+ "learning_rate": 8.169417490335007e-05,
1295
+ "loss": 0.8764,
1296
  "step": 880
1297
  },
1298
  {
1299
  "epoch": 6.061643835616438,
1300
+ "grad_norm": 0.34765625,
1301
+ "learning_rate": 8.052028019726371e-05,
1302
+ "loss": 0.8608,
1303
  "step": 885
1304
  },
1305
  {
1306
  "epoch": 6.095890410958904,
1307
+ "grad_norm": 0.349609375,
1308
+ "learning_rate": 7.934916921763628e-05,
1309
+ "loss": 0.8474,
1310
  "step": 890
1311
  },
1312
  {
1313
  "epoch": 6.13013698630137,
1314
+ "grad_norm": 0.361328125,
1315
+ "learning_rate": 7.818100932070546e-05,
1316
+ "loss": 0.8558,
1317
  "step": 895
1318
  },
1319
  {
1320
  "epoch": 6.164383561643835,
1321
+ "grad_norm": 0.34375,
1322
+ "learning_rate": 7.701596744098818e-05,
1323
+ "loss": 0.858,
1324
  "step": 900
1325
  },
1326
  {
1327
  "epoch": 6.198630136986301,
1328
+ "grad_norm": 0.34375,
1329
+ "learning_rate": 7.585421006742463e-05,
1330
+ "loss": 0.8568,
1331
  "step": 905
1332
  },
1333
  {
1334
  "epoch": 6.232876712328767,
1335
+ "grad_norm": 0.333984375,
1336
+ "learning_rate": 7.469590321958662e-05,
1337
+ "loss": 0.8626,
1338
  "step": 910
1339
  },
1340
  {
1341
  "epoch": 6.267123287671233,
1342
+ "grad_norm": 0.353515625,
1343
+ "learning_rate": 7.354121242395254e-05,
1344
+ "loss": 0.8685,
1345
  "step": 915
1346
  },
1347
  {
1348
  "epoch": 6.301369863013699,
1349
+ "grad_norm": 0.3828125,
1350
+ "learning_rate": 7.239030269025311e-05,
1351
+ "loss": 0.8621,
1352
  "step": 920
1353
  },
1354
  {
1355
  "epoch": 6.335616438356165,
1356
+ "grad_norm": 0.404296875,
1357
+ "learning_rate": 7.124333848789091e-05,
1358
+ "loss": 0.8687,
1359
  "step": 925
1360
  },
1361
  {
1362
  "epoch": 6.36986301369863,
1363
+ "grad_norm": 0.35546875,
1364
+ "learning_rate": 7.010048372243698e-05,
1365
+ "loss": 0.8691,
1366
  "step": 930
1367
  },
1368
  {
1369
  "epoch": 6.404109589041096,
1370
+ "grad_norm": 0.44140625,
1371
+ "learning_rate": 6.8961901712208e-05,
1372
+ "loss": 0.8651,
1373
  "step": 935
1374
  },
1375
  {
1376
  "epoch": 6.438356164383562,
1377
+ "grad_norm": 0.353515625,
1378
+ "learning_rate": 6.782775516492771e-05,
1379
+ "loss": 0.8656,
1380
  "step": 940
1381
  },
1382
  {
1383
  "epoch": 6.472602739726027,
1384
+ "grad_norm": 0.384765625,
1385
+ "learning_rate": 6.669820615447522e-05,
1386
+ "loss": 0.8683,
1387
  "step": 945
1388
  },
1389
  {
1390
  "epoch": 6.506849315068493,
1391
+ "grad_norm": 0.361328125,
1392
+ "learning_rate": 6.5573416097724e-05,
1393
+ "loss": 0.8632,
1394
  "step": 950
1395
  },
1396
  {
1397
  "epoch": 6.541095890410959,
1398
+ "grad_norm": 0.353515625,
1399
+ "learning_rate": 6.445354573147484e-05,
1400
+ "loss": 0.8693,
1401
  "step": 955
1402
  },
1403
  {
1404
  "epoch": 6.575342465753424,
1405
+ "grad_norm": 0.341796875,
1406
+ "learning_rate": 6.333875508948593e-05,
1407
+ "loss": 0.8724,
1408
  "step": 960
1409
  },
1410
  {
1411
  "epoch": 6.609589041095891,
1412
+ "grad_norm": 0.35546875,
1413
+ "learning_rate": 6.22292034796035e-05,
1414
+ "loss": 0.8659,
1415
  "step": 965
1416
  },
1417
  {
1418
  "epoch": 6.6438356164383565,
1419
+ "grad_norm": 0.357421875,
1420
+ "learning_rate": 6.112504946099604e-05,
1421
+ "loss": 0.8674,
1422
  "step": 970
1423
  },
1424
  {
1425
  "epoch": 6.678082191780822,
1426
+ "grad_norm": 0.390625,
1427
+ "learning_rate": 6.0026450821495536e-05,
1428
+ "loss": 0.8762,
1429
  "step": 975
1430
  },
1431
  {
1432
  "epoch": 6.712328767123288,
1433
+ "grad_norm": 0.33984375,
1434
+ "learning_rate": 5.8933564555049105e-05,
1435
+ "loss": 0.8581,
1436
  "step": 980
1437
  },
1438
  {
1439
  "epoch": 6.7465753424657535,
1440
+ "grad_norm": 0.34765625,
1441
+ "learning_rate": 5.784654683928391e-05,
1442
+ "loss": 0.8718,
1443
  "step": 985
1444
  },
1445
  {
1446
  "epoch": 6.780821917808219,
1447
+ "grad_norm": 0.36328125,
1448
+ "learning_rate": 5.6765553013188766e-05,
1449
+ "loss": 0.8637,
1450
  "step": 990
1451
  },
1452
  {
1453
  "epoch": 6.815068493150685,
1454
+ "grad_norm": 0.361328125,
1455
+ "learning_rate": 5.5690737554915604e-05,
1456
+ "loss": 0.8684,
1457
  "step": 995
1458
  },
1459
  {
1460
  "epoch": 6.8493150684931505,
1461
+ "grad_norm": 0.353515625,
1462
+ "learning_rate": 5.462225405970401e-05,
1463
+ "loss": 0.8693,
1464
  "step": 1000
1465
  },
1466
  {
1467
  "epoch": 6.883561643835616,
1468
+ "grad_norm": 0.34765625,
1469
+ "learning_rate": 5.3560255217931785e-05,
1470
+ "loss": 0.871,
1471
  "step": 1005
1472
  },
1473
  {
1474
  "epoch": 6.917808219178082,
1475
+ "grad_norm": 0.369140625,
1476
+ "learning_rate": 5.2504892793295e-05,
1477
+ "loss": 0.8644,
1478
  "step": 1010
1479
  },
1480
  {
1481
  "epoch": 6.9520547945205475,
1482
+ "grad_norm": 0.34375,
1483
+ "learning_rate": 5.145631760112022e-05,
1484
+ "loss": 0.8688,
1485
  "step": 1015
1486
  },
1487
  {
1488
  "epoch": 6.986301369863014,
1489
+ "grad_norm": 0.34765625,
1490
+ "learning_rate": 5.041467948681269e-05,
1491
+ "loss": 0.8676,
1492
  "step": 1020
1493
  },
1494
  {
1495
  "epoch": 7.0,
1496
+ "eval_loss": 2.7248690128326416,
1497
+ "eval_runtime": 0.5502,
1498
+ "eval_samples_per_second": 18.174,
1499
+ "eval_steps_per_second": 1.817,
1500
  "step": 1022
1501
  },
1502
  {
1503
  "epoch": 7.02054794520548,
1504
+ "grad_norm": 0.328125,
1505
+ "learning_rate": 4.9380127304442634e-05,
1506
+ "loss": 0.8501,
1507
  "step": 1025
1508
  },
1509
  {
1510
  "epoch": 7.054794520547945,
1511
+ "grad_norm": 0.357421875,
1512
+ "learning_rate": 4.835280889547351e-05,
1513
+ "loss": 0.8375,
1514
  "step": 1030
1515
  },
1516
  {
1517
  "epoch": 7.089041095890411,
1518
+ "grad_norm": 0.3515625,
1519
+ "learning_rate": 4.733287106763481e-05,
1520
+ "loss": 0.8297,
1521
  "step": 1035
1522
  },
1523
  {
1524
  "epoch": 7.123287671232877,
1525
+ "grad_norm": 0.375,
1526
+ "learning_rate": 4.6320459573942856e-05,
1527
+ "loss": 0.8336,
1528
  "step": 1040
1529
  },
1530
  {
1531
  "epoch": 7.157534246575342,
1532
+ "grad_norm": 0.365234375,
1533
+ "learning_rate": 4.531571909187197e-05,
1534
+ "loss": 0.832,
1535
  "step": 1045
1536
  },
1537
  {
1538
  "epoch": 7.191780821917808,
1539
+ "grad_norm": 0.376953125,
1540
+ "learning_rate": 4.431879320267972e-05,
1541
+ "loss": 0.8393,
1542
  "step": 1050
1543
  },
1544
  {
1545
  "epoch": 7.226027397260274,
1546
+ "grad_norm": 0.361328125,
1547
+ "learning_rate": 4.332982437088825e-05,
1548
+ "loss": 0.8425,
1549
  "step": 1055
1550
  },
1551
  {
1552
  "epoch": 7.260273972602739,
1553
+ "grad_norm": 0.34765625,
1554
+ "learning_rate": 4.2348953923925916e-05,
1555
+ "loss": 0.846,
1556
  "step": 1060
1557
  },
1558
  {
1559
  "epoch": 7.294520547945205,
1560
+ "grad_norm": 0.349609375,
1561
+ "learning_rate": 4.137632203193086e-05,
1562
+ "loss": 0.8396,
1563
  "step": 1065
1564
  },
1565
  {
1566
  "epoch": 7.328767123287671,
1567
+ "grad_norm": 0.3515625,
1568
+ "learning_rate": 4.041206768772022e-05,
1569
+ "loss": 0.836,
1570
  "step": 1070
1571
  },
1572
  {
1573
  "epoch": 7.363013698630137,
1574
+ "grad_norm": 0.365234375,
1575
+ "learning_rate": 3.9456328686927525e-05,
1576
+ "loss": 0.8442,
1577
  "step": 1075
1578
  },
1579
  {
1580
  "epoch": 7.397260273972603,
1581
+ "grad_norm": 0.34765625,
1582
+ "learning_rate": 3.850924160831115e-05,
1583
+ "loss": 0.8358,
1584
  "step": 1080
1585
  },
1586
  {
1587
  "epoch": 7.431506849315069,
1588
+ "grad_norm": 0.3515625,
1589
+ "learning_rate": 3.757094179423672e-05,
1590
+ "loss": 0.8381,
1591
  "step": 1085
1592
  },
1593
  {
1594
  "epoch": 7.465753424657534,
1595
+ "grad_norm": 0.349609375,
1596
+ "learning_rate": 3.6641563331336125e-05,
1597
+ "loss": 0.837,
1598
  "step": 1090
1599
  },
1600
  {
1601
  "epoch": 7.5,
1602
+ "grad_norm": 0.359375,
1603
+ "learning_rate": 3.5721239031346066e-05,
1604
+ "loss": 0.8435,
1605
  "step": 1095
1606
  },
1607
  {
1608
  "epoch": 7.534246575342466,
1609
+ "grad_norm": 0.376953125,
1610
+ "learning_rate": 3.4810100412128747e-05,
1611
+ "loss": 0.8412,
1612
  "step": 1100
1613
  },
1614
  {
1615
  "epoch": 7.568493150684931,
1616
+ "grad_norm": 0.34765625,
1617
+ "learning_rate": 3.3908277678877445e-05,
1618
+ "loss": 0.8361,
1619
  "step": 1105
1620
  },
1621
  {
1622
  "epoch": 7.602739726027397,
1623
+ "grad_norm": 0.359375,
1624
+ "learning_rate": 3.3015899705509734e-05,
1625
+ "loss": 0.84,
1626
  "step": 1110
1627
  },
1628
  {
1629
  "epoch": 7.636986301369863,
1630
+ "grad_norm": 0.369140625,
1631
+ "learning_rate": 3.21330940162508e-05,
1632
+ "loss": 0.8427,
1633
  "step": 1115
1634
  },
1635
  {
1636
  "epoch": 7.671232876712329,
1637
+ "grad_norm": 0.373046875,
1638
+ "learning_rate": 3.125998676740987e-05,
1639
+ "loss": 0.8443,
1640
  "step": 1120
1641
  },
1642
  {
1643
  "epoch": 7.705479452054795,
1644
+ "grad_norm": 0.36328125,
1645
+ "learning_rate": 3.0396702729352023e-05,
1646
+ "loss": 0.841,
1647
  "step": 1125
1648
  },
1649
  {
1650
  "epoch": 7.739726027397261,
1651
+ "grad_norm": 0.353515625,
1652
+ "learning_rate": 2.9543365268667867e-05,
1653
+ "loss": 0.8482,
1654
  "step": 1130
1655
  },
1656
  {
1657
  "epoch": 7.773972602739726,
1658
+ "grad_norm": 0.349609375,
1659
+ "learning_rate": 2.8700096330544012e-05,
1660
+ "loss": 0.8388,
1661
  "step": 1135
1662
  },
1663
  {
1664
  "epoch": 7.808219178082192,
1665
+ "grad_norm": 0.3515625,
1666
+ "learning_rate": 2.7867016421336776e-05,
1667
+ "loss": 0.8486,
1668
  "step": 1140
1669
  },
1670
  {
1671
  "epoch": 7.842465753424658,
1672
+ "grad_norm": 0.34765625,
1673
+ "learning_rate": 2.7044244591351232e-05,
1674
+ "loss": 0.8453,
1675
  "step": 1145
1676
  },
1677
  {
1678
  "epoch": 7.876712328767123,
1679
+ "grad_norm": 0.36328125,
1680
+ "learning_rate": 2.6231898417828603e-05,
1681
+ "loss": 0.8358,
1682
  "step": 1150
1683
  },
1684
  {
1685
  "epoch": 7.910958904109589,
1686
+ "grad_norm": 0.35546875,
1687
+ "learning_rate": 2.5430093988143778e-05,
1688
+ "loss": 0.8344,
1689
  "step": 1155
1690
  },
1691
  {
1692
  "epoch": 7.945205479452055,
1693
+ "grad_norm": 0.35546875,
1694
+ "learning_rate": 2.4638945883216235e-05,
1695
+ "loss": 0.8312,
1696
  "step": 1160
1697
  },
1698
  {
1699
  "epoch": 7.97945205479452,
1700
+ "grad_norm": 0.35546875,
1701
+ "learning_rate": 2.385856716113587e-05,
1702
+ "loss": 0.8291,
1703
  "step": 1165
1704
  },
1705
  {
1706
  "epoch": 8.0,
1707
+ "eval_loss": 2.766744613647461,
1708
+ "eval_runtime": 0.5415,
1709
+ "eval_samples_per_second": 18.469,
1710
+ "eval_steps_per_second": 1.847,
1711
  "step": 1168
1712
  },
1713
  {
1714
  "epoch": 8.013698630136986,
1715
+ "grad_norm": 0.369140625,
1716
+ "learning_rate": 2.3089069341006565e-05,
1717
+ "loss": 0.8283,
1718
  "step": 1170
1719
  },
1720
  {
1721
  "epoch": 8.047945205479452,
1722
+ "grad_norm": 0.349609375,
1723
+ "learning_rate": 2.2330562387009745e-05,
1724
+ "loss": 0.8203,
1725
  "step": 1175
1726
  },
1727
  {
1728
  "epoch": 8.082191780821917,
1729
+ "grad_norm": 0.353515625,
1730
+ "learning_rate": 2.1583154692689976e-05,
1731
+ "loss": 0.8221,
1732
  "step": 1180
1733
  },
1734
  {
1735
  "epoch": 8.116438356164384,
1736
+ "grad_norm": 0.36328125,
1737
+ "learning_rate": 2.08469530654652e-05,
1738
+ "loss": 0.8256,
1739
  "step": 1185
1740
  },
1741
  {
1742
  "epoch": 8.150684931506849,
1743
+ "grad_norm": 0.36328125,
1744
+ "learning_rate": 2.0122062711363532e-05,
1745
+ "loss": 0.8139,
1746
  "step": 1190
1747
  },
1748
  {
1749
  "epoch": 8.184931506849315,
1750
+ "grad_norm": 0.357421875,
1751
+ "learning_rate": 1.9408587219988805e-05,
1752
+ "loss": 0.8351,
1753
  "step": 1195
1754
  },
1755
  {
1756
  "epoch": 8.219178082191782,
1757
+ "grad_norm": 0.359375,
1758
+ "learning_rate": 1.8706628549717452e-05,
1759
+ "loss": 0.8258,
1760
  "step": 1200
1761
  },
1762
  {
1763
  "epoch": 8.253424657534246,
1764
+ "grad_norm": 0.357421875,
1765
+ "learning_rate": 1.8016287013128018e-05,
1766
+ "loss": 0.8139,
1767
  "step": 1205
1768
  },
1769
  {
1770
  "epoch": 8.287671232876713,
1771
+ "grad_norm": 0.357421875,
1772
+ "learning_rate": 1.7337661262666294e-05,
1773
+ "loss": 0.8305,
1774
  "step": 1210
1775
  },
1776
  {
1777
  "epoch": 8.321917808219178,
1778
+ "grad_norm": 0.353515625,
1779
+ "learning_rate": 1.6670848276547334e-05,
1780
+ "loss": 0.8239,
1781
  "step": 1215
1782
  },
1783
  {
1784
  "epoch": 8.356164383561644,
1785
+ "grad_norm": 0.36328125,
1786
+ "learning_rate": 1.601594334489702e-05,
1787
+ "loss": 0.8252,
1788
  "step": 1220
1789
  },
1790
  {
1791
  "epoch": 8.39041095890411,
1792
+ "grad_norm": 0.36328125,
1793
+ "learning_rate": 1.5373040056134814e-05,
1794
+ "loss": 0.8194,
1795
  "step": 1225
1796
  },
1797
  {
1798
  "epoch": 8.424657534246576,
1799
+ "grad_norm": 0.359375,
1800
+ "learning_rate": 1.474223028359939e-05,
1801
+ "loss": 0.8188,
1802
  "step": 1230
1803
  },
1804
  {
1805
  "epoch": 8.45890410958904,
1806
+ "grad_norm": 0.359375,
1807
+ "learning_rate": 1.4123604172419713e-05,
1808
+ "loss": 0.8206,
1809
  "step": 1235
1810
  },
1811
  {
1812
  "epoch": 8.493150684931507,
1813
+ "grad_norm": 0.35546875,
1814
+ "learning_rate": 1.3517250126632986e-05,
1815
+ "loss": 0.8223,
1816
  "step": 1240
1817
  },
1818
  {
1819
  "epoch": 8.527397260273972,
1820
+ "grad_norm": 0.357421875,
1821
+ "learning_rate": 1.292325479655131e-05,
1822
+ "loss": 0.8257,
1823
  "step": 1245
1824
  },
1825
  {
1826
  "epoch": 8.561643835616438,
1827
+ "grad_norm": 0.349609375,
1828
+ "learning_rate": 1.2341703066379074e-05,
1829
+ "loss": 0.8239,
1830
  "step": 1250
1831
  },
1832
  {
1833
  "epoch": 8.595890410958905,
1834
+ "grad_norm": 0.345703125,
1835
+ "learning_rate": 1.1772678042082607e-05,
1836
+ "loss": 0.8173,
1837
  "step": 1255
1838
  },
1839
  {
1840
  "epoch": 8.63013698630137,
1841
+ "grad_norm": 0.357421875,
1842
+ "learning_rate": 1.1216261039514087e-05,
1843
+ "loss": 0.8266,
1844
  "step": 1260
1845
  },
1846
  {
1847
  "epoch": 8.664383561643836,
1848
+ "grad_norm": 0.3515625,
1849
+ "learning_rate": 1.0672531572791178e-05,
1850
+ "loss": 0.8237,
1851
  "step": 1265
1852
  },
1853
  {
1854
  "epoch": 8.698630136986301,
1855
+ "grad_norm": 0.359375,
1856
+ "learning_rate": 1.0141567342934132e-05,
1857
+ "loss": 0.8335,
1858
  "step": 1270
1859
  },
1860
  {
1861
  "epoch": 8.732876712328768,
1862
+ "grad_norm": 0.353515625,
1863
+ "learning_rate": 9.623444226762035e-06,
1864
+ "loss": 0.8239,
1865
  "step": 1275
1866
  },
1867
  {
1868
  "epoch": 8.767123287671232,
1869
+ "grad_norm": 0.353515625,
1870
+ "learning_rate": 9.118236266049707e-06,
1871
+ "loss": 0.8308,
1872
  "step": 1280
1873
  },
1874
  {
1875
  "epoch": 8.801369863013699,
1876
+ "grad_norm": 0.34765625,
1877
+ "learning_rate": 8.626015656946895e-06,
1878
+ "loss": 0.8266,
1879
  "step": 1285
1880
  },
1881
  {
1882
  "epoch": 8.835616438356164,
1883
+ "grad_norm": 0.341796875,
1884
+ "learning_rate": 8.146852739661105e-06,
1885
+ "loss": 0.8248,
1886
  "step": 1290
1887
  },
1888
  {
1889
  "epoch": 8.86986301369863,
1890
+ "grad_norm": 0.353515625,
1891
+ "learning_rate": 7.6808159884057e-06,
1892
+ "loss": 0.8322,
1893
  "step": 1295
1894
  },
1895
  {
1896
  "epoch": 8.904109589041095,
1897
+ "grad_norm": 0.349609375,
1898
+ "learning_rate": 7.2279720016148244e-06,
1899
+ "loss": 0.823,
1900
  "step": 1300
1901
  },
1902
  {
1903
  "epoch": 8.938356164383562,
1904
+ "grad_norm": 0.3515625,
1905
+ "learning_rate": 6.788385492426053e-06,
1906
+ "loss": 0.825,
1907
  "step": 1305
1908
  },
1909
  {
1910
  "epoch": 8.972602739726028,
1911
+ "grad_norm": 0.357421875,
1912
+ "learning_rate": 6.36211927943271e-06,
1913
+ "loss": 0.8286,
1914
  "step": 1310
1915
  },
1916
  {
1917
  "epoch": 9.0,
1918
+ "eval_loss": 2.7898991107940674,
1919
+ "eval_runtime": 0.5501,
1920
+ "eval_samples_per_second": 18.177,
1921
+ "eval_steps_per_second": 1.818,
1922
  "step": 1314
1923
  },
1924
  {
1925
  "epoch": 9.006849315068493,
1926
+ "grad_norm": 0.34765625,
1927
+ "learning_rate": 5.949234277706861e-06,
1928
+ "loss": 0.8231,
1929
  "step": 1315
1930
  },
1931
  {
1932
  "epoch": 9.04109589041096,
1933
+ "grad_norm": 0.349609375,
1934
+ "learning_rate": 5.549789490094304e-06,
1935
+ "loss": 0.8237,
1936
  "step": 1320
1937
  },
1938
  {
1939
  "epoch": 9.075342465753424,
1940
+ "grad_norm": 0.349609375,
1941
+ "learning_rate": 5.163841998782837e-06,
1942
+ "loss": 0.8125,
1943
  "step": 1325
1944
  },
1945
  {
1946
  "epoch": 9.10958904109589,
1947
+ "grad_norm": 0.34375,
1948
+ "learning_rate": 4.79144695714504e-06,
1949
+ "loss": 0.8179,
1950
  "step": 1330
1951
  },
1952
  {
1953
  "epoch": 9.143835616438356,
1954
+ "grad_norm": 0.34375,
1955
+ "learning_rate": 4.432657581856525e-06,
1956
+ "loss": 0.8173,
1957
  "step": 1335
1958
  },
1959
  {
1960
  "epoch": 9.178082191780822,
1961
+ "grad_norm": 0.37109375,
1962
+ "learning_rate": 4.087525145291204e-06,
1963
+ "loss": 0.8186,
1964
  "step": 1340
1965
  },
1966
  {
1967
  "epoch": 9.212328767123287,
1968
+ "grad_norm": 0.3515625,
1969
+ "learning_rate": 3.7560989681941992e-06,
1970
+ "loss": 0.8233,
1971
  "step": 1345
1972
  },
1973
  {
1974
  "epoch": 9.246575342465754,
1975
+ "grad_norm": 0.345703125,
1976
+ "learning_rate": 3.4384264126337328e-06,
1977
+ "loss": 0.8147,
1978
  "step": 1350
1979
  },
1980
  {
1981
  "epoch": 9.280821917808218,
1982
+ "grad_norm": 0.3515625,
1983
+ "learning_rate": 3.1345528752329212e-06,
1984
+ "loss": 0.8076,
1985
  "step": 1355
1986
  },
1987
  {
1988
  "epoch": 9.315068493150685,
1989
+ "grad_norm": 0.3515625,
1990
+ "learning_rate": 2.8445217806824077e-06,
1991
+ "loss": 0.8259,
1992
  "step": 1360
1993
  },
1994
  {
1995
  "epoch": 9.349315068493151,
1996
+ "grad_norm": 0.3515625,
1997
+ "learning_rate": 2.5683745755348044e-06,
1998
+ "loss": 0.819,
1999
  "step": 1365
2000
  },
2001
  {
2002
  "epoch": 9.383561643835616,
2003
+ "grad_norm": 0.361328125,
2004
+ "learning_rate": 2.30615072228183e-06,
2005
+ "loss": 0.8218,
2006
  "step": 1370
2007
  },
2008
  {
2009
  "epoch": 9.417808219178083,
2010
+ "grad_norm": 0.359375,
2011
+ "learning_rate": 2.057887693714988e-06,
2012
+ "loss": 0.8166,
2013
  "step": 1375
2014
  },
2015
  {
2016
  "epoch": 9.452054794520548,
2017
+ "grad_norm": 0.3515625,
2018
+ "learning_rate": 1.8236209675705274e-06,
2019
+ "loss": 0.815,
2020
  "step": 1380
2021
  },
2022
  {
2023
  "epoch": 9.486301369863014,
2024
+ "grad_norm": 0.3515625,
2025
+ "learning_rate": 1.6033840214595308e-06,
2026
+ "loss": 0.8287,
2027
  "step": 1385
2028
  },
2029
  {
2030
  "epoch": 9.520547945205479,
2031
+ "grad_norm": 0.349609375,
2032
+ "learning_rate": 1.397208328083921e-06,
2033
+ "loss": 0.8239,
2034
  "step": 1390
2035
  },
2036
  {
2037
  "epoch": 9.554794520547945,
2038
+ "grad_norm": 0.3515625,
2039
+ "learning_rate": 1.205123350738746e-06,
2040
+ "loss": 0.8238,
2041
  "step": 1395
2042
  },
2043
  {
2044
  "epoch": 9.58904109589041,
2045
+ "grad_norm": 0.34765625,
2046
+ "learning_rate": 1.0271565391018922e-06,
2047
+ "loss": 0.8231,
2048
  "step": 1400
2049
  },
2050
  {
2051
  "epoch": 9.623287671232877,
2052
+ "grad_norm": 0.361328125,
2053
+ "learning_rate": 8.633333253113995e-07,
2054
+ "loss": 0.8246,
2055
  "step": 1405
2056
  },
2057
  {
2058
  "epoch": 9.657534246575342,
2059
+ "grad_norm": 0.361328125,
2060
+ "learning_rate": 7.136771203310245e-07,
2061
+ "loss": 0.8131,
2062
  "step": 1410
2063
  },
2064
  {
2065
  "epoch": 9.691780821917808,
2066
+ "grad_norm": 0.3515625,
2067
+ "learning_rate": 5.782093106048159e-07,
2068
+ "loss": 0.8196,
2069
  "step": 1415
2070
  },
2071
  {
2072
  "epoch": 9.726027397260275,
2073
+ "grad_norm": 0.345703125,
2074
+ "learning_rate": 4.569492550008603e-07,
2075
+ "loss": 0.8097,
2076
  "step": 1420
2077
  },
2078
  {
2079
  "epoch": 9.76027397260274,
2080
+ "grad_norm": 0.36328125,
2081
+ "learning_rate": 3.49914282044872e-07,
2082
+ "loss": 0.8272,
2083
  "step": 1425
2084
  },
2085
  {
2086
  "epoch": 9.794520547945206,
2087
+ "grad_norm": 0.365234375,
2088
+ "learning_rate": 2.5711968744382974e-07,
2089
+ "loss": 0.8285,
2090
  "step": 1430
2091
  },
2092
  {
2093
  "epoch": 9.82876712328767,
2094
+ "grad_norm": 0.353515625,
2095
+ "learning_rate": 1.7857873190019192e-07,
2096
+ "loss": 0.8323,
2097
  "step": 1435
2098
  },
2099
  {
2100
  "epoch": 9.863013698630137,
2101
+ "grad_norm": 0.359375,
2102
+ "learning_rate": 1.143026392168789e-07,
2103
+ "loss": 0.822,
2104
  "step": 1440
2105
  },
2106
  {
2107
  "epoch": 9.897260273972602,
2108
+ "grad_norm": 0.353515625,
2109
+ "learning_rate": 6.430059469334504e-08,
2110
+ "loss": 0.8329,
2111
  "step": 1445
2112
  },
2113
  {
2114
  "epoch": 9.931506849315069,
2115
+ "grad_norm": 0.349609375,
2116
+ "learning_rate": 2.8579743813006432e-08,
2117
+ "loss": 0.8213,
2118
  "step": 1450
2119
  },
2120
  {
2121
  "epoch": 9.965753424657533,
2122
+ "grad_norm": 0.35546875,
2123
+ "learning_rate": 7.145191222035497e-09,
2124
+ "loss": 0.8169,
2125
  "step": 1455
2126
  },
2127
  {
2128
  "epoch": 10.0,
2129
+ "grad_norm": 0.35546875,
2130
+ "learning_rate": 0.0,
2131
+ "loss": 0.8185,
2132
  "step": 1460
2133
  },
2134
  {
2135
  "epoch": 10.0,
2136
+ "eval_loss": 2.793144464492798,
2137
+ "eval_runtime": 0.5503,
2138
+ "eval_samples_per_second": 18.172,
2139
+ "eval_steps_per_second": 1.817,
2140
  "step": 1460
2141
  },
2142
  {
2143
+ "epoch": 10.0,
2144
+ "step": 1460,
2145
+ "total_flos": 8.702314001108828e+17,
2146
+ "train_loss": 0.9981180969982931,
2147
+ "train_runtime": 8741.0304,
2148
+ "train_samples_per_second": 8.01,
2149
+ "train_steps_per_second": 0.167
2150
  }
2151
  ],
2152
  "logging_steps": 5,
 
2166
  "attributes": {}
2167
  }
2168
  },
2169
+ "total_flos": 8.702314001108828e+17,
2170
  "train_batch_size": 8,
2171
  "trial_name": null,
2172
  "trial_params": null