chansung commited on
Commit
2770b63
1 Parent(s): 91769cd

Model save

Browse files
README.md CHANGED
@@ -2,13 +2,12 @@
2
  license: gemma
3
  library_name: peft
4
  tags:
5
- - alignment-handbook
6
  - trl
7
  - sft
8
  - generated_from_trainer
9
  base_model: google/gemma-7b
10
  datasets:
11
- - chansung/merged_ds_coding
12
  model-index:
13
  - name: coding_llamaduo_result3
14
  results: []
@@ -19,9 +18,9 @@ should probably proofread and complete it, then remove this comment. -->
19
 
20
  # coding_llamaduo_result3
21
 
22
- This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the chansung/merged_ds_coding dataset.
23
  It achieves the following results on the evaluation set:
24
- - Loss: 1.2481
25
 
26
  ## Model description
27
 
@@ -45,24 +44,29 @@ The following hyperparameters were used during training:
45
  - eval_batch_size: 2
46
  - seed: 42
47
  - distributed_type: multi-GPU
48
- - num_devices: 2
49
  - gradient_accumulation_steps: 2
50
- - total_train_batch_size: 8
51
- - total_eval_batch_size: 4
52
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
53
  - lr_scheduler_type: cosine
54
  - lr_scheduler_warmup_ratio: 0.1
55
- - num_epochs: 5
56
 
57
  ### Training results
58
 
59
  | Training Loss | Epoch | Step | Validation Loss |
60
  |:-------------:|:-----:|:----:|:---------------:|
61
- | 0.7425 | 1.0 | 164 | 1.1824 |
62
- | 0.6196 | 2.0 | 328 | 1.1435 |
63
- | 0.5124 | 3.0 | 492 | 1.1461 |
64
- | 0.4403 | 4.0 | 656 | 1.2068 |
65
- | 0.4072 | 5.0 | 820 | 1.2481 |
 
 
 
 
 
66
 
67
 
68
  ### Framework versions
 
2
  license: gemma
3
  library_name: peft
4
  tags:
 
5
  - trl
6
  - sft
7
  - generated_from_trainer
8
  base_model: google/gemma-7b
9
  datasets:
10
+ - generator
11
  model-index:
12
  - name: coding_llamaduo_result3
13
  results: []
 
18
 
19
  # coding_llamaduo_result3
20
 
21
+ This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 1.7502
24
 
25
  ## Model description
26
 
 
44
  - eval_batch_size: 2
45
  - seed: 42
46
  - distributed_type: multi-GPU
47
+ - num_devices: 4
48
  - gradient_accumulation_steps: 2
49
+ - total_train_batch_size: 16
50
+ - total_eval_batch_size: 8
51
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
52
  - lr_scheduler_type: cosine
53
  - lr_scheduler_warmup_ratio: 0.1
54
+ - num_epochs: 10
55
 
56
  ### Training results
57
 
58
  | Training Loss | Epoch | Step | Validation Loss |
59
  |:-------------:|:-----:|:----:|:---------------:|
60
+ | 0.987 | 1.0 | 82 | 1.2808 |
61
+ | 0.6859 | 2.0 | 164 | 1.1719 |
62
+ | 0.5836 | 3.0 | 246 | 1.1480 |
63
+ | 0.5178 | 4.0 | 328 | 1.1717 |
64
+ | 0.4668 | 5.0 | 410 | 1.2044 |
65
+ | 0.3955 | 6.0 | 492 | 1.3252 |
66
+ | 0.3233 | 7.0 | 574 | 1.4225 |
67
+ | 0.2669 | 8.0 | 656 | 1.6119 |
68
+ | 0.2591 | 9.0 | 738 | 1.7353 |
69
+ | 0.2367 | 10.0 | 820 | 1.7502 |
70
 
71
 
72
  ### Framework versions
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b81d28f781bdc22e28a6782d2c87b0b0bf979ff290d6491c839c408c0be7ba9a
3
  size 100060536
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34aea7a58143aa2cce083b76d541e9e03828d7a2593e5a15e290bc6ec5443756
3
  size 100060536
all_results.json CHANGED
@@ -1,14 +1,9 @@
1
  {
2
- "epoch": 5.0,
3
- "eval_loss": 1.248147964477539,
4
- "eval_runtime": 1.085,
5
- "eval_samples": 16,
6
- "eval_samples_per_second": 1.843,
7
- "eval_steps_per_second": 0.922,
8
- "total_flos": 6.28853392705323e+17,
9
- "train_loss": 1.6822980254161648,
10
- "train_runtime": 5646.8844,
11
  "train_samples": 19039,
12
- "train_samples_per_second": 1.159,
13
- "train_steps_per_second": 0.145
14
  }
 
1
  {
2
+ "epoch": 10.0,
3
+ "total_flos": 1.257706785410646e+18,
4
+ "train_loss": 1.5742560074096772,
5
+ "train_runtime": 5911.531,
 
 
 
 
 
6
  "train_samples": 19039,
7
+ "train_samples_per_second": 2.214,
8
+ "train_steps_per_second": 0.139
9
  }
runs/Apr21_23-51-41_060407ee4835/events.out.tfevents.1713758215.060407ee4835.3304.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3f05b4cbaf30af6825844438912db09adf5801a58e91822acd703e3180454e26
3
- size 41716
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2fcf23e3c754eb90edaa64da5c0d8ad4995495d6c98c2136469e9476515a067
3
+ size 43185
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 5.0,
3
- "total_flos": 6.28853392705323e+17,
4
- "train_loss": 1.6822980254161648,
5
- "train_runtime": 5646.8844,
6
  "train_samples": 19039,
7
- "train_samples_per_second": 1.159,
8
- "train_steps_per_second": 0.145
9
  }
 
1
  {
2
+ "epoch": 10.0,
3
+ "total_flos": 1.257706785410646e+18,
4
+ "train_loss": 1.5742560074096772,
5
+ "train_runtime": 5911.531,
6
  "train_samples": 19039,
7
+ "train_samples_per_second": 2.214,
8
+ "train_steps_per_second": 0.139
9
  }
trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 5.0,
5
  "eval_steps": 500,
6
  "global_step": 820,
7
  "is_hyper_param_search": false,
@@ -9,1216 +9,1256 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.006097560975609756,
13
- "grad_norm": 470.0,
14
  "learning_rate": 2.4390243902439027e-06,
15
- "loss": 50.3127,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.03048780487804878,
20
- "grad_norm": 362.0,
21
  "learning_rate": 1.2195121951219513e-05,
22
- "loss": 41.9975,
23
  "step": 5
24
  },
25
  {
26
- "epoch": 0.06097560975609756,
27
- "grad_norm": 199.0,
28
  "learning_rate": 2.4390243902439026e-05,
29
- "loss": 35.2694,
30
  "step": 10
31
  },
32
  {
33
- "epoch": 0.09146341463414634,
34
- "grad_norm": 27.875,
35
  "learning_rate": 3.6585365853658535e-05,
36
- "loss": 22.3723,
37
  "step": 15
38
  },
39
  {
40
- "epoch": 0.12195121951219512,
41
- "grad_norm": 21.375,
42
  "learning_rate": 4.878048780487805e-05,
43
- "loss": 18.6519,
44
  "step": 20
45
  },
46
  {
47
- "epoch": 0.1524390243902439,
48
- "grad_norm": 5.8125,
49
  "learning_rate": 6.097560975609756e-05,
50
- "loss": 16.0212,
51
  "step": 25
52
  },
53
  {
54
- "epoch": 0.18292682926829268,
55
- "grad_norm": 4.5625,
56
  "learning_rate": 7.317073170731707e-05,
57
- "loss": 15.6762,
58
  "step": 30
59
  },
60
  {
61
- "epoch": 0.21341463414634146,
62
- "grad_norm": 5.9375,
63
  "learning_rate": 8.53658536585366e-05,
64
- "loss": 13.4942,
65
  "step": 35
66
  },
67
  {
68
- "epoch": 0.24390243902439024,
69
- "grad_norm": 13.875,
70
  "learning_rate": 9.75609756097561e-05,
71
- "loss": 13.2206,
72
  "step": 40
73
  },
74
  {
75
- "epoch": 0.27439024390243905,
76
- "grad_norm": 40.0,
77
  "learning_rate": 0.00010975609756097563,
78
- "loss": 9.5294,
79
  "step": 45
80
  },
81
  {
82
- "epoch": 0.3048780487804878,
83
- "grad_norm": 8.0625,
84
  "learning_rate": 0.00012195121951219512,
85
- "loss": 2.8066,
86
  "step": 50
87
  },
88
  {
89
- "epoch": 0.3353658536585366,
90
- "grad_norm": 2.421875,
91
  "learning_rate": 0.00013414634146341464,
92
- "loss": 1.5733,
93
  "step": 55
94
  },
95
  {
96
- "epoch": 0.36585365853658536,
97
- "grad_norm": 2.0625,
98
  "learning_rate": 0.00014634146341463414,
99
- "loss": 1.3089,
100
  "step": 60
101
  },
102
  {
103
- "epoch": 0.39634146341463417,
104
- "grad_norm": 3.15625,
105
  "learning_rate": 0.00015853658536585366,
106
- "loss": 1.1767,
107
  "step": 65
108
  },
109
  {
110
- "epoch": 0.4268292682926829,
111
- "grad_norm": 7.0625,
112
  "learning_rate": 0.0001707317073170732,
113
- "loss": 1.1124,
114
  "step": 70
115
  },
116
  {
117
- "epoch": 0.4573170731707317,
118
- "grad_norm": 3.265625,
119
  "learning_rate": 0.0001829268292682927,
120
- "loss": 1.0577,
121
  "step": 75
122
  },
123
  {
124
- "epoch": 0.4878048780487805,
125
- "grad_norm": 6.59375,
126
  "learning_rate": 0.0001951219512195122,
127
- "loss": 0.9769,
128
  "step": 80
129
  },
130
  {
131
- "epoch": 0.5182926829268293,
132
- "grad_norm": 10.6875,
 
 
 
 
 
 
 
 
133
  "learning_rate": 0.00019999184556954776,
134
- "loss": 0.9411,
135
  "step": 85
136
  },
137
  {
138
- "epoch": 0.5487804878048781,
139
- "grad_norm": 1.171875,
140
  "learning_rate": 0.0001999420177550043,
141
- "loss": 0.939,
142
  "step": 90
143
  },
144
  {
145
- "epoch": 0.5792682926829268,
146
- "grad_norm": 2.25,
147
  "learning_rate": 0.00019984691491033906,
148
- "loss": 0.9085,
149
  "step": 95
150
  },
151
  {
152
- "epoch": 0.6097560975609756,
153
- "grad_norm": 0.97265625,
154
  "learning_rate": 0.00019970658011837404,
155
- "loss": 0.8808,
156
  "step": 100
157
  },
158
  {
159
- "epoch": 0.6402439024390244,
160
- "grad_norm": 2.046875,
161
  "learning_rate": 0.00019952107695258992,
162
- "loss": 0.8201,
163
  "step": 105
164
  },
165
  {
166
- "epoch": 0.6707317073170732,
167
- "grad_norm": 0.828125,
168
  "learning_rate": 0.00019929048944832638,
169
- "loss": 0.7852,
170
  "step": 110
171
  },
172
  {
173
- "epoch": 0.7012195121951219,
174
- "grad_norm": 2.3125,
175
  "learning_rate": 0.00019901492206471325,
176
- "loss": 0.7695,
177
  "step": 115
178
  },
179
  {
180
- "epoch": 0.7317073170731707,
181
- "grad_norm": 14.3125,
182
  "learning_rate": 0.00019869449963734893,
183
- "loss": 0.7996,
184
  "step": 120
185
  },
186
  {
187
- "epoch": 0.7621951219512195,
188
- "grad_norm": 1.0859375,
189
  "learning_rate": 0.00019832936732174834,
190
- "loss": 0.7611,
191
  "step": 125
192
  },
193
  {
194
- "epoch": 0.7926829268292683,
195
- "grad_norm": 29.0,
196
  "learning_rate": 0.00019791969052758562,
197
- "loss": 0.7835,
198
  "step": 130
199
  },
200
  {
201
- "epoch": 0.823170731707317,
202
- "grad_norm": 0.6015625,
203
  "learning_rate": 0.00019746565484376132,
204
- "loss": 0.759,
205
  "step": 135
206
  },
207
  {
208
- "epoch": 0.8536585365853658,
209
- "grad_norm": 0.57421875,
210
  "learning_rate": 0.00019696746595432828,
211
- "loss": 0.7598,
212
  "step": 140
213
  },
214
  {
215
- "epoch": 0.8841463414634146,
216
- "grad_norm": 0.9140625,
217
  "learning_rate": 0.0001964253495453141,
218
- "loss": 0.7279,
219
  "step": 145
220
  },
221
  {
222
- "epoch": 0.9146341463414634,
223
- "grad_norm": 1.484375,
224
  "learning_rate": 0.00019583955120248237,
225
- "loss": 0.7304,
226
  "step": 150
227
  },
228
  {
229
- "epoch": 0.9451219512195121,
230
- "grad_norm": 0.7890625,
231
  "learning_rate": 0.00019521033630007928,
232
- "loss": 0.7659,
233
  "step": 155
234
  },
235
  {
236
- "epoch": 0.975609756097561,
237
- "grad_norm": 3.03125,
238
  "learning_rate": 0.00019453798988061535,
239
- "loss": 0.7425,
240
  "step": 160
241
  },
242
  {
243
- "epoch": 1.0,
244
- "eval_loss": 1.182375431060791,
245
- "eval_runtime": 1.0783,
246
- "eval_samples_per_second": 1.855,
247
- "eval_steps_per_second": 0.927,
248
  "step": 164
249
  },
250
  {
251
- "epoch": 1.0060975609756098,
252
- "grad_norm": 0.8515625,
253
  "learning_rate": 0.00019382281652573785,
254
- "loss": 0.6928,
255
  "step": 165
256
  },
257
  {
258
- "epoch": 1.0365853658536586,
259
- "grad_norm": 0.9375,
260
  "learning_rate": 0.00019306514021825118,
261
- "loss": 0.6592,
262
  "step": 170
263
  },
264
  {
265
- "epoch": 1.0670731707317074,
266
- "grad_norm": 0.8125,
267
  "learning_rate": 0.00019226530419534833,
268
- "loss": 0.6442,
269
  "step": 175
270
  },
271
  {
272
- "epoch": 1.0975609756097562,
273
- "grad_norm": 0.72265625,
274
  "learning_rate": 0.00019142367079312021,
275
- "loss": 0.659,
276
  "step": 180
277
  },
278
  {
279
- "epoch": 1.1280487804878048,
280
- "grad_norm": 0.462890625,
281
  "learning_rate": 0.00019054062128241264,
282
- "loss": 0.6355,
283
  "step": 185
284
  },
285
  {
286
- "epoch": 1.1585365853658536,
287
- "grad_norm": 1.1015625,
288
  "learning_rate": 0.00018961655569610557,
289
- "loss": 0.6304,
290
  "step": 190
291
  },
292
  {
293
- "epoch": 1.1890243902439024,
294
- "grad_norm": 0.5625,
295
  "learning_rate": 0.0001886518926478932,
296
- "loss": 0.6414,
297
  "step": 195
298
  },
299
  {
300
- "epoch": 1.2195121951219512,
301
- "grad_norm": 0.5,
302
  "learning_rate": 0.00018764706914264635,
303
- "loss": 0.6429,
304
  "step": 200
305
  },
306
  {
307
- "epoch": 1.25,
308
- "grad_norm": 1.046875,
309
  "learning_rate": 0.00018660254037844388,
310
- "loss": 0.6442,
311
  "step": 205
312
  },
313
  {
314
- "epoch": 1.2804878048780488,
315
- "grad_norm": 0.73046875,
316
  "learning_rate": 0.00018551877954036162,
317
- "loss": 0.6394,
318
  "step": 210
319
  },
320
  {
321
- "epoch": 1.3109756097560976,
322
- "grad_norm": 0.87109375,
323
  "learning_rate": 0.00018439627758611385,
324
- "loss": 0.6256,
325
  "step": 215
326
  },
327
  {
328
- "epoch": 1.3414634146341464,
329
- "grad_norm": 0.890625,
330
  "learning_rate": 0.00018323554302364272,
331
- "loss": 0.6397,
332
  "step": 220
333
  },
334
  {
335
- "epoch": 1.3719512195121952,
336
- "grad_norm": 0.96875,
337
  "learning_rate": 0.00018203710168075788,
338
- "loss": 0.6138,
339
  "step": 225
340
  },
341
  {
342
- "epoch": 1.4024390243902438,
343
- "grad_norm": 1.0625,
344
  "learning_rate": 0.0001808014964669293,
345
- "loss": 0.6305,
346
  "step": 230
347
  },
348
  {
349
- "epoch": 1.4329268292682926,
350
- "grad_norm": 0.51953125,
351
  "learning_rate": 0.00017952928712734268,
352
- "loss": 0.6213,
353
  "step": 235
354
  },
355
  {
356
- "epoch": 1.4634146341463414,
357
- "grad_norm": 1.78125,
358
  "learning_rate": 0.00017822104998932713,
359
- "loss": 0.6233,
360
  "step": 240
361
  },
362
  {
363
- "epoch": 1.4939024390243902,
364
- "grad_norm": 0.7265625,
365
  "learning_rate": 0.00017687737770127185,
366
- "loss": 0.6489,
367
  "step": 245
368
  },
369
  {
370
- "epoch": 1.524390243902439,
371
- "grad_norm": 0.8984375,
 
 
 
 
 
 
 
 
372
  "learning_rate": 0.00017549887896414851,
373
- "loss": 0.6058,
374
  "step": 250
375
  },
376
  {
377
- "epoch": 1.5548780487804879,
378
- "grad_norm": 1.015625,
379
  "learning_rate": 0.0001740861782557618,
380
- "loss": 0.6164,
381
  "step": 255
382
  },
383
  {
384
- "epoch": 1.5853658536585367,
385
- "grad_norm": 0.73046875,
386
  "learning_rate": 0.0001726399155478529,
387
- "loss": 0.606,
388
  "step": 260
389
  },
390
  {
391
- "epoch": 1.6158536585365852,
392
- "grad_norm": 0.5703125,
393
  "learning_rate": 0.00017116074601618417,
394
- "loss": 0.6051,
395
  "step": 265
396
  },
397
  {
398
- "epoch": 1.6463414634146343,
399
- "grad_norm": 0.7109375,
400
  "learning_rate": 0.0001696493397437357,
401
- "loss": 0.6798,
402
  "step": 270
403
  },
404
  {
405
- "epoch": 1.6768292682926829,
406
- "grad_norm": 1.6953125,
407
  "learning_rate": 0.00016810638141714934,
408
- "loss": 0.6205,
409
  "step": 275
410
  },
411
  {
412
- "epoch": 1.7073170731707317,
413
- "grad_norm": 1.015625,
414
  "learning_rate": 0.00016653257001655652,
415
- "loss": 0.6627,
416
  "step": 280
417
  },
418
  {
419
- "epoch": 1.7378048780487805,
420
- "grad_norm": 0.84765625,
421
  "learning_rate": 0.0001649286184989315,
422
- "loss": 0.6152,
423
  "step": 285
424
  },
425
  {
426
- "epoch": 1.7682926829268293,
427
- "grad_norm": 1.21875,
428
  "learning_rate": 0.0001632952534751122,
429
- "loss": 0.5972,
430
  "step": 290
431
  },
432
  {
433
- "epoch": 1.798780487804878,
434
- "grad_norm": 0.4609375,
435
  "learning_rate": 0.00016163321488063637,
436
- "loss": 0.6053,
437
  "step": 295
438
  },
439
  {
440
- "epoch": 1.8292682926829267,
441
- "grad_norm": 0.423828125,
442
  "learning_rate": 0.00015994325564054122,
443
- "loss": 0.5965,
444
  "step": 300
445
  },
446
  {
447
- "epoch": 1.8597560975609757,
448
- "grad_norm": 0.9296875,
449
  "learning_rate": 0.00015822614132827837,
450
- "loss": 0.6023,
451
  "step": 305
452
  },
453
  {
454
- "epoch": 1.8902439024390243,
455
- "grad_norm": 0.671875,
456
  "learning_rate": 0.00015648264981889934,
457
- "loss": 0.5916,
458
  "step": 310
459
  },
460
  {
461
- "epoch": 1.9207317073170733,
462
- "grad_norm": 0.68359375,
463
  "learning_rate": 0.00015471357093666804,
464
- "loss": 0.6082,
465
  "step": 315
466
  },
467
  {
468
- "epoch": 1.951219512195122,
469
- "grad_norm": 0.578125,
470
  "learning_rate": 0.00015291970609726007,
471
- "loss": 0.6176,
472
  "step": 320
473
  },
474
  {
475
- "epoch": 1.9817073170731707,
476
- "grad_norm": 0.55859375,
477
  "learning_rate": 0.00015110186794471103,
478
- "loss": 0.6196,
479
  "step": 325
480
  },
481
  {
482
- "epoch": 2.0,
483
- "eval_loss": 1.1435465812683105,
484
- "eval_runtime": 1.0793,
485
- "eval_samples_per_second": 1.853,
486
- "eval_steps_per_second": 0.927,
487
  "step": 328
488
  },
489
  {
490
- "epoch": 2.0121951219512195,
491
- "grad_norm": 0.5703125,
492
  "learning_rate": 0.00014926087998327837,
493
- "loss": 0.5623,
494
  "step": 330
495
  },
496
  {
497
- "epoch": 2.042682926829268,
498
- "grad_norm": 0.82421875,
499
  "learning_rate": 0.00014739757620438307,
500
- "loss": 0.5231,
501
  "step": 335
502
  },
503
  {
504
- "epoch": 2.073170731707317,
505
- "grad_norm": 1.25,
506
  "learning_rate": 0.0001455128007088009,
507
- "loss": 0.5611,
508
  "step": 340
509
  },
510
  {
511
- "epoch": 2.1036585365853657,
512
- "grad_norm": 0.5,
513
  "learning_rate": 0.00014360740732427367,
514
- "loss": 0.5497,
515
  "step": 345
516
  },
517
  {
518
- "epoch": 2.1341463414634148,
519
- "grad_norm": 0.443359375,
520
  "learning_rate": 0.00014168225921871433,
521
- "loss": 0.5154,
522
  "step": 350
523
  },
524
  {
525
- "epoch": 2.1646341463414633,
526
- "grad_norm": 0.498046875,
527
  "learning_rate": 0.00013973822850918055,
528
- "loss": 0.5307,
529
  "step": 355
530
  },
531
  {
532
- "epoch": 2.1951219512195124,
533
- "grad_norm": 0.79296875,
534
  "learning_rate": 0.0001377761958667946,
535
- "loss": 0.5085,
536
  "step": 360
537
  },
538
  {
539
- "epoch": 2.225609756097561,
540
- "grad_norm": 0.8046875,
541
  "learning_rate": 0.00013579705011778766,
542
- "loss": 0.5339,
543
  "step": 365
544
  },
545
  {
546
- "epoch": 2.2560975609756095,
547
- "grad_norm": 0.7890625,
548
  "learning_rate": 0.00013380168784085027,
549
- "loss": 0.5263,
550
  "step": 370
551
  },
552
  {
553
- "epoch": 2.2865853658536586,
554
- "grad_norm": 0.5234375,
555
  "learning_rate": 0.00013179101296097035,
556
- "loss": 0.5106,
557
  "step": 375
558
  },
559
  {
560
- "epoch": 2.317073170731707,
561
- "grad_norm": 0.62890625,
562
  "learning_rate": 0.00012976593633994346,
563
- "loss": 0.5371,
564
  "step": 380
565
  },
566
  {
567
- "epoch": 2.347560975609756,
568
- "grad_norm": 0.4140625,
569
  "learning_rate": 0.0001277273753637408,
570
- "loss": 0.5188,
571
  "step": 385
572
  },
573
  {
574
- "epoch": 2.3780487804878048,
575
- "grad_norm": 0.66015625,
576
  "learning_rate": 0.00012567625352692127,
577
- "loss": 0.5309,
578
  "step": 390
579
  },
580
  {
581
- "epoch": 2.408536585365854,
582
- "grad_norm": 1.21875,
583
  "learning_rate": 0.0001236135000142765,
584
- "loss": 0.5024,
585
  "step": 395
586
  },
587
  {
588
- "epoch": 2.4390243902439024,
589
- "grad_norm": 0.58203125,
590
  "learning_rate": 0.00012154004927989815,
591
- "loss": 0.5291,
592
  "step": 400
593
  },
594
  {
595
- "epoch": 2.4695121951219514,
596
- "grad_norm": 0.478515625,
597
  "learning_rate": 0.00011945684062385803,
598
- "loss": 0.5261,
599
  "step": 405
600
  },
601
  {
602
- "epoch": 2.5,
603
- "grad_norm": 0.431640625,
604
  "learning_rate": 0.00011736481776669306,
605
- "loss": 0.5218,
606
  "step": 410
607
  },
608
  {
609
- "epoch": 2.5304878048780486,
610
- "grad_norm": 0.447265625,
 
 
 
 
 
 
 
 
611
  "learning_rate": 0.00011526492842188745,
612
- "loss": 0.5259,
613
  "step": 415
614
  },
615
  {
616
- "epoch": 2.5609756097560976,
617
- "grad_norm": 0.41796875,
618
  "learning_rate": 0.0001131581238665465,
619
- "loss": 0.5077,
620
  "step": 420
621
  },
622
  {
623
- "epoch": 2.591463414634146,
624
- "grad_norm": 0.42578125,
625
  "learning_rate": 0.00011104535851045539,
626
- "loss": 0.5356,
627
  "step": 425
628
  },
629
  {
630
- "epoch": 2.6219512195121952,
631
- "grad_norm": 0.62890625,
632
  "learning_rate": 0.00010892758946371944,
633
- "loss": 0.5158,
634
  "step": 430
635
  },
636
  {
637
- "epoch": 2.652439024390244,
638
- "grad_norm": 0.46484375,
639
  "learning_rate": 0.00010680577610318072,
640
- "loss": 0.5297,
641
  "step": 435
642
  },
643
  {
644
- "epoch": 2.682926829268293,
645
- "grad_norm": 0.4609375,
646
  "learning_rate": 0.00010468087963780789,
647
- "loss": 0.5015,
648
  "step": 440
649
  },
650
  {
651
- "epoch": 2.7134146341463414,
652
- "grad_norm": 0.447265625,
653
  "learning_rate": 0.00010255386267325602,
654
- "loss": 0.5169,
655
  "step": 445
656
  },
657
  {
658
- "epoch": 2.7439024390243905,
659
- "grad_norm": 0.6171875,
660
  "learning_rate": 0.00010042568877579388,
661
- "loss": 0.5034,
662
  "step": 450
663
  },
664
  {
665
- "epoch": 2.774390243902439,
666
- "grad_norm": 0.419921875,
667
  "learning_rate": 9.829732203579584e-05,
668
- "loss": 0.5113,
669
  "step": 455
670
  },
671
  {
672
- "epoch": 2.8048780487804876,
673
- "grad_norm": 0.396484375,
674
  "learning_rate": 9.616972663099647e-05,
675
- "loss": 0.5106,
676
  "step": 460
677
  },
678
  {
679
- "epoch": 2.8353658536585367,
680
- "grad_norm": 0.4765625,
681
  "learning_rate": 9.404386638970542e-05,
682
- "loss": 0.5135,
683
  "step": 465
684
  },
685
  {
686
- "epoch": 2.8658536585365852,
687
- "grad_norm": 0.390625,
688
  "learning_rate": 9.192070435418079e-05,
689
- "loss": 0.5222,
690
  "step": 470
691
  },
692
  {
693
- "epoch": 2.8963414634146343,
694
- "grad_norm": 0.55859375,
695
  "learning_rate": 8.980120234435849e-05,
696
- "loss": 0.5214,
697
  "step": 475
698
  },
699
  {
700
- "epoch": 2.926829268292683,
701
- "grad_norm": 0.6328125,
702
  "learning_rate": 8.768632052213531e-05,
703
- "loss": 0.5408,
704
  "step": 480
705
  },
706
  {
707
- "epoch": 2.9573170731707314,
708
- "grad_norm": 0.490234375,
709
  "learning_rate": 8.557701695640321e-05,
710
- "loss": 0.5012,
711
  "step": 485
712
  },
713
  {
714
- "epoch": 2.9878048780487805,
715
- "grad_norm": 0.41796875,
716
  "learning_rate": 8.347424718903151e-05,
717
- "loss": 0.5124,
718
  "step": 490
719
  },
720
  {
721
- "epoch": 3.0,
722
- "eval_loss": 1.1460928916931152,
723
- "eval_runtime": 1.0779,
724
- "eval_samples_per_second": 1.855,
725
- "eval_steps_per_second": 0.928,
726
  "step": 492
727
  },
728
  {
729
- "epoch": 3.018292682926829,
730
- "grad_norm": 0.71484375,
731
  "learning_rate": 8.13789638019942e-05,
732
- "loss": 0.4767,
733
  "step": 495
734
  },
735
  {
736
- "epoch": 3.048780487804878,
737
- "grad_norm": 0.53515625,
738
  "learning_rate": 7.929211598583794e-05,
739
- "loss": 0.4404,
740
  "step": 500
741
  },
742
  {
743
- "epoch": 3.0792682926829267,
744
- "grad_norm": 0.546875,
745
  "learning_rate": 7.721464910968627e-05,
746
- "loss": 0.447,
747
  "step": 505
748
  },
749
  {
750
- "epoch": 3.1097560975609757,
751
- "grad_norm": 0.40234375,
752
  "learning_rate": 7.514750429297528e-05,
753
- "loss": 0.4379,
754
  "step": 510
755
  },
756
  {
757
- "epoch": 3.1402439024390243,
758
- "grad_norm": 0.59765625,
759
  "learning_rate": 7.309161797911441e-05,
760
- "loss": 0.4287,
761
  "step": 515
762
  },
763
  {
764
- "epoch": 3.1707317073170733,
765
- "grad_norm": 0.4375,
766
  "learning_rate": 7.104792151126515e-05,
767
- "loss": 0.428,
768
  "step": 520
769
  },
770
  {
771
- "epoch": 3.201219512195122,
772
- "grad_norm": 0.4296875,
773
  "learning_rate": 6.901734071043071e-05,
774
- "loss": 0.4448,
775
  "step": 525
776
  },
777
  {
778
- "epoch": 3.231707317073171,
779
- "grad_norm": 0.4453125,
780
  "learning_rate": 6.700079545604708e-05,
781
- "loss": 0.4297,
782
  "step": 530
783
  },
784
  {
785
- "epoch": 3.2621951219512195,
786
- "grad_norm": 0.435546875,
787
  "learning_rate": 6.499919926926566e-05,
788
- "loss": 0.423,
789
  "step": 535
790
  },
791
  {
792
- "epoch": 3.292682926829268,
793
- "grad_norm": 0.44921875,
794
  "learning_rate": 6.301345889911637e-05,
795
- "loss": 0.4218,
796
  "step": 540
797
  },
798
  {
799
- "epoch": 3.323170731707317,
800
- "grad_norm": 0.56640625,
801
  "learning_rate": 6.104447391173858e-05,
802
- "loss": 0.4354,
803
  "step": 545
804
  },
805
  {
806
- "epoch": 3.3536585365853657,
807
- "grad_norm": 0.47265625,
808
  "learning_rate": 5.909313628286601e-05,
809
- "loss": 0.4233,
810
  "step": 550
811
  },
812
  {
813
- "epoch": 3.3841463414634148,
814
- "grad_norm": 0.42578125,
815
  "learning_rate": 5.716032999375006e-05,
816
- "loss": 0.4634,
817
  "step": 555
818
  },
819
  {
820
- "epoch": 3.4146341463414633,
821
- "grad_norm": 0.419921875,
822
  "learning_rate": 5.524693063070492e-05,
823
- "loss": 0.4268,
824
  "step": 560
825
  },
826
  {
827
- "epoch": 3.4451219512195124,
828
- "grad_norm": 0.494140625,
829
  "learning_rate": 5.335380498845559e-05,
830
- "loss": 0.4333,
831
  "step": 565
832
  },
833
  {
834
- "epoch": 3.475609756097561,
835
- "grad_norm": 0.46875,
836
  "learning_rate": 5.148181067746862e-05,
837
- "loss": 0.471,
838
  "step": 570
839
  },
840
  {
841
- "epoch": 3.5060975609756095,
842
- "grad_norm": 0.44921875,
 
 
 
 
 
 
 
 
843
  "learning_rate": 4.963179573544357e-05,
844
- "loss": 0.4116,
845
  "step": 575
846
  },
847
  {
848
- "epoch": 3.5365853658536586,
849
- "grad_norm": 0.458984375,
850
  "learning_rate": 4.7804598243140666e-05,
851
- "loss": 0.4301,
852
  "step": 580
853
  },
854
  {
855
- "epoch": 3.567073170731707,
856
- "grad_norm": 0.431640625,
857
  "learning_rate": 4.60010459447196e-05,
858
- "loss": 0.4359,
859
  "step": 585
860
  },
861
  {
862
- "epoch": 3.597560975609756,
863
- "grad_norm": 0.44921875,
864
  "learning_rate": 4.422195587276058e-05,
865
- "loss": 0.4322,
866
  "step": 590
867
  },
868
  {
869
- "epoch": 3.6280487804878048,
870
- "grad_norm": 0.52734375,
871
  "learning_rate": 4.2468133978137945e-05,
872
- "loss": 0.4283,
873
  "step": 595
874
  },
875
  {
876
- "epoch": 3.658536585365854,
877
- "grad_norm": 0.51171875,
878
  "learning_rate": 4.0740374764914136e-05,
879
- "loss": 0.4297,
880
  "step": 600
881
  },
882
  {
883
- "epoch": 3.6890243902439024,
884
- "grad_norm": 0.52734375,
885
  "learning_rate": 3.903946093041877e-05,
886
- "loss": 0.4386,
887
  "step": 605
888
  },
889
  {
890
- "epoch": 3.7195121951219514,
891
- "grad_norm": 0.4453125,
892
  "learning_rate": 3.736616301067694e-05,
893
- "loss": 0.4279,
894
  "step": 610
895
  },
896
  {
897
- "epoch": 3.75,
898
- "grad_norm": 0.53125,
899
  "learning_rate": 3.5721239031346066e-05,
900
- "loss": 0.454,
901
  "step": 615
902
  },
903
  {
904
- "epoch": 3.7804878048780486,
905
- "grad_norm": 0.447265625,
906
  "learning_rate": 3.410543416432069e-05,
907
- "loss": 0.4497,
908
  "step": 620
909
  },
910
  {
911
- "epoch": 3.8109756097560976,
912
- "grad_norm": 0.48828125,
913
  "learning_rate": 3.2519480390159806e-05,
914
- "loss": 0.4405,
915
  "step": 625
916
  },
917
  {
918
- "epoch": 3.841463414634146,
919
- "grad_norm": 0.55078125,
920
  "learning_rate": 3.096409616649023e-05,
921
- "loss": 0.4354,
922
  "step": 630
923
  },
924
  {
925
- "epoch": 3.8719512195121952,
926
- "grad_norm": 0.443359375,
927
  "learning_rate": 2.9439986102536043e-05,
928
- "loss": 0.4382,
929
  "step": 635
930
  },
931
  {
932
- "epoch": 3.902439024390244,
933
- "grad_norm": 0.416015625,
934
  "learning_rate": 2.794784063992131e-05,
935
- "loss": 0.4204,
936
  "step": 640
937
  },
938
  {
939
- "epoch": 3.932926829268293,
940
- "grad_norm": 0.515625,
941
  "learning_rate": 2.6488335739891178e-05,
942
- "loss": 0.4199,
943
  "step": 645
944
  },
945
  {
946
- "epoch": 3.9634146341463414,
947
- "grad_norm": 0.5234375,
948
  "learning_rate": 2.50621325770927e-05,
949
- "loss": 0.4275,
950
  "step": 650
951
  },
952
  {
953
- "epoch": 3.9939024390243905,
954
- "grad_norm": 0.4765625,
955
  "learning_rate": 2.366987724005404e-05,
956
- "loss": 0.4403,
957
  "step": 655
958
  },
959
  {
960
- "epoch": 4.0,
961
- "eval_loss": 1.206821322441101,
962
- "eval_runtime": 1.0795,
963
- "eval_samples_per_second": 1.853,
964
- "eval_steps_per_second": 0.926,
965
  "step": 656
966
  },
967
  {
968
- "epoch": 4.024390243902439,
969
- "grad_norm": 0.408203125,
970
  "learning_rate": 2.2312200438498043e-05,
971
- "loss": 0.3802,
972
  "step": 660
973
  },
974
  {
975
- "epoch": 4.054878048780488,
976
- "grad_norm": 0.455078125,
977
  "learning_rate": 2.0989717217622652e-05,
978
- "loss": 0.3864,
979
  "step": 665
980
  },
981
  {
982
- "epoch": 4.085365853658536,
983
- "grad_norm": 0.443359375,
984
  "learning_rate": 1.9703026679477256e-05,
985
- "loss": 0.3842,
986
  "step": 670
987
  },
988
  {
989
- "epoch": 4.115853658536586,
990
- "grad_norm": 0.5546875,
991
  "learning_rate": 1.8452711711561842e-05,
992
- "loss": 0.3832,
993
  "step": 675
994
  },
995
  {
996
- "epoch": 4.146341463414634,
997
- "grad_norm": 0.447265625,
998
  "learning_rate": 1.7239338722771327e-05,
999
- "loss": 0.3686,
1000
  "step": 680
1001
  },
1002
  {
1003
- "epoch": 4.176829268292683,
1004
- "grad_norm": 0.466796875,
1005
  "learning_rate": 1.6063457386805004e-05,
1006
- "loss": 0.3955,
1007
  "step": 685
1008
  },
1009
  {
1010
- "epoch": 4.2073170731707314,
1011
- "grad_norm": 0.51953125,
1012
  "learning_rate": 1.4925600393157324e-05,
1013
- "loss": 0.3847,
1014
  "step": 690
1015
  },
1016
  {
1017
- "epoch": 4.237804878048781,
1018
- "grad_norm": 0.435546875,
1019
  "learning_rate": 1.3826283205802427e-05,
1020
- "loss": 0.3829,
1021
  "step": 695
1022
  },
1023
  {
1024
- "epoch": 4.2682926829268295,
1025
- "grad_norm": 0.47265625,
1026
  "learning_rate": 1.2766003829682505e-05,
1027
- "loss": 0.3862,
1028
  "step": 700
1029
  },
1030
  {
1031
- "epoch": 4.298780487804878,
1032
- "grad_norm": 0.4609375,
1033
  "learning_rate": 1.1745242585104955e-05,
1034
- "loss": 0.3965,
1035
  "step": 705
1036
  },
1037
  {
1038
- "epoch": 4.329268292682927,
1039
- "grad_norm": 0.494140625,
1040
  "learning_rate": 1.0764461890151112e-05,
1041
- "loss": 0.3807,
1042
  "step": 710
1043
  },
1044
  {
1045
- "epoch": 4.359756097560975,
1046
- "grad_norm": 0.466796875,
1047
  "learning_rate": 9.824106051194859e-06,
1048
- "loss": 0.3722,
1049
  "step": 715
1050
  },
1051
  {
1052
- "epoch": 4.390243902439025,
1053
- "grad_norm": 0.451171875,
1054
  "learning_rate": 8.924601061626048e-06,
1055
- "loss": 0.3865,
1056
  "step": 720
1057
  },
1058
  {
1059
- "epoch": 4.420731707317073,
1060
- "grad_norm": 0.455078125,
1061
  "learning_rate": 8.066354408870048e-06,
1062
- "loss": 0.3827,
1063
  "step": 725
1064
  },
1065
  {
1066
- "epoch": 4.451219512195122,
1067
- "grad_norm": 0.470703125,
1068
  "learning_rate": 7.249754889790539e-06,
1069
- "loss": 0.3932,
1070
  "step": 730
1071
  },
1072
  {
1073
- "epoch": 4.4817073170731705,
1074
- "grad_norm": 0.44140625,
1075
  "learning_rate": 6.475172434559573e-06,
1076
- "loss": 0.3762,
1077
  "step": 735
1078
  },
1079
  {
1080
- "epoch": 4.512195121951219,
1081
- "grad_norm": 0.482421875,
 
 
 
 
 
 
 
 
1082
  "learning_rate": 5.742957939074412e-06,
1083
- "loss": 0.3825,
1084
  "step": 740
1085
  },
1086
  {
1087
- "epoch": 4.5426829268292686,
1088
- "grad_norm": 0.435546875,
1089
  "learning_rate": 5.0534431059970685e-06,
1090
- "loss": 0.3846,
1091
  "step": 745
1092
  },
1093
  {
1094
- "epoch": 4.573170731707317,
1095
- "grad_norm": 0.48828125,
1096
  "learning_rate": 4.40694029448877e-06,
1097
- "loss": 0.3748,
1098
  "step": 750
1099
  },
1100
  {
1101
- "epoch": 4.603658536585366,
1102
- "grad_norm": 0.453125,
1103
  "learning_rate": 3.803742378707198e-06,
1104
- "loss": 0.4205,
1105
  "step": 755
1106
  },
1107
  {
1108
- "epoch": 4.634146341463414,
1109
- "grad_norm": 0.47265625,
1110
  "learning_rate": 3.2441226151306404e-06,
1111
- "loss": 0.3816,
1112
  "step": 760
1113
  },
1114
  {
1115
- "epoch": 4.664634146341464,
1116
- "grad_norm": 0.443359375,
1117
  "learning_rate": 2.7283345187693264e-06,
1118
- "loss": 0.3782,
1119
  "step": 765
1120
  },
1121
  {
1122
- "epoch": 4.695121951219512,
1123
- "grad_norm": 0.52734375,
1124
  "learning_rate": 2.256611748319792e-06,
1125
- "loss": 0.3842,
1126
  "step": 770
1127
  },
1128
  {
1129
- "epoch": 4.725609756097561,
1130
- "grad_norm": 0.453125,
1131
  "learning_rate": 1.8291680003145073e-06,
1132
- "loss": 0.4031,
1133
  "step": 775
1134
  },
1135
  {
1136
- "epoch": 4.7560975609756095,
1137
- "grad_norm": 0.4453125,
1138
  "learning_rate": 1.4461969123145457e-06,
1139
- "loss": 0.3891,
1140
  "step": 780
1141
  },
1142
  {
1143
- "epoch": 4.786585365853659,
1144
- "grad_norm": 0.443359375,
1145
  "learning_rate": 1.107871975189234e-06,
1146
- "loss": 0.365,
1147
  "step": 785
1148
  },
1149
  {
1150
- "epoch": 4.817073170731708,
1151
- "grad_norm": 0.54296875,
1152
  "learning_rate": 8.143464545226298e-07,
1153
- "loss": 0.3849,
1154
  "step": 790
1155
  },
1156
  {
1157
- "epoch": 4.847560975609756,
1158
- "grad_norm": 0.578125,
1159
  "learning_rate": 5.657533211820942e-07,
1160
- "loss": 0.3848,
1161
  "step": 795
1162
  },
1163
  {
1164
- "epoch": 4.878048780487805,
1165
- "grad_norm": 0.50390625,
1166
  "learning_rate": 3.622051910808666e-07,
1167
- "loss": 0.3739,
1168
  "step": 800
1169
  },
1170
  {
1171
- "epoch": 4.908536585365853,
1172
- "grad_norm": 0.47265625,
1173
  "learning_rate": 2.037942741615617e-07,
1174
- "loss": 0.392,
1175
  "step": 805
1176
  },
1177
  {
1178
- "epoch": 4.939024390243903,
1179
- "grad_norm": 0.46484375,
1180
  "learning_rate": 9.059233262386225e-08,
1181
- "loss": 0.391,
1182
  "step": 810
1183
  },
1184
  {
1185
- "epoch": 4.969512195121951,
1186
- "grad_norm": 0.419921875,
1187
  "learning_rate": 2.2650648415334376e-08,
1188
- "loss": 0.3862,
1189
  "step": 815
1190
  },
1191
  {
1192
- "epoch": 5.0,
1193
- "grad_norm": 0.486328125,
1194
  "learning_rate": 0.0,
1195
- "loss": 0.4072,
1196
  "step": 820
1197
  },
1198
  {
1199
- "epoch": 5.0,
1200
- "eval_loss": 1.248147964477539,
1201
- "eval_runtime": 1.0787,
1202
- "eval_samples_per_second": 1.854,
1203
- "eval_steps_per_second": 0.927,
1204
  "step": 820
1205
  },
1206
  {
1207
- "epoch": 5.0,
1208
  "step": 820,
1209
- "total_flos": 6.28853392705323e+17,
1210
- "train_loss": 1.6822980254161648,
1211
- "train_runtime": 5646.8844,
1212
- "train_samples_per_second": 1.159,
1213
- "train_steps_per_second": 0.145
1214
  }
1215
  ],
1216
  "logging_steps": 5,
1217
  "max_steps": 820,
1218
  "num_input_tokens_seen": 0,
1219
- "num_train_epochs": 5,
1220
  "save_steps": 100,
1221
- "total_flos": 6.28853392705323e+17,
1222
  "train_batch_size": 2,
1223
  "trial_name": null,
1224
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 10.0,
5
  "eval_steps": 500,
6
  "global_step": 820,
7
  "is_hyper_param_search": false,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.012195121951219513,
13
+ "grad_norm": 460.0,
14
  "learning_rate": 2.4390243902439027e-06,
15
+ "loss": 48.1093,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.06097560975609756,
20
+ "grad_norm": 360.0,
21
  "learning_rate": 1.2195121951219513e-05,
22
+ "loss": 42.5766,
23
  "step": 5
24
  },
25
  {
26
+ "epoch": 0.12195121951219512,
27
+ "grad_norm": 187.0,
28
  "learning_rate": 2.4390243902439026e-05,
29
+ "loss": 35.1457,
30
  "step": 10
31
  },
32
  {
33
+ "epoch": 0.18292682926829268,
34
+ "grad_norm": 31.375,
35
  "learning_rate": 3.6585365853658535e-05,
36
+ "loss": 21.8793,
37
  "step": 15
38
  },
39
  {
40
+ "epoch": 0.24390243902439024,
41
+ "grad_norm": 17.375,
42
  "learning_rate": 4.878048780487805e-05,
43
+ "loss": 17.5136,
44
  "step": 20
45
  },
46
  {
47
+ "epoch": 0.3048780487804878,
48
+ "grad_norm": 7.59375,
49
  "learning_rate": 6.097560975609756e-05,
50
+ "loss": 16.4183,
51
  "step": 25
52
  },
53
  {
54
+ "epoch": 0.36585365853658536,
55
+ "grad_norm": 4.09375,
56
  "learning_rate": 7.317073170731707e-05,
57
+ "loss": 15.0038,
58
  "step": 30
59
  },
60
  {
61
+ "epoch": 0.4268292682926829,
62
+ "grad_norm": 8.25,
63
  "learning_rate": 8.53658536585366e-05,
64
+ "loss": 14.2277,
65
  "step": 35
66
  },
67
  {
68
+ "epoch": 0.4878048780487805,
69
+ "grad_norm": 16.5,
70
  "learning_rate": 9.75609756097561e-05,
71
+ "loss": 13.0434,
72
  "step": 40
73
  },
74
  {
75
+ "epoch": 0.5487804878048781,
76
+ "grad_norm": 35.25,
77
  "learning_rate": 0.00010975609756097563,
78
+ "loss": 8.9174,
79
  "step": 45
80
  },
81
  {
82
+ "epoch": 0.6097560975609756,
83
+ "grad_norm": 11.125,
84
  "learning_rate": 0.00012195121951219512,
85
+ "loss": 2.4207,
86
  "step": 50
87
  },
88
  {
89
+ "epoch": 0.6707317073170732,
90
+ "grad_norm": 1.5625,
91
  "learning_rate": 0.00013414634146341464,
92
+ "loss": 1.5637,
93
  "step": 55
94
  },
95
  {
96
+ "epoch": 0.7317073170731707,
97
+ "grad_norm": 1.9921875,
98
  "learning_rate": 0.00014634146341463414,
99
+ "loss": 1.3611,
100
  "step": 60
101
  },
102
  {
103
+ "epoch": 0.7926829268292683,
104
+ "grad_norm": 1.5,
105
  "learning_rate": 0.00015853658536585366,
106
+ "loss": 1.1715,
107
  "step": 65
108
  },
109
  {
110
+ "epoch": 0.8536585365853658,
111
+ "grad_norm": 2.15625,
112
  "learning_rate": 0.0001707317073170732,
113
+ "loss": 1.0679,
114
  "step": 70
115
  },
116
  {
117
+ "epoch": 0.9146341463414634,
118
+ "grad_norm": 1.15625,
119
  "learning_rate": 0.0001829268292682927,
120
+ "loss": 0.9835,
121
  "step": 75
122
  },
123
  {
124
+ "epoch": 0.975609756097561,
125
+ "grad_norm": 2.078125,
126
  "learning_rate": 0.0001951219512195122,
127
+ "loss": 0.987,
128
  "step": 80
129
  },
130
  {
131
+ "epoch": 1.0,
132
+ "eval_loss": 1.2807745933532715,
133
+ "eval_runtime": 1.1183,
134
+ "eval_samples_per_second": 1.788,
135
+ "eval_steps_per_second": 0.894,
136
+ "step": 82
137
+ },
138
+ {
139
+ "epoch": 1.0365853658536586,
140
+ "grad_norm": 1.421875,
141
  "learning_rate": 0.00019999184556954776,
142
+ "loss": 0.887,
143
  "step": 85
144
  },
145
  {
146
+ "epoch": 1.0975609756097562,
147
+ "grad_norm": 1.6328125,
148
  "learning_rate": 0.0001999420177550043,
149
+ "loss": 0.849,
150
  "step": 90
151
  },
152
  {
153
+ "epoch": 1.1585365853658536,
154
+ "grad_norm": 4.1875,
155
  "learning_rate": 0.00019984691491033906,
156
+ "loss": 0.8057,
157
  "step": 95
158
  },
159
  {
160
+ "epoch": 1.2195121951219512,
161
+ "grad_norm": 3.34375,
162
  "learning_rate": 0.00019970658011837404,
163
+ "loss": 0.8035,
164
  "step": 100
165
  },
166
  {
167
+ "epoch": 1.2804878048780488,
168
+ "grad_norm": 1.4375,
169
  "learning_rate": 0.00019952107695258992,
170
+ "loss": 0.7717,
171
  "step": 105
172
  },
173
  {
174
+ "epoch": 1.3414634146341464,
175
+ "grad_norm": 1.3046875,
176
  "learning_rate": 0.00019929048944832638,
177
+ "loss": 0.7583,
178
  "step": 110
179
  },
180
  {
181
+ "epoch": 1.4024390243902438,
182
+ "grad_norm": 2.265625,
183
  "learning_rate": 0.00019901492206471325,
184
+ "loss": 0.7363,
185
  "step": 115
186
  },
187
  {
188
+ "epoch": 1.4634146341463414,
189
+ "grad_norm": 0.5546875,
190
  "learning_rate": 0.00019869449963734893,
191
+ "loss": 0.7397,
192
  "step": 120
193
  },
194
  {
195
+ "epoch": 1.524390243902439,
196
+ "grad_norm": 1.5859375,
197
  "learning_rate": 0.00019832936732174834,
198
+ "loss": 0.7268,
199
  "step": 125
200
  },
201
  {
202
+ "epoch": 1.5853658536585367,
203
+ "grad_norm": 2.796875,
204
  "learning_rate": 0.00019791969052758562,
205
+ "loss": 0.7103,
206
  "step": 130
207
  },
208
  {
209
+ "epoch": 1.6463414634146343,
210
+ "grad_norm": 1.6640625,
211
  "learning_rate": 0.00019746565484376132,
212
+ "loss": 0.7166,
213
  "step": 135
214
  },
215
  {
216
+ "epoch": 1.7073170731707317,
217
+ "grad_norm": 1.4296875,
218
  "learning_rate": 0.00019696746595432828,
219
+ "loss": 0.7174,
220
  "step": 140
221
  },
222
  {
223
+ "epoch": 1.7682926829268293,
224
+ "grad_norm": 2.296875,
225
  "learning_rate": 0.0001964253495453141,
226
+ "loss": 0.6822,
227
  "step": 145
228
  },
229
  {
230
+ "epoch": 1.8292682926829267,
231
+ "grad_norm": 1.8515625,
232
  "learning_rate": 0.00019583955120248237,
233
+ "loss": 0.679,
234
  "step": 150
235
  },
236
  {
237
+ "epoch": 1.8902439024390243,
238
+ "grad_norm": 0.42578125,
239
  "learning_rate": 0.00019521033630007928,
240
+ "loss": 0.6708,
241
  "step": 155
242
  },
243
  {
244
+ "epoch": 1.951219512195122,
245
+ "grad_norm": 0.490234375,
246
  "learning_rate": 0.00019453798988061535,
247
+ "loss": 0.6859,
248
  "step": 160
249
  },
250
  {
251
+ "epoch": 2.0,
252
+ "eval_loss": 1.1719168424606323,
253
+ "eval_runtime": 1.1192,
254
+ "eval_samples_per_second": 1.787,
255
+ "eval_steps_per_second": 0.893,
256
  "step": 164
257
  },
258
  {
259
+ "epoch": 2.0121951219512195,
260
+ "grad_norm": 0.5703125,
261
  "learning_rate": 0.00019382281652573785,
262
+ "loss": 0.6606,
263
  "step": 165
264
  },
265
  {
266
+ "epoch": 2.073170731707317,
267
+ "grad_norm": 3.296875,
268
  "learning_rate": 0.00019306514021825118,
269
+ "loss": 0.6528,
270
  "step": 170
271
  },
272
  {
273
+ "epoch": 2.1341463414634148,
274
+ "grad_norm": 2.609375,
275
  "learning_rate": 0.00019226530419534833,
276
+ "loss": 0.6424,
277
  "step": 175
278
  },
279
  {
280
+ "epoch": 2.1951219512195124,
281
+ "grad_norm": 1.5703125,
282
  "learning_rate": 0.00019142367079312021,
283
+ "loss": 0.6061,
284
  "step": 180
285
  },
286
  {
287
+ "epoch": 2.2560975609756095,
288
+ "grad_norm": 0.5234375,
289
  "learning_rate": 0.00019054062128241264,
290
+ "loss": 0.6186,
291
  "step": 185
292
  },
293
  {
294
+ "epoch": 2.317073170731707,
295
+ "grad_norm": 1.2890625,
296
  "learning_rate": 0.00018961655569610557,
297
+ "loss": 0.6099,
298
  "step": 190
299
  },
300
  {
301
+ "epoch": 2.3780487804878048,
302
+ "grad_norm": 0.89453125,
303
  "learning_rate": 0.0001886518926478932,
304
+ "loss": 0.6184,
305
  "step": 195
306
  },
307
  {
308
+ "epoch": 2.4390243902439024,
309
+ "grad_norm": 0.6796875,
310
  "learning_rate": 0.00018764706914264635,
311
+ "loss": 0.5949,
312
  "step": 200
313
  },
314
  {
315
+ "epoch": 2.5,
316
+ "grad_norm": 0.8046875,
317
  "learning_rate": 0.00018660254037844388,
318
+ "loss": 0.5993,
319
  "step": 205
320
  },
321
  {
322
+ "epoch": 2.5609756097560976,
323
+ "grad_norm": 1.3125,
324
  "learning_rate": 0.00018551877954036162,
325
+ "loss": 0.6041,
326
  "step": 210
327
  },
328
  {
329
+ "epoch": 2.6219512195121952,
330
+ "grad_norm": 0.9921875,
331
  "learning_rate": 0.00018439627758611385,
332
+ "loss": 0.6005,
333
  "step": 215
334
  },
335
  {
336
+ "epoch": 2.682926829268293,
337
+ "grad_norm": 1.0,
338
  "learning_rate": 0.00018323554302364272,
339
+ "loss": 0.5917,
340
  "step": 220
341
  },
342
  {
343
+ "epoch": 2.7439024390243905,
344
+ "grad_norm": 1.8515625,
345
  "learning_rate": 0.00018203710168075788,
346
+ "loss": 0.5894,
347
  "step": 225
348
  },
349
  {
350
+ "epoch": 2.8048780487804876,
351
+ "grad_norm": 1.15625,
352
  "learning_rate": 0.0001808014964669293,
353
+ "loss": 0.5884,
354
  "step": 230
355
  },
356
  {
357
+ "epoch": 2.8658536585365852,
358
+ "grad_norm": 0.83984375,
359
  "learning_rate": 0.00017952928712734268,
360
+ "loss": 0.5967,
361
  "step": 235
362
  },
363
  {
364
+ "epoch": 2.926829268292683,
365
+ "grad_norm": 0.3359375,
366
  "learning_rate": 0.00017822104998932713,
367
+ "loss": 0.6064,
368
  "step": 240
369
  },
370
  {
371
+ "epoch": 2.9878048780487805,
372
+ "grad_norm": 0.51171875,
373
  "learning_rate": 0.00017687737770127185,
374
+ "loss": 0.5836,
375
  "step": 245
376
  },
377
  {
378
+ "epoch": 3.0,
379
+ "eval_loss": 1.1479520797729492,
380
+ "eval_runtime": 1.1346,
381
+ "eval_samples_per_second": 1.763,
382
+ "eval_steps_per_second": 0.881,
383
+ "step": 246
384
+ },
385
+ {
386
+ "epoch": 3.048780487804878,
387
+ "grad_norm": 1.078125,
388
  "learning_rate": 0.00017549887896414851,
389
+ "loss": 0.5409,
390
  "step": 250
391
  },
392
  {
393
+ "epoch": 3.1097560975609757,
394
+ "grad_norm": 0.361328125,
395
  "learning_rate": 0.0001740861782557618,
396
+ "loss": 0.5316,
397
  "step": 255
398
  },
399
  {
400
+ "epoch": 3.1707317073170733,
401
+ "grad_norm": 0.59765625,
402
  "learning_rate": 0.0001726399155478529,
403
+ "loss": 0.5168,
404
  "step": 260
405
  },
406
  {
407
+ "epoch": 3.231707317073171,
408
+ "grad_norm": 0.40234375,
409
  "learning_rate": 0.00017116074601618417,
410
+ "loss": 0.5243,
411
  "step": 265
412
  },
413
  {
414
+ "epoch": 3.292682926829268,
415
+ "grad_norm": 0.7890625,
416
  "learning_rate": 0.0001696493397437357,
417
+ "loss": 0.5076,
418
  "step": 270
419
  },
420
  {
421
+ "epoch": 3.3536585365853657,
422
+ "grad_norm": 0.96875,
423
  "learning_rate": 0.00016810638141714934,
424
+ "loss": 0.5251,
425
  "step": 275
426
  },
427
  {
428
+ "epoch": 3.4146341463414633,
429
+ "grad_norm": 0.890625,
430
  "learning_rate": 0.00016653257001655652,
431
+ "loss": 0.5379,
432
  "step": 280
433
  },
434
  {
435
+ "epoch": 3.475609756097561,
436
+ "grad_norm": 0.7109375,
437
  "learning_rate": 0.0001649286184989315,
438
+ "loss": 0.5421,
439
  "step": 285
440
  },
441
  {
442
+ "epoch": 3.5365853658536586,
443
+ "grad_norm": 0.59765625,
444
  "learning_rate": 0.0001632952534751122,
445
+ "loss": 0.5148,
446
  "step": 290
447
  },
448
  {
449
+ "epoch": 3.597560975609756,
450
+ "grad_norm": 0.490234375,
451
  "learning_rate": 0.00016163321488063637,
452
+ "loss": 0.5222,
453
  "step": 295
454
  },
455
  {
456
+ "epoch": 3.658536585365854,
457
+ "grad_norm": 0.55078125,
458
  "learning_rate": 0.00015994325564054122,
459
+ "loss": 0.5229,
460
  "step": 300
461
  },
462
  {
463
+ "epoch": 3.7195121951219514,
464
+ "grad_norm": 0.515625,
465
  "learning_rate": 0.00015822614132827837,
466
+ "loss": 0.532,
467
  "step": 305
468
  },
469
  {
470
+ "epoch": 3.7804878048780486,
471
+ "grad_norm": 0.458984375,
472
  "learning_rate": 0.00015648264981889934,
473
+ "loss": 0.5401,
474
  "step": 310
475
  },
476
  {
477
+ "epoch": 3.841463414634146,
478
+ "grad_norm": 0.453125,
479
  "learning_rate": 0.00015471357093666804,
480
+ "loss": 0.5311,
481
  "step": 315
482
  },
483
  {
484
+ "epoch": 3.902439024390244,
485
+ "grad_norm": 0.396484375,
486
  "learning_rate": 0.00015291970609726007,
487
+ "loss": 0.525,
488
  "step": 320
489
  },
490
  {
491
+ "epoch": 3.9634146341463414,
492
+ "grad_norm": 0.359375,
493
  "learning_rate": 0.00015110186794471103,
494
+ "loss": 0.5178,
495
  "step": 325
496
  },
497
  {
498
+ "epoch": 4.0,
499
+ "eval_loss": 1.171655535697937,
500
+ "eval_runtime": 1.117,
501
+ "eval_samples_per_second": 1.791,
502
+ "eval_steps_per_second": 0.895,
503
  "step": 328
504
  },
505
  {
506
+ "epoch": 4.024390243902439,
507
+ "grad_norm": 0.8125,
508
  "learning_rate": 0.00014926087998327837,
509
+ "loss": 0.5073,
510
  "step": 330
511
  },
512
  {
513
+ "epoch": 4.085365853658536,
514
+ "grad_norm": 0.3671875,
515
  "learning_rate": 0.00014739757620438307,
516
+ "loss": 0.4417,
517
  "step": 335
518
  },
519
  {
520
+ "epoch": 4.146341463414634,
521
+ "grad_norm": 0.7265625,
522
  "learning_rate": 0.0001455128007088009,
523
+ "loss": 0.4321,
524
  "step": 340
525
  },
526
  {
527
+ "epoch": 4.2073170731707314,
528
+ "grad_norm": 0.4921875,
529
  "learning_rate": 0.00014360740732427367,
530
+ "loss": 0.4481,
531
  "step": 345
532
  },
533
  {
534
+ "epoch": 4.2682926829268295,
535
+ "grad_norm": 0.75390625,
536
  "learning_rate": 0.00014168225921871433,
537
+ "loss": 0.4489,
538
  "step": 350
539
  },
540
  {
541
+ "epoch": 4.329268292682927,
542
+ "grad_norm": 0.52734375,
543
  "learning_rate": 0.00013973822850918055,
544
+ "loss": 0.4517,
545
  "step": 355
546
  },
547
  {
548
+ "epoch": 4.390243902439025,
549
+ "grad_norm": 0.78125,
550
  "learning_rate": 0.0001377761958667946,
551
+ "loss": 0.4441,
552
  "step": 360
553
  },
554
  {
555
+ "epoch": 4.451219512195122,
556
+ "grad_norm": 0.59375,
557
  "learning_rate": 0.00013579705011778766,
558
+ "loss": 0.4539,
559
  "step": 365
560
  },
561
  {
562
+ "epoch": 4.512195121951219,
563
+ "grad_norm": 0.578125,
564
  "learning_rate": 0.00013380168784085027,
565
+ "loss": 0.4489,
566
  "step": 370
567
  },
568
  {
569
+ "epoch": 4.573170731707317,
570
+ "grad_norm": 0.70703125,
571
  "learning_rate": 0.00013179101296097035,
572
+ "loss": 0.4443,
573
  "step": 375
574
  },
575
  {
576
+ "epoch": 4.634146341463414,
577
+ "grad_norm": 0.6796875,
578
  "learning_rate": 0.00012976593633994346,
579
+ "loss": 0.4664,
580
  "step": 380
581
  },
582
  {
583
+ "epoch": 4.695121951219512,
584
+ "grad_norm": 0.43359375,
585
  "learning_rate": 0.0001277273753637408,
586
+ "loss": 0.4572,
587
  "step": 385
588
  },
589
  {
590
+ "epoch": 4.7560975609756095,
591
+ "grad_norm": 0.84765625,
592
  "learning_rate": 0.00012567625352692127,
593
+ "loss": 0.4675,
594
  "step": 390
595
  },
596
  {
597
+ "epoch": 4.817073170731708,
598
+ "grad_norm": 0.486328125,
599
  "learning_rate": 0.0001236135000142765,
600
+ "loss": 0.45,
601
  "step": 395
602
  },
603
  {
604
+ "epoch": 4.878048780487805,
605
+ "grad_norm": 0.4140625,
606
  "learning_rate": 0.00012154004927989815,
607
+ "loss": 0.4538,
608
  "step": 400
609
  },
610
  {
611
+ "epoch": 4.939024390243903,
612
+ "grad_norm": 0.3984375,
613
  "learning_rate": 0.00011945684062385803,
614
+ "loss": 0.4565,
615
  "step": 405
616
  },
617
  {
618
+ "epoch": 5.0,
619
+ "grad_norm": 0.4140625,
620
  "learning_rate": 0.00011736481776669306,
621
+ "loss": 0.4668,
622
  "step": 410
623
  },
624
  {
625
+ "epoch": 5.0,
626
+ "eval_loss": 1.2044252157211304,
627
+ "eval_runtime": 1.1171,
628
+ "eval_samples_per_second": 1.79,
629
+ "eval_steps_per_second": 0.895,
630
+ "step": 410
631
+ },
632
+ {
633
+ "epoch": 5.060975609756097,
634
+ "grad_norm": 0.63671875,
635
  "learning_rate": 0.00011526492842188745,
636
+ "loss": 0.3794,
637
  "step": 415
638
  },
639
  {
640
+ "epoch": 5.121951219512195,
641
+ "grad_norm": 0.83984375,
642
  "learning_rate": 0.0001131581238665465,
643
+ "loss": 0.376,
644
  "step": 420
645
  },
646
  {
647
+ "epoch": 5.182926829268292,
648
+ "grad_norm": 0.453125,
649
  "learning_rate": 0.00011104535851045539,
650
+ "loss": 0.3721,
651
  "step": 425
652
  },
653
  {
654
+ "epoch": 5.2439024390243905,
655
+ "grad_norm": 0.470703125,
656
  "learning_rate": 0.00010892758946371944,
657
+ "loss": 0.3812,
658
  "step": 430
659
  },
660
  {
661
+ "epoch": 5.304878048780488,
662
+ "grad_norm": 0.78125,
663
  "learning_rate": 0.00010680577610318072,
664
+ "loss": 0.3748,
665
  "step": 435
666
  },
667
  {
668
+ "epoch": 5.365853658536586,
669
+ "grad_norm": 0.7578125,
670
  "learning_rate": 0.00010468087963780789,
671
+ "loss": 0.3698,
672
  "step": 440
673
  },
674
  {
675
+ "epoch": 5.426829268292683,
676
+ "grad_norm": 0.48828125,
677
  "learning_rate": 0.00010255386267325602,
678
+ "loss": 0.3849,
679
  "step": 445
680
  },
681
  {
682
+ "epoch": 5.487804878048781,
683
+ "grad_norm": 0.45703125,
684
  "learning_rate": 0.00010042568877579388,
685
+ "loss": 0.3831,
686
  "step": 450
687
  },
688
  {
689
+ "epoch": 5.548780487804878,
690
+ "grad_norm": 0.470703125,
691
  "learning_rate": 9.829732203579584e-05,
692
+ "loss": 0.3765,
693
  "step": 455
694
  },
695
  {
696
+ "epoch": 5.609756097560975,
697
+ "grad_norm": 0.58984375,
698
  "learning_rate": 9.616972663099647e-05,
699
+ "loss": 0.4006,
700
  "step": 460
701
  },
702
  {
703
+ "epoch": 5.670731707317073,
704
+ "grad_norm": 0.4609375,
705
  "learning_rate": 9.404386638970542e-05,
706
+ "loss": 0.384,
707
  "step": 465
708
  },
709
  {
710
+ "epoch": 5.7317073170731705,
711
+ "grad_norm": 0.41796875,
712
  "learning_rate": 9.192070435418079e-05,
713
+ "loss": 0.3804,
714
  "step": 470
715
  },
716
  {
717
+ "epoch": 5.7926829268292686,
718
+ "grad_norm": 0.56640625,
719
  "learning_rate": 8.980120234435849e-05,
720
+ "loss": 0.39,
721
  "step": 475
722
  },
723
  {
724
+ "epoch": 5.853658536585366,
725
+ "grad_norm": 0.7265625,
726
  "learning_rate": 8.768632052213531e-05,
727
+ "loss": 0.3881,
728
  "step": 480
729
  },
730
  {
731
+ "epoch": 5.914634146341464,
732
+ "grad_norm": 0.44921875,
733
  "learning_rate": 8.557701695640321e-05,
734
+ "loss": 0.3908,
735
  "step": 485
736
  },
737
  {
738
+ "epoch": 5.975609756097561,
739
+ "grad_norm": 0.4140625,
740
  "learning_rate": 8.347424718903151e-05,
741
+ "loss": 0.3955,
742
  "step": 490
743
  },
744
  {
745
+ "epoch": 6.0,
746
+ "eval_loss": 1.325190782546997,
747
+ "eval_runtime": 1.1255,
748
+ "eval_samples_per_second": 1.777,
749
+ "eval_steps_per_second": 0.888,
750
  "step": 492
751
  },
752
  {
753
+ "epoch": 6.036585365853658,
754
+ "grad_norm": 0.3984375,
755
  "learning_rate": 8.13789638019942e-05,
756
+ "loss": 0.3366,
757
  "step": 495
758
  },
759
  {
760
+ "epoch": 6.097560975609756,
761
+ "grad_norm": 0.408203125,
762
  "learning_rate": 7.929211598583794e-05,
763
+ "loss": 0.3141,
764
  "step": 500
765
  },
766
  {
767
+ "epoch": 6.158536585365853,
768
+ "grad_norm": 0.54296875,
769
  "learning_rate": 7.721464910968627e-05,
770
+ "loss": 0.3234,
771
  "step": 505
772
  },
773
  {
774
+ "epoch": 6.219512195121951,
775
+ "grad_norm": 0.423828125,
776
  "learning_rate": 7.514750429297528e-05,
777
+ "loss": 0.3232,
778
  "step": 510
779
  },
780
  {
781
+ "epoch": 6.280487804878049,
782
+ "grad_norm": 0.4921875,
783
  "learning_rate": 7.309161797911441e-05,
784
+ "loss": 0.3198,
785
  "step": 515
786
  },
787
  {
788
+ "epoch": 6.341463414634147,
789
+ "grad_norm": 0.43359375,
790
  "learning_rate": 7.104792151126515e-05,
791
+ "loss": 0.3236,
792
  "step": 520
793
  },
794
  {
795
+ "epoch": 6.402439024390244,
796
+ "grad_norm": 0.447265625,
797
  "learning_rate": 6.901734071043071e-05,
798
+ "loss": 0.3126,
799
  "step": 525
800
  },
801
  {
802
+ "epoch": 6.463414634146342,
803
+ "grad_norm": 0.41015625,
804
  "learning_rate": 6.700079545604708e-05,
805
+ "loss": 0.3167,
806
  "step": 530
807
  },
808
  {
809
+ "epoch": 6.524390243902439,
810
+ "grad_norm": 0.52734375,
811
  "learning_rate": 6.499919926926566e-05,
812
+ "loss": 0.3412,
813
  "step": 535
814
  },
815
  {
816
+ "epoch": 6.585365853658536,
817
+ "grad_norm": 0.5703125,
818
  "learning_rate": 6.301345889911637e-05,
819
+ "loss": 0.3269,
820
  "step": 540
821
  },
822
  {
823
+ "epoch": 6.646341463414634,
824
+ "grad_norm": 0.423828125,
825
  "learning_rate": 6.104447391173858e-05,
826
+ "loss": 0.3309,
827
  "step": 545
828
  },
829
  {
830
+ "epoch": 6.7073170731707314,
831
+ "grad_norm": 0.482421875,
832
  "learning_rate": 5.909313628286601e-05,
833
+ "loss": 0.3172,
834
  "step": 550
835
  },
836
  {
837
+ "epoch": 6.7682926829268295,
838
+ "grad_norm": 0.53125,
839
  "learning_rate": 5.716032999375006e-05,
840
+ "loss": 0.326,
841
  "step": 555
842
  },
843
  {
844
+ "epoch": 6.829268292682927,
845
+ "grad_norm": 0.439453125,
846
  "learning_rate": 5.524693063070492e-05,
847
+ "loss": 0.3207,
848
  "step": 560
849
  },
850
  {
851
+ "epoch": 6.890243902439025,
852
+ "grad_norm": 0.55859375,
853
  "learning_rate": 5.335380498845559e-05,
854
+ "loss": 0.3295,
855
  "step": 565
856
  },
857
  {
858
+ "epoch": 6.951219512195122,
859
+ "grad_norm": 0.47265625,
860
  "learning_rate": 5.148181067746862e-05,
861
+ "loss": 0.3233,
862
  "step": 570
863
  },
864
  {
865
+ "epoch": 7.0,
866
+ "eval_loss": 1.4225410223007202,
867
+ "eval_runtime": 1.1188,
868
+ "eval_samples_per_second": 1.788,
869
+ "eval_steps_per_second": 0.894,
870
+ "step": 574
871
+ },
872
+ {
873
+ "epoch": 7.012195121951219,
874
+ "grad_norm": 0.421875,
875
  "learning_rate": 4.963179573544357e-05,
876
+ "loss": 0.3129,
877
  "step": 575
878
  },
879
  {
880
+ "epoch": 7.073170731707317,
881
+ "grad_norm": 0.57421875,
882
  "learning_rate": 4.7804598243140666e-05,
883
+ "loss": 0.2764,
884
  "step": 580
885
  },
886
  {
887
+ "epoch": 7.134146341463414,
888
+ "grad_norm": 0.5,
889
  "learning_rate": 4.60010459447196e-05,
890
+ "loss": 0.2792,
891
  "step": 585
892
  },
893
  {
894
+ "epoch": 7.195121951219512,
895
+ "grad_norm": 0.486328125,
896
  "learning_rate": 4.422195587276058e-05,
897
+ "loss": 0.2799,
898
  "step": 590
899
  },
900
  {
901
+ "epoch": 7.2560975609756095,
902
+ "grad_norm": 0.47265625,
903
  "learning_rate": 4.2468133978137945e-05,
904
+ "loss": 0.2759,
905
  "step": 595
906
  },
907
  {
908
+ "epoch": 7.317073170731708,
909
+ "grad_norm": 0.48046875,
910
  "learning_rate": 4.0740374764914136e-05,
911
+ "loss": 0.2697,
912
  "step": 600
913
  },
914
  {
915
+ "epoch": 7.378048780487805,
916
+ "grad_norm": 0.439453125,
917
  "learning_rate": 3.903946093041877e-05,
918
+ "loss": 0.2917,
919
  "step": 605
920
  },
921
  {
922
+ "epoch": 7.439024390243903,
923
+ "grad_norm": 0.47265625,
924
  "learning_rate": 3.736616301067694e-05,
925
+ "loss": 0.2748,
926
  "step": 610
927
  },
928
  {
929
+ "epoch": 7.5,
930
+ "grad_norm": 0.50390625,
931
  "learning_rate": 3.5721239031346066e-05,
932
+ "loss": 0.2771,
933
  "step": 615
934
  },
935
  {
936
+ "epoch": 7.560975609756097,
937
+ "grad_norm": 0.5,
938
  "learning_rate": 3.410543416432069e-05,
939
+ "loss": 0.2788,
940
  "step": 620
941
  },
942
  {
943
+ "epoch": 7.621951219512195,
944
+ "grad_norm": 0.44921875,
945
  "learning_rate": 3.2519480390159806e-05,
946
+ "loss": 0.2829,
947
  "step": 625
948
  },
949
  {
950
+ "epoch": 7.682926829268292,
951
+ "grad_norm": 0.494140625,
952
  "learning_rate": 3.096409616649023e-05,
953
+ "loss": 0.2708,
954
  "step": 630
955
  },
956
  {
957
+ "epoch": 7.7439024390243905,
958
+ "grad_norm": 0.46875,
959
  "learning_rate": 2.9439986102536043e-05,
960
+ "loss": 0.2945,
961
  "step": 635
962
  },
963
  {
964
+ "epoch": 7.804878048780488,
965
+ "grad_norm": 0.482421875,
966
  "learning_rate": 2.794784063992131e-05,
967
+ "loss": 0.2833,
968
  "step": 640
969
  },
970
  {
971
+ "epoch": 7.865853658536586,
972
+ "grad_norm": 0.4375,
973
  "learning_rate": 2.6488335739891178e-05,
974
+ "loss": 0.2767,
975
  "step": 645
976
  },
977
  {
978
+ "epoch": 7.926829268292683,
979
+ "grad_norm": 0.46875,
980
  "learning_rate": 2.50621325770927e-05,
981
+ "loss": 0.2725,
982
  "step": 650
983
  },
984
  {
985
+ "epoch": 7.987804878048781,
986
+ "grad_norm": 0.4453125,
987
  "learning_rate": 2.366987724005404e-05,
988
+ "loss": 0.2669,
989
  "step": 655
990
  },
991
  {
992
+ "epoch": 8.0,
993
+ "eval_loss": 1.611946702003479,
994
+ "eval_runtime": 1.1206,
995
+ "eval_samples_per_second": 1.785,
996
+ "eval_steps_per_second": 0.892,
997
  "step": 656
998
  },
999
  {
1000
+ "epoch": 8.048780487804878,
1001
+ "grad_norm": 0.435546875,
1002
  "learning_rate": 2.2312200438498043e-05,
1003
+ "loss": 0.2564,
1004
  "step": 660
1005
  },
1006
  {
1007
+ "epoch": 8.109756097560975,
1008
+ "grad_norm": 0.447265625,
1009
  "learning_rate": 2.0989717217622652e-05,
1010
+ "loss": 0.2598,
1011
  "step": 665
1012
  },
1013
  {
1014
+ "epoch": 8.170731707317072,
1015
+ "grad_norm": 0.4921875,
1016
  "learning_rate": 1.9703026679477256e-05,
1017
+ "loss": 0.2507,
1018
  "step": 670
1019
  },
1020
  {
1021
+ "epoch": 8.231707317073171,
1022
+ "grad_norm": 0.3984375,
1023
  "learning_rate": 1.8452711711561842e-05,
1024
+ "loss": 0.2433,
1025
  "step": 675
1026
  },
1027
  {
1028
+ "epoch": 8.292682926829269,
1029
+ "grad_norm": 0.408203125,
1030
  "learning_rate": 1.7239338722771327e-05,
1031
+ "loss": 0.251,
1032
  "step": 680
1033
  },
1034
  {
1035
+ "epoch": 8.353658536585366,
1036
+ "grad_norm": 0.40234375,
1037
  "learning_rate": 1.6063457386805004e-05,
1038
+ "loss": 0.252,
1039
  "step": 685
1040
  },
1041
  {
1042
+ "epoch": 8.414634146341463,
1043
+ "grad_norm": 0.404296875,
1044
  "learning_rate": 1.4925600393157324e-05,
1045
+ "loss": 0.2471,
1046
  "step": 690
1047
  },
1048
  {
1049
+ "epoch": 8.475609756097562,
1050
+ "grad_norm": 0.494140625,
1051
  "learning_rate": 1.3826283205802427e-05,
1052
+ "loss": 0.2526,
1053
  "step": 695
1054
  },
1055
  {
1056
+ "epoch": 8.536585365853659,
1057
+ "grad_norm": 0.423828125,
1058
  "learning_rate": 1.2766003829682505e-05,
1059
+ "loss": 0.2517,
1060
  "step": 700
1061
  },
1062
  {
1063
+ "epoch": 8.597560975609756,
1064
+ "grad_norm": 0.4375,
1065
  "learning_rate": 1.1745242585104955e-05,
1066
+ "loss": 0.2479,
1067
  "step": 705
1068
  },
1069
  {
1070
+ "epoch": 8.658536585365853,
1071
+ "grad_norm": 0.404296875,
1072
  "learning_rate": 1.0764461890151112e-05,
1073
+ "loss": 0.2485,
1074
  "step": 710
1075
  },
1076
  {
1077
+ "epoch": 8.71951219512195,
1078
+ "grad_norm": 0.44140625,
1079
  "learning_rate": 9.824106051194859e-06,
1080
+ "loss": 0.2559,
1081
  "step": 715
1082
  },
1083
  {
1084
+ "epoch": 8.78048780487805,
1085
+ "grad_norm": 0.40234375,
1086
  "learning_rate": 8.924601061626048e-06,
1087
+ "loss": 0.2588,
1088
  "step": 720
1089
  },
1090
  {
1091
+ "epoch": 8.841463414634147,
1092
+ "grad_norm": 0.40625,
1093
  "learning_rate": 8.066354408870048e-06,
1094
+ "loss": 0.2518,
1095
  "step": 725
1096
  },
1097
  {
1098
+ "epoch": 8.902439024390244,
1099
+ "grad_norm": 0.42578125,
1100
  "learning_rate": 7.249754889790539e-06,
1101
+ "loss": 0.2542,
1102
  "step": 730
1103
  },
1104
  {
1105
+ "epoch": 8.963414634146341,
1106
+ "grad_norm": 0.453125,
1107
  "learning_rate": 6.475172434559573e-06,
1108
+ "loss": 0.2591,
1109
  "step": 735
1110
  },
1111
  {
1112
+ "epoch": 9.0,
1113
+ "eval_loss": 1.73529052734375,
1114
+ "eval_runtime": 1.1218,
1115
+ "eval_samples_per_second": 1.783,
1116
+ "eval_steps_per_second": 0.891,
1117
+ "step": 738
1118
+ },
1119
+ {
1120
+ "epoch": 9.024390243902438,
1121
+ "grad_norm": 0.419921875,
1122
  "learning_rate": 5.742957939074412e-06,
1123
+ "loss": 0.2467,
1124
  "step": 740
1125
  },
1126
  {
1127
+ "epoch": 9.085365853658537,
1128
+ "grad_norm": 0.384765625,
1129
  "learning_rate": 5.0534431059970685e-06,
1130
+ "loss": 0.2449,
1131
  "step": 745
1132
  },
1133
  {
1134
+ "epoch": 9.146341463414634,
1135
+ "grad_norm": 0.38671875,
1136
  "learning_rate": 4.40694029448877e-06,
1137
+ "loss": 0.244,
1138
  "step": 750
1139
  },
1140
  {
1141
+ "epoch": 9.207317073170731,
1142
+ "grad_norm": 0.419921875,
1143
  "learning_rate": 3.803742378707198e-06,
1144
+ "loss": 0.2612,
1145
  "step": 755
1146
  },
1147
  {
1148
+ "epoch": 9.268292682926829,
1149
+ "grad_norm": 0.384765625,
1150
  "learning_rate": 3.2441226151306404e-06,
1151
+ "loss": 0.2497,
1152
  "step": 760
1153
  },
1154
  {
1155
+ "epoch": 9.329268292682928,
1156
+ "grad_norm": 0.41796875,
1157
  "learning_rate": 2.7283345187693264e-06,
1158
+ "loss": 0.2571,
1159
  "step": 765
1160
  },
1161
  {
1162
+ "epoch": 9.390243902439025,
1163
+ "grad_norm": 0.40234375,
1164
  "learning_rate": 2.256611748319792e-06,
1165
+ "loss": 0.2458,
1166
  "step": 770
1167
  },
1168
  {
1169
+ "epoch": 9.451219512195122,
1170
+ "grad_norm": 0.427734375,
1171
  "learning_rate": 1.8291680003145073e-06,
1172
+ "loss": 0.2585,
1173
  "step": 775
1174
  },
1175
  {
1176
+ "epoch": 9.512195121951219,
1177
+ "grad_norm": 0.423828125,
1178
  "learning_rate": 1.4461969123145457e-06,
1179
+ "loss": 0.2516,
1180
  "step": 780
1181
  },
1182
  {
1183
+ "epoch": 9.573170731707316,
1184
+ "grad_norm": 0.39453125,
1185
  "learning_rate": 1.107871975189234e-06,
1186
+ "loss": 0.2396,
1187
  "step": 785
1188
  },
1189
  {
1190
+ "epoch": 9.634146341463415,
1191
+ "grad_norm": 0.380859375,
1192
  "learning_rate": 8.143464545226298e-07,
1193
+ "loss": 0.2441,
1194
  "step": 790
1195
  },
1196
  {
1197
+ "epoch": 9.695121951219512,
1198
+ "grad_norm": 0.3984375,
1199
  "learning_rate": 5.657533211820942e-07,
1200
+ "loss": 0.2417,
1201
  "step": 795
1202
  },
1203
  {
1204
+ "epoch": 9.75609756097561,
1205
+ "grad_norm": 0.400390625,
1206
  "learning_rate": 3.622051910808666e-07,
1207
+ "loss": 0.2425,
1208
  "step": 800
1209
  },
1210
  {
1211
+ "epoch": 9.817073170731707,
1212
+ "grad_norm": 0.412109375,
1213
  "learning_rate": 2.037942741615617e-07,
1214
+ "loss": 0.2447,
1215
  "step": 805
1216
  },
1217
  {
1218
+ "epoch": 9.878048780487806,
1219
+ "grad_norm": 0.39453125,
1220
  "learning_rate": 9.059233262386225e-08,
1221
+ "loss": 0.2512,
1222
  "step": 810
1223
  },
1224
  {
1225
+ "epoch": 9.939024390243903,
1226
+ "grad_norm": 0.39453125,
1227
  "learning_rate": 2.2650648415334376e-08,
1228
+ "loss": 0.2365,
1229
  "step": 815
1230
  },
1231
  {
1232
+ "epoch": 10.0,
1233
+ "grad_norm": 0.3828125,
1234
  "learning_rate": 0.0,
1235
+ "loss": 0.2367,
1236
  "step": 820
1237
  },
1238
  {
1239
+ "epoch": 10.0,
1240
+ "eval_loss": 1.750213384628296,
1241
+ "eval_runtime": 1.1197,
1242
+ "eval_samples_per_second": 1.786,
1243
+ "eval_steps_per_second": 0.893,
1244
  "step": 820
1245
  },
1246
  {
1247
+ "epoch": 10.0,
1248
  "step": 820,
1249
+ "total_flos": 1.257706785410646e+18,
1250
+ "train_loss": 1.5742560074096772,
1251
+ "train_runtime": 5911.531,
1252
+ "train_samples_per_second": 2.214,
1253
+ "train_steps_per_second": 0.139
1254
  }
1255
  ],
1256
  "logging_steps": 5,
1257
  "max_steps": 820,
1258
  "num_input_tokens_seen": 0,
1259
+ "num_train_epochs": 10,
1260
  "save_steps": 100,
1261
+ "total_flos": 1.257706785410646e+18,
1262
  "train_batch_size": 2,
1263
  "trial_name": null,
1264
  "trial_params": null