yinmingzhang commited on
Commit
16d11a4
·
verified ·
1 Parent(s): 2252b8b

Model save

Browse files
README.md CHANGED
@@ -27,14 +27,14 @@ print(output["generated_text"])
27
 
28
  ## Training procedure
29
 
30
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/mzyin-university-of-florida/huggingface/runs/o18nogg0)
31
 
32
 
33
  This model was trained with SFT.
34
 
35
  ### Framework versions
36
 
37
- - TRL: 0.14.0
38
  - Transformers: 4.48.3
39
  - Pytorch: 2.2.2+cu121
40
  - Datasets: 3.2.0
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/mzyin-university-of-florida/huggingface/runs/vkhqux38)
31
 
32
 
33
  This model was trained with SFT.
34
 
35
  ### Framework versions
36
 
37
+ - TRL: 0.15.2
38
  - Transformers: 4.48.3
39
  - Pytorch: 2.2.2+cu121
40
  - Datasets: 3.2.0
adapter_config.json CHANGED
@@ -23,8 +23,8 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "v_proj",
27
- "q_proj"
28
  ],
29
  "task_type": "CAUSAL_LM",
30
  "use_dora": false,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
+ "q_proj",
27
+ "v_proj"
28
  ],
29
  "task_type": "CAUSAL_LM",
30
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:76d465a76c22fc53b903b8d9d2e1adabb5b72c7fe6f306a205847f0774c7c107
3
  size 545743328
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:794e0236da548e1f70ce43e7de7d10780ffe1eb7d297bcc86cdae95d5875da4f
3
  size 545743328
all_results.json CHANGED
@@ -5,10 +5,10 @@
5
  "eval_samples": 142,
6
  "eval_samples_per_second": 30.075,
7
  "eval_steps_per_second": 5.013,
8
- "total_flos": 72813143654400.0,
9
- "train_loss": 3.3806732257207233,
10
- "train_runtime": 123.5801,
11
  "train_samples": 568,
12
- "train_samples_per_second": 14.404,
13
- "train_steps_per_second": 1.942
14
  }
 
5
  "eval_samples": 142,
6
  "eval_samples_per_second": 30.075,
7
  "eval_steps_per_second": 5.013,
8
+ "total_flos": 184078172160000.0,
9
+ "train_loss": 3.2382628750801086,
10
+ "train_runtime": 305.3611,
11
  "train_samples": 568,
12
+ "train_samples_per_second": 14.737,
13
+ "train_steps_per_second": 1.965
14
  }
train_results.json CHANGED
@@ -1,9 +1,8 @@
1
  {
2
- "epoch": 20.0,
3
- "total_flos": 72813143654400.0,
4
- "train_loss": 3.3806732257207233,
5
- "train_runtime": 123.5801,
6
  "train_samples": 568,
7
- "train_samples_per_second": 14.404,
8
- "train_steps_per_second": 1.942
9
  }
 
1
  {
2
+ "total_flos": 184078172160000.0,
3
+ "train_loss": 3.2382628750801086,
4
+ "train_runtime": 305.3611,
 
5
  "train_samples": 568,
6
+ "train_samples_per_second": 14.737,
7
+ "train_steps_per_second": 1.965
8
  }
trainer_state.json CHANGED
@@ -1,530 +1,1445 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 20.0,
5
  "eval_steps": 500,
6
- "global_step": 240,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.08333333333333333,
13
- "grad_norm": 1.9178808308871647,
14
- "learning_rate": 8.333333333333333e-07,
15
- "loss": 3.4904,
 
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.4166666666666667,
20
- "grad_norm": 1.5508324121618542,
21
- "learning_rate": 4.166666666666667e-06,
22
- "loss": 3.613,
 
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.8333333333333334,
27
- "grad_norm": 3.970063835806634,
28
- "learning_rate": 8.333333333333334e-06,
29
- "loss": 3.5534,
 
30
  "step": 10
31
  },
32
  {
33
  "epoch": 1.0,
34
- "eval_loss": 3.635176420211792,
35
- "eval_runtime": 0.3923,
36
- "eval_samples_per_second": 45.887,
37
- "eval_steps_per_second": 7.648,
 
38
  "step": 12
39
  },
40
  {
41
  "epoch": 1.25,
42
- "grad_norm": 1.7466258721475718,
43
- "learning_rate": 1.25e-05,
44
- "loss": 3.4791,
 
45
  "step": 15
46
  },
47
  {
48
  "epoch": 1.6666666666666665,
49
- "grad_norm": 1.1618839918387402,
50
- "learning_rate": 1.6666666666666667e-05,
51
- "loss": 3.5451,
 
52
  "step": 20
53
  },
54
  {
55
  "epoch": 2.0,
56
- "eval_loss": 3.5881924629211426,
57
- "eval_runtime": 0.3977,
58
- "eval_samples_per_second": 45.263,
59
- "eval_steps_per_second": 7.544,
 
60
  "step": 24
61
  },
62
  {
63
  "epoch": 2.0833333333333335,
64
- "grad_norm": 1.0276735175803224,
65
- "learning_rate": 1.9998942319271076e-05,
66
- "loss": 3.4481,
 
67
  "step": 25
68
  },
69
  {
70
  "epoch": 2.5,
71
- "grad_norm": 1.0269510028877782,
72
- "learning_rate": 1.9961946980917457e-05,
73
- "loss": 3.5143,
 
74
  "step": 30
75
  },
76
  {
77
  "epoch": 2.9166666666666665,
78
- "grad_norm": 0.8284784581301078,
79
- "learning_rate": 1.9872291131173743e-05,
80
- "loss": 3.4974,
 
81
  "step": 35
82
  },
83
  {
84
  "epoch": 3.0,
85
- "eval_loss": 3.5418310165405273,
86
- "eval_runtime": 0.3965,
87
- "eval_samples_per_second": 45.397,
88
- "eval_steps_per_second": 7.566,
 
89
  "step": 36
90
  },
91
  {
92
  "epoch": 3.3333333333333335,
93
- "grad_norm": 0.8328561838416785,
94
- "learning_rate": 1.973044870579824e-05,
95
- "loss": 3.4338,
 
96
  "step": 40
97
  },
98
  {
99
  "epoch": 3.75,
100
- "grad_norm": 1.0109709585778517,
101
- "learning_rate": 1.953716950748227e-05,
102
- "loss": 3.4939,
 
103
  "step": 45
104
  },
105
  {
106
  "epoch": 4.0,
107
- "eval_loss": 3.5025124549865723,
108
- "eval_runtime": 0.3969,
109
- "eval_samples_per_second": 45.35,
110
- "eval_steps_per_second": 7.558,
 
111
  "step": 48
112
  },
113
  {
114
  "epoch": 4.166666666666667,
115
- "grad_norm": 0.7400417939368705,
116
- "learning_rate": 1.9293475242268224e-05,
117
- "loss": 3.3865,
 
118
  "step": 50
119
  },
120
  {
121
  "epoch": 4.583333333333333,
122
- "grad_norm": 0.7420166459361851,
123
- "learning_rate": 1.900065411864121e-05,
124
- "loss": 3.452,
 
125
  "step": 55
126
  },
127
  {
128
  "epoch": 5.0,
129
- "grad_norm": 1.864771582064087,
130
- "learning_rate": 1.866025403784439e-05,
131
- "loss": 3.3788,
 
132
  "step": 60
133
  },
134
  {
135
  "epoch": 5.0,
136
- "eval_loss": 3.4763269424438477,
137
- "eval_runtime": 0.3912,
138
- "eval_samples_per_second": 46.015,
139
- "eval_steps_per_second": 7.669,
 
140
  "step": 60
141
  },
142
  {
143
  "epoch": 5.416666666666667,
144
- "grad_norm": 0.7205181158730114,
145
- "learning_rate": 1.8274074411415104e-05,
146
- "loss": 3.3805,
 
147
  "step": 65
148
  },
149
  {
150
  "epoch": 5.833333333333333,
151
- "grad_norm": 0.5903690958172121,
152
- "learning_rate": 1.784415664919576e-05,
153
- "loss": 3.4453,
 
154
  "step": 70
155
  },
156
  {
157
  "epoch": 6.0,
158
- "eval_loss": 3.45723557472229,
159
- "eval_runtime": 0.4034,
160
- "eval_samples_per_second": 44.624,
161
- "eval_steps_per_second": 7.437,
 
162
  "step": 72
163
  },
164
  {
165
  "epoch": 6.25,
166
- "grad_norm": 0.5437262967095119,
167
- "learning_rate": 1.737277336810124e-05,
168
- "loss": 3.4045,
 
169
  "step": 75
170
  },
171
  {
172
  "epoch": 6.666666666666667,
173
- "grad_norm": 0.5235425065858881,
174
- "learning_rate": 1.686241637868734e-05,
175
- "loss": 3.3306,
 
176
  "step": 80
177
  },
178
  {
179
  "epoch": 7.0,
180
- "eval_loss": 3.441802501678467,
181
- "eval_runtime": 0.4108,
182
- "eval_samples_per_second": 43.819,
183
- "eval_steps_per_second": 7.303,
 
184
  "step": 84
185
  },
186
  {
187
  "epoch": 7.083333333333333,
188
- "grad_norm": 0.6513847176886255,
189
- "learning_rate": 1.6315783513024977e-05,
190
- "loss": 3.5057,
 
191
  "step": 85
192
  },
193
  {
194
  "epoch": 7.5,
195
- "grad_norm": 0.5155086538623761,
196
- "learning_rate": 1.573576436351046e-05,
197
- "loss": 3.378,
 
198
  "step": 90
199
  },
200
  {
201
  "epoch": 7.916666666666667,
202
- "grad_norm": 0.5143935752297035,
203
- "learning_rate": 1.5125425007998653e-05,
204
- "loss": 3.345,
 
205
  "step": 95
206
  },
207
  {
208
  "epoch": 8.0,
209
- "eval_loss": 3.428921937942505,
210
- "eval_runtime": 0.3894,
211
- "eval_samples_per_second": 46.225,
212
- "eval_steps_per_second": 7.704,
 
213
  "step": 96
214
  },
215
  {
216
  "epoch": 8.333333333333334,
217
- "grad_norm": 0.5652349397643913,
218
- "learning_rate": 1.4487991802004625e-05,
219
- "loss": 3.5027,
 
220
  "step": 100
221
  },
222
  {
223
  "epoch": 8.75,
224
- "grad_norm": 0.5647161437519873,
225
- "learning_rate": 1.3826834323650899e-05,
226
- "loss": 3.3973,
 
227
  "step": 105
228
  },
229
  {
230
  "epoch": 9.0,
231
- "eval_loss": 3.418201446533203,
232
- "eval_runtime": 0.4033,
233
- "eval_samples_per_second": 44.627,
234
- "eval_steps_per_second": 7.438,
 
235
  "step": 108
236
  },
237
  {
238
  "epoch": 9.166666666666666,
239
- "grad_norm": 0.5444861959273378,
240
- "learning_rate": 1.3145447561516138e-05,
241
- "loss": 3.2417,
 
242
  "step": 110
243
  },
244
  {
245
  "epoch": 9.583333333333334,
246
- "grad_norm": 0.5831760497922609,
247
- "learning_rate": 1.2447433439543239e-05,
248
- "loss": 3.369,
 
249
  "step": 115
250
  },
251
  {
252
  "epoch": 10.0,
253
- "grad_norm": 1.019730158221383,
254
- "learning_rate": 1.1736481776669307e-05,
255
- "loss": 3.2892,
 
256
  "step": 120
257
  },
258
  {
259
  "epoch": 10.0,
260
- "eval_loss": 3.4091298580169678,
261
- "eval_runtime": 0.4621,
262
- "eval_samples_per_second": 38.949,
263
- "eval_steps_per_second": 6.492,
 
264
  "step": 120
265
  },
266
  {
267
  "epoch": 10.416666666666666,
268
- "grad_norm": 0.5472889256976743,
269
- "learning_rate": 1.101635078182802e-05,
270
- "loss": 3.3538,
 
271
  "step": 125
272
  },
273
  {
274
  "epoch": 10.833333333333334,
275
- "grad_norm": 0.585130945106004,
276
- "learning_rate": 1.0290847187431115e-05,
277
- "loss": 3.3218,
 
278
  "step": 130
279
  },
280
  {
281
  "epoch": 11.0,
282
- "eval_loss": 3.401224374771118,
283
- "eval_runtime": 0.3959,
284
- "eval_samples_per_second": 45.464,
285
- "eval_steps_per_second": 7.577,
 
286
  "step": 132
287
  },
288
  {
289
  "epoch": 11.25,
290
- "grad_norm": 0.49009027634210744,
291
- "learning_rate": 9.563806126346643e-06,
292
- "loss": 3.3543,
 
293
  "step": 135
294
  },
295
  {
296
  "epoch": 11.666666666666666,
297
- "grad_norm": 0.5409056672592139,
298
- "learning_rate": 8.839070858747697e-06,
299
- "loss": 3.3185,
 
300
  "step": 140
301
  },
302
  {
303
  "epoch": 12.0,
304
- "eval_loss": 3.394521951675415,
305
- "eval_runtime": 0.4021,
306
- "eval_samples_per_second": 44.762,
307
- "eval_steps_per_second": 7.46,
 
308
  "step": 144
309
  },
310
  {
311
  "epoch": 12.083333333333334,
312
- "grad_norm": 0.5879385039357604,
313
- "learning_rate": 8.120472455998882e-06,
314
- "loss": 3.4697,
 
315
  "step": 145
316
  },
317
  {
318
  "epoch": 12.5,
319
- "grad_norm": 0.5109001687995364,
320
- "learning_rate": 7.411809548974792e-06,
321
- "loss": 3.293,
 
322
  "step": 150
323
  },
324
  {
325
  "epoch": 12.916666666666666,
326
- "grad_norm": 0.5648980578882977,
327
- "learning_rate": 6.716828247864391e-06,
328
- "loss": 3.3237,
 
329
  "step": 155
330
  },
331
  {
332
  "epoch": 13.0,
333
- "eval_loss": 3.3890793323516846,
334
- "eval_runtime": 0.3967,
335
- "eval_samples_per_second": 45.374,
336
- "eval_steps_per_second": 7.562,
 
337
  "step": 156
338
  },
339
  {
340
  "epoch": 13.333333333333334,
341
- "grad_norm": 0.5055559054562139,
342
- "learning_rate": 6.039202339608432e-06,
343
- "loss": 3.271,
 
344
  "step": 160
345
  },
346
  {
347
  "epoch": 13.75,
348
- "grad_norm": 0.5732954250338061,
349
- "learning_rate": 5.382513867649663e-06,
350
- "loss": 3.3964,
 
351
  "step": 165
352
  },
353
  {
354
  "epoch": 14.0,
355
- "eval_loss": 3.3846511840820312,
356
- "eval_runtime": 0.395,
357
- "eval_samples_per_second": 45.566,
358
- "eval_steps_per_second": 7.594,
 
359
  "step": 168
360
  },
361
  {
362
  "epoch": 14.166666666666666,
363
- "grad_norm": 0.5692530517966546,
364
- "learning_rate": 4.7502341966544e-06,
365
- "loss": 3.1924,
 
366
  "step": 170
367
  },
368
  {
369
  "epoch": 14.583333333333334,
370
- "grad_norm": 0.5478826916587504,
371
- "learning_rate": 4.1457056623005954e-06,
372
- "loss": 3.3224,
 
373
  "step": 175
374
  },
375
  {
376
  "epoch": 15.0,
377
- "grad_norm": 1.3475215612190545,
378
- "learning_rate": 3.5721239031346067e-06,
379
- "loss": 3.3293,
 
380
  "step": 180
381
  },
382
  {
383
  "epoch": 15.0,
384
- "eval_loss": 3.3815081119537354,
385
- "eval_runtime": 0.3939,
386
- "eval_samples_per_second": 45.698,
387
- "eval_steps_per_second": 7.616,
 
388
  "step": 180
389
  },
390
  {
391
  "epoch": 15.416666666666666,
392
- "grad_norm": 0.49662476648041026,
393
- "learning_rate": 3.032520967893453e-06,
394
- "loss": 3.321,
 
395
  "step": 185
396
  },
397
  {
398
  "epoch": 15.833333333333334,
399
- "grad_norm": 0.5516226107095812,
400
- "learning_rate": 2.529749287590042e-06,
401
- "loss": 3.2824,
 
402
  "step": 190
403
  },
404
  {
405
  "epoch": 16.0,
406
- "eval_loss": 3.3792712688446045,
407
- "eval_runtime": 0.3974,
408
- "eval_samples_per_second": 45.291,
409
- "eval_steps_per_second": 7.548,
 
410
  "step": 192
411
  },
412
  {
413
  "epoch": 16.25,
414
- "grad_norm": 0.5663384865119242,
415
- "learning_rate": 2.0664665970876496e-06,
416
- "loss": 3.3657,
 
417
  "step": 195
418
  },
419
  {
420
  "epoch": 16.666666666666668,
421
- "grad_norm": 0.6014416895030616,
422
- "learning_rate": 1.6451218858706374e-06,
423
- "loss": 3.3232,
 
424
  "step": 200
425
  },
426
  {
427
  "epoch": 17.0,
428
- "eval_loss": 3.377903461456299,
429
- "eval_runtime": 0.405,
430
- "eval_samples_per_second": 44.444,
431
- "eval_steps_per_second": 7.407,
 
432
  "step": 204
433
  },
434
  {
435
  "epoch": 17.083333333333332,
436
- "grad_norm": 0.4906231418660385,
437
- "learning_rate": 1.2679424522780426e-06,
438
- "loss": 3.3418,
 
439
  "step": 205
440
  },
441
  {
442
  "epoch": 17.5,
443
- "grad_norm": 0.5991421773344411,
444
- "learning_rate": 9.369221296335007e-07,
445
- "loss": 3.3385,
 
446
  "step": 210
447
  },
448
  {
449
  "epoch": 17.916666666666668,
450
- "grad_norm": 0.572738899557872,
451
- "learning_rate": 6.538107465101162e-07,
452
- "loss": 3.2725,
 
453
  "step": 215
454
  },
455
  {
456
  "epoch": 18.0,
457
- "eval_loss": 3.3772037029266357,
458
- "eval_runtime": 0.4012,
459
- "eval_samples_per_second": 44.861,
460
- "eval_steps_per_second": 7.477,
 
461
  "step": 216
462
  },
463
  {
464
  "epoch": 18.333333333333332,
465
- "grad_norm": 0.6238290992567466,
466
- "learning_rate": 4.2010487684511105e-07,
467
- "loss": 3.4302,
 
468
  "step": 220
469
  },
470
  {
471
  "epoch": 18.75,
472
- "grad_norm": 0.5767194960270463,
473
- "learning_rate": 2.370399288006664e-07,
474
- "loss": 3.2808,
 
475
  "step": 225
476
  },
477
  {
478
  "epoch": 19.0,
479
- "eval_loss": 3.3768930435180664,
480
- "eval_runtime": 0.3991,
481
- "eval_samples_per_second": 45.102,
482
- "eval_steps_per_second": 7.517,
 
483
  "step": 228
484
  },
485
  {
486
  "epoch": 19.166666666666668,
487
- "grad_norm": 0.5557398310185774,
488
- "learning_rate": 1.055836141905553e-07,
489
- "loss": 3.3581,
 
490
  "step": 230
491
  },
492
  {
493
  "epoch": 19.583333333333332,
494
- "grad_norm": 0.5389585071398526,
495
- "learning_rate": 2.643083299427751e-08,
496
- "loss": 3.3357,
 
497
  "step": 235
498
  },
499
  {
500
  "epoch": 20.0,
501
- "grad_norm": 1.3350882978965304,
502
- "learning_rate": 0.0,
503
- "loss": 3.3157,
 
504
  "step": 240
505
  },
506
  {
507
  "epoch": 20.0,
508
- "eval_loss": 3.3768398761749268,
509
- "eval_runtime": 0.3925,
510
- "eval_samples_per_second": 45.863,
511
- "eval_steps_per_second": 7.644,
 
512
  "step": 240
513
  },
514
  {
515
- "epoch": 20.0,
516
- "step": 240,
517
- "total_flos": 72813143654400.0,
518
- "train_loss": 3.3806732257207233,
519
- "train_runtime": 123.5801,
520
- "train_samples_per_second": 14.404,
521
- "train_steps_per_second": 1.942
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
522
  }
523
  ],
524
  "logging_steps": 5,
525
- "max_steps": 240,
526
  "num_input_tokens_seen": 0,
527
- "num_train_epochs": 20,
528
  "save_steps": 500,
529
  "stateful_callbacks": {
530
  "TrainerControl": {
@@ -538,7 +1453,7 @@
538
  "attributes": {}
539
  }
540
  },
541
- "total_flos": 72813143654400.0,
542
  "train_batch_size": 8,
543
  "trial_name": null,
544
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 50.0,
5
  "eval_steps": 500,
6
+ "global_step": 600,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.08333333333333333,
13
+ "grad_norm": 0.5385836071499915,
14
+ "learning_rate": 3.3333333333333335e-07,
15
+ "loss": 3.5061,
16
+ "mean_token_accuracy": 0.3923509418964386,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.4166666666666667,
21
+ "grad_norm": 0.5988682460559892,
22
+ "learning_rate": 1.6666666666666667e-06,
23
+ "loss": 3.444,
24
+ "mean_token_accuracy": 0.40139296650886536,
25
  "step": 5
26
  },
27
  {
28
  "epoch": 0.8333333333333334,
29
+ "grad_norm": 0.7225027190447023,
30
+ "learning_rate": 3.3333333333333333e-06,
31
+ "loss": 3.469,
32
+ "mean_token_accuracy": 0.40673128962516786,
33
  "step": 10
34
  },
35
  {
36
  "epoch": 1.0,
37
+ "eval_loss": 3.5662965774536133,
38
+ "eval_mean_token_accuracy": 0.4117915093898773,
39
+ "eval_runtime": 0.5523,
40
+ "eval_samples_per_second": 34.404,
41
+ "eval_steps_per_second": 5.432,
42
  "step": 12
43
  },
44
  {
45
  "epoch": 1.25,
46
+ "grad_norm": 0.5723484530658475,
47
+ "learning_rate": 5e-06,
48
+ "loss": 3.4415,
49
+ "mean_token_accuracy": 0.40831703941027325,
50
  "step": 15
51
  },
52
  {
53
  "epoch": 1.6666666666666665,
54
+ "grad_norm": 0.7162755459145884,
55
+ "learning_rate": 6.666666666666667e-06,
56
+ "loss": 3.4582,
57
+ "mean_token_accuracy": 0.4077712595462799,
58
  "step": 20
59
  },
60
  {
61
  "epoch": 2.0,
62
+ "eval_loss": 3.5607330799102783,
63
+ "eval_mean_token_accuracy": 0.4010107176644461,
64
+ "eval_runtime": 0.4408,
65
+ "eval_samples_per_second": 43.103,
66
+ "eval_steps_per_second": 6.806,
67
  "step": 24
68
  },
69
  {
70
  "epoch": 2.0833333333333335,
71
+ "grad_norm": 0.5761807107644672,
72
+ "learning_rate": 8.333333333333334e-06,
73
+ "loss": 3.4905,
74
+ "mean_token_accuracy": 0.40334799885749817,
75
  "step": 25
76
  },
77
  {
78
  "epoch": 2.5,
79
+ "grad_norm": 0.5868666256020767,
80
+ "learning_rate": 1e-05,
81
+ "loss": 3.3955,
82
+ "mean_token_accuracy": 0.4089615702629089,
83
  "step": 30
84
  },
85
  {
86
  "epoch": 2.9166666666666665,
87
+ "grad_norm": 0.7608176868125189,
88
+ "learning_rate": 1.1666666666666668e-05,
89
+ "loss": 3.4528,
90
+ "mean_token_accuracy": 0.40273705720901487,
91
  "step": 35
92
  },
93
  {
94
  "epoch": 3.0,
95
+ "eval_loss": 3.5521068572998047,
96
+ "eval_mean_token_accuracy": 0.41008396446704865,
97
+ "eval_runtime": 0.4318,
98
+ "eval_samples_per_second": 43.998,
99
+ "eval_steps_per_second": 6.947,
100
  "step": 36
101
  },
102
  {
103
  "epoch": 3.3333333333333335,
104
+ "grad_norm": 0.6096220864944857,
105
+ "learning_rate": 1.3333333333333333e-05,
106
+ "loss": 3.5249,
107
+ "mean_token_accuracy": 0.4006356969475746,
108
  "step": 40
109
  },
110
  {
111
  "epoch": 3.75,
112
+ "grad_norm": 0.5375302227210489,
113
+ "learning_rate": 1.5000000000000002e-05,
114
+ "loss": 3.4592,
115
+ "mean_token_accuracy": 0.40808895230293274,
116
  "step": 45
117
  },
118
  {
119
  "epoch": 4.0,
120
+ "eval_loss": 3.5394978523254395,
121
+ "eval_mean_token_accuracy": 0.41112928589185077,
122
+ "eval_runtime": 0.4311,
123
+ "eval_samples_per_second": 44.074,
124
+ "eval_steps_per_second": 6.959,
125
  "step": 48
126
  },
127
  {
128
  "epoch": 4.166666666666667,
129
+ "grad_norm": 0.5093292635141339,
130
+ "learning_rate": 1.6666666666666667e-05,
131
+ "loss": 3.4,
132
+ "mean_token_accuracy": 0.404203325510025,
133
  "step": 50
134
  },
135
  {
136
  "epoch": 4.583333333333333,
137
+ "grad_norm": 0.7243587198171676,
138
+ "learning_rate": 1.8333333333333333e-05,
139
+ "loss": 3.4412,
140
+ "mean_token_accuracy": 0.4058379828929901,
141
  "step": 55
142
  },
143
  {
144
  "epoch": 5.0,
145
+ "grad_norm": 0.8522717487007759,
146
+ "learning_rate": 2e-05,
147
+ "loss": 3.3885,
148
+ "mean_token_accuracy": 0.4081867039203644,
149
  "step": 60
150
  },
151
  {
152
  "epoch": 5.0,
153
+ "eval_loss": 3.523003101348877,
154
+ "eval_mean_token_accuracy": 0.41269586483637494,
155
+ "eval_runtime": 0.4315,
156
+ "eval_samples_per_second": 44.032,
157
+ "eval_steps_per_second": 6.952,
158
  "step": 60
159
  },
160
  {
161
  "epoch": 5.416666666666667,
162
+ "grad_norm": 0.5197308524638413,
163
+ "learning_rate": 1.9995769500822007e-05,
164
+ "loss": 3.4699,
165
+ "mean_token_accuracy": 0.40173509120941164,
166
  "step": 65
167
  },
168
  {
169
  "epoch": 5.833333333333333,
170
+ "grad_norm": 0.6040667002909094,
171
+ "learning_rate": 1.9983081582712684e-05,
172
+ "loss": 3.341,
173
+ "mean_token_accuracy": 0.4136533796787262,
174
  "step": 70
175
  },
176
  {
177
  "epoch": 6.0,
178
+ "eval_loss": 3.5047221183776855,
179
+ "eval_mean_token_accuracy": 0.4145672619342804,
180
+ "eval_runtime": 0.4362,
181
+ "eval_samples_per_second": 43.562,
182
+ "eval_steps_per_second": 6.878,
183
  "step": 72
184
  },
185
  {
186
  "epoch": 6.25,
187
+ "grad_norm": 0.48368556562954307,
188
+ "learning_rate": 1.9961946980917457e-05,
189
+ "loss": 3.4062,
190
+ "mean_token_accuracy": 0.4024519423643748,
191
  "step": 75
192
  },
193
  {
194
  "epoch": 6.666666666666667,
195
+ "grad_norm": 0.624860448459548,
196
+ "learning_rate": 1.9932383577419432e-05,
197
+ "loss": 3.3999,
198
+ "mean_token_accuracy": 0.41031280159950256,
199
  "step": 80
200
  },
201
  {
202
  "epoch": 7.0,
203
+ "eval_loss": 3.4868686199188232,
204
+ "eval_mean_token_accuracy": 0.41539154733930317,
205
+ "eval_runtime": 0.4392,
206
+ "eval_samples_per_second": 43.261,
207
+ "eval_steps_per_second": 6.831,
208
  "step": 84
209
  },
210
  {
211
  "epoch": 7.083333333333333,
212
+ "grad_norm": 0.5271197118444322,
213
+ "learning_rate": 1.9894416385809444e-05,
214
+ "loss": 3.3464,
215
+ "mean_token_accuracy": 0.395772248506546,
216
  "step": 85
217
  },
218
  {
219
  "epoch": 7.5,
220
+ "grad_norm": 0.6058343803263453,
221
+ "learning_rate": 1.9848077530122083e-05,
222
+ "loss": 3.332,
223
+ "mean_token_accuracy": 0.4168132901191711,
224
  "step": 90
225
  },
226
  {
227
  "epoch": 7.916666666666667,
228
+ "grad_norm": 0.5498361997261547,
229
+ "learning_rate": 1.9793406217655516e-05,
230
+ "loss": 3.3951,
231
+ "mean_token_accuracy": 0.40805121660232546,
232
  "step": 95
233
  },
234
  {
235
  "epoch": 8.0,
236
+ "eval_loss": 3.4701385498046875,
237
+ "eval_mean_token_accuracy": 0.417536236345768,
238
+ "eval_runtime": 0.4284,
239
+ "eval_samples_per_second": 44.348,
240
+ "eval_steps_per_second": 7.002,
241
  "step": 96
242
  },
243
  {
244
  "epoch": 8.333333333333334,
245
+ "grad_norm": 0.49021333765538,
246
+ "learning_rate": 1.973044870579824e-05,
247
+ "loss": 3.316,
248
+ "mean_token_accuracy": 0.41507820039987564,
249
  "step": 100
250
  },
251
  {
252
  "epoch": 8.75,
253
+ "grad_norm": 0.5876084010033806,
254
+ "learning_rate": 1.9659258262890683e-05,
255
+ "loss": 3.4082,
256
+ "mean_token_accuracy": 0.4088488757610321,
257
  "step": 105
258
  },
259
  {
260
  "epoch": 9.0,
261
+ "eval_loss": 3.4551661014556885,
262
+ "eval_mean_token_accuracy": 0.41253359615802765,
263
+ "eval_runtime": 0.4291,
264
+ "eval_samples_per_second": 44.28,
265
+ "eval_steps_per_second": 6.992,
266
  "step": 108
267
  },
268
  {
269
  "epoch": 9.166666666666666,
270
+ "grad_norm": 0.5075915346441139,
271
+ "learning_rate": 1.957989512315489e-05,
272
+ "loss": 3.3659,
273
+ "mean_token_accuracy": 0.4220624566078186,
274
  "step": 110
275
  },
276
  {
277
  "epoch": 9.583333333333334,
278
+ "grad_norm": 0.47536348642658954,
279
+ "learning_rate": 1.949242643573034e-05,
280
+ "loss": 3.3739,
281
+ "mean_token_accuracy": 0.40786901116371155,
282
  "step": 115
283
  },
284
  {
285
  "epoch": 10.0,
286
+ "grad_norm": 0.8249517268614861,
287
+ "learning_rate": 1.9396926207859085e-05,
288
+ "loss": 3.3766,
289
+ "mean_token_accuracy": 0.4057429075241089,
290
  "step": 120
291
  },
292
  {
293
  "epoch": 10.0,
294
+ "eval_loss": 3.440880298614502,
295
+ "eval_mean_token_accuracy": 0.4163530071576436,
296
+ "eval_runtime": 0.4465,
297
+ "eval_samples_per_second": 42.557,
298
+ "eval_steps_per_second": 6.72,
299
  "step": 120
300
  },
301
  {
302
  "epoch": 10.416666666666666,
303
+ "grad_norm": 0.5247440587444925,
304
+ "learning_rate": 1.9293475242268224e-05,
305
+ "loss": 3.3593,
306
+ "mean_token_accuracy": 0.40792441964149473,
307
  "step": 125
308
  },
309
  {
310
  "epoch": 10.833333333333334,
311
+ "grad_norm": 0.7084057790066337,
312
+ "learning_rate": 1.9182161068802742e-05,
313
+ "loss": 3.3108,
314
+ "mean_token_accuracy": 0.4163245320320129,
315
  "step": 130
316
  },
317
  {
318
  "epoch": 11.0,
319
+ "eval_loss": 3.426577091217041,
320
+ "eval_mean_token_accuracy": 0.41616382598876955,
321
+ "eval_runtime": 0.4272,
322
+ "eval_samples_per_second": 44.48,
323
+ "eval_steps_per_second": 7.023,
324
  "step": 132
325
  },
326
  {
327
  "epoch": 11.25,
328
+ "grad_norm": 0.5224822341776986,
329
+ "learning_rate": 1.9063077870366504e-05,
330
+ "loss": 3.2782,
331
+ "mean_token_accuracy": 0.4194770356019338,
332
  "step": 135
333
  },
334
  {
335
  "epoch": 11.666666666666666,
336
+ "grad_norm": 0.45694283255218365,
337
+ "learning_rate": 1.8936326403234125e-05,
338
+ "loss": 3.3102,
339
+ "mean_token_accuracy": 0.41273156404495237,
340
  "step": 140
341
  },
342
  {
343
  "epoch": 12.0,
344
+ "eval_loss": 3.4135658740997314,
345
+ "eval_mean_token_accuracy": 0.41462882501738413,
346
+ "eval_runtime": 0.4547,
347
+ "eval_samples_per_second": 41.783,
348
+ "eval_steps_per_second": 6.597,
349
  "step": 144
350
  },
351
  {
352
  "epoch": 12.083333333333334,
353
+ "grad_norm": 0.4475822239307412,
354
+ "learning_rate": 1.880201391180111e-05,
355
+ "loss": 3.37,
356
+ "mean_token_accuracy": 0.40713587403297424,
357
  "step": 145
358
  },
359
  {
360
  "epoch": 12.5,
361
+ "grad_norm": 0.5801727962051494,
362
+ "learning_rate": 1.866025403784439e-05,
363
+ "loss": 3.3554,
364
+ "mean_token_accuracy": 0.41182795763015745,
365
  "step": 150
366
  },
367
  {
368
  "epoch": 12.916666666666666,
369
+ "grad_norm": 0.5874670469007709,
370
+ "learning_rate": 1.8511166724369997e-05,
371
+ "loss": 3.2614,
372
+ "mean_token_accuracy": 0.4187495827674866,
373
  "step": 155
374
  },
375
  {
376
  "epoch": 13.0,
377
+ "eval_loss": 3.4014406204223633,
378
+ "eval_mean_token_accuracy": 0.4147283807396889,
379
+ "eval_runtime": 0.4294,
380
+ "eval_samples_per_second": 44.244,
381
+ "eval_steps_per_second": 6.986,
382
  "step": 156
383
  },
384
  {
385
  "epoch": 13.333333333333334,
386
+ "grad_norm": 0.550319404785184,
387
+ "learning_rate": 1.8354878114129368e-05,
388
+ "loss": 3.2805,
389
+ "mean_token_accuracy": 0.414131224155426,
390
  "step": 160
391
  },
392
  {
393
  "epoch": 13.75,
394
+ "grad_norm": 0.5904370361609361,
395
+ "learning_rate": 1.819152044288992e-05,
396
+ "loss": 3.2972,
397
+ "mean_token_accuracy": 0.41746036410331727,
398
  "step": 165
399
  },
400
  {
401
  "epoch": 14.0,
402
+ "eval_loss": 3.3907692432403564,
403
+ "eval_mean_token_accuracy": 0.4185921351114909,
404
+ "eval_runtime": 0.428,
405
+ "eval_samples_per_second": 44.396,
406
+ "eval_steps_per_second": 7.01,
407
  "step": 168
408
  },
409
  {
410
  "epoch": 14.166666666666666,
411
+ "grad_norm": 0.5164762014150277,
412
+ "learning_rate": 1.802123192755044e-05,
413
+ "loss": 3.2903,
414
+ "mean_token_accuracy": 0.41526149213314056,
415
  "step": 170
416
  },
417
  {
418
  "epoch": 14.583333333333334,
419
+ "grad_norm": 0.6449998942915702,
420
+ "learning_rate": 1.784415664919576e-05,
421
+ "loss": 3.2937,
422
+ "mean_token_accuracy": 0.4124247670173645,
423
  "step": 175
424
  },
425
  {
426
  "epoch": 15.0,
427
+ "grad_norm": 0.9850442727409189,
428
+ "learning_rate": 1.766044443118978e-05,
429
+ "loss": 3.2699,
430
+ "mean_token_accuracy": 0.41933039426803587,
431
  "step": 180
432
  },
433
  {
434
  "epoch": 15.0,
435
+ "eval_loss": 3.380833625793457,
436
+ "eval_mean_token_accuracy": 0.4198370774586995,
437
+ "eval_runtime": 0.4534,
438
+ "eval_samples_per_second": 41.906,
439
+ "eval_steps_per_second": 6.617,
440
  "step": 180
441
  },
442
  {
443
  "epoch": 15.416666666666666,
444
+ "grad_norm": 0.5357760809508904,
445
+ "learning_rate": 1.7470250712409963e-05,
446
+ "loss": 3.2893,
447
+ "mean_token_accuracy": 0.4150129497051239,
448
  "step": 185
449
  },
450
  {
451
  "epoch": 15.833333333333334,
452
+ "grad_norm": 0.6312781940606058,
453
+ "learning_rate": 1.7273736415730488e-05,
454
+ "loss": 3.2636,
455
+ "mean_token_accuracy": 0.41713098883628846,
456
  "step": 190
457
  },
458
  {
459
  "epoch": 16.0,
460
+ "eval_loss": 3.3715903759002686,
461
+ "eval_mean_token_accuracy": 0.41947720646858216,
462
+ "eval_runtime": 0.5943,
463
+ "eval_samples_per_second": 31.971,
464
+ "eval_steps_per_second": 5.048,
465
  "step": 192
466
  },
467
  {
468
  "epoch": 16.25,
469
+ "grad_norm": 0.5229064222364138,
470
+ "learning_rate": 1.7071067811865477e-05,
471
+ "loss": 3.2393,
472
+ "mean_token_accuracy": 0.41088302930196124,
473
  "step": 195
474
  },
475
  {
476
  "epoch": 16.666666666666668,
477
+ "grad_norm": 0.517899162096867,
478
+ "learning_rate": 1.686241637868734e-05,
479
+ "loss": 3.2543,
480
+ "mean_token_accuracy": 0.4211387991905212,
481
  "step": 200
482
  },
483
  {
484
  "epoch": 17.0,
485
+ "eval_loss": 3.3629448413848877,
486
+ "eval_mean_token_accuracy": 0.41943295512880596,
487
+ "eval_runtime": 0.4287,
488
+ "eval_samples_per_second": 44.317,
489
+ "eval_steps_per_second": 6.997,
490
  "step": 204
491
  },
492
  {
493
  "epoch": 17.083333333333332,
494
+ "grad_norm": 0.5400119094778694,
495
+ "learning_rate": 1.6647958656139377e-05,
496
+ "loss": 3.2249,
497
+ "mean_token_accuracy": 0.4193548262119293,
498
  "step": 205
499
  },
500
  {
501
  "epoch": 17.5,
502
+ "grad_norm": 0.488184108091038,
503
+ "learning_rate": 1.6427876096865394e-05,
504
+ "loss": 3.1743,
505
+ "mean_token_accuracy": 0.42307103872299195,
506
  "step": 210
507
  },
508
  {
509
  "epoch": 17.916666666666668,
510
+ "grad_norm": 0.6484010807393733,
511
+ "learning_rate": 1.6202354912682602e-05,
512
+ "loss": 3.3001,
513
+ "mean_token_accuracy": 0.41410067677497864,
514
  "step": 215
515
  },
516
  {
517
  "epoch": 18.0,
518
+ "eval_loss": 3.354940176010132,
519
+ "eval_mean_token_accuracy": 0.41730327904224396,
520
+ "eval_runtime": 0.431,
521
+ "eval_samples_per_second": 44.089,
522
+ "eval_steps_per_second": 6.961,
523
  "step": 216
524
  },
525
  {
526
  "epoch": 18.333333333333332,
527
+ "grad_norm": 0.5674421464466002,
528
+ "learning_rate": 1.5971585917027864e-05,
529
+ "loss": 3.2051,
530
+ "mean_token_accuracy": 0.4223179370164871,
531
  "step": 220
532
  },
533
  {
534
  "epoch": 18.75,
535
+ "grad_norm": 0.5903205133298044,
536
+ "learning_rate": 1.573576436351046e-05,
537
+ "loss": 3.3103,
538
+ "mean_token_accuracy": 0.4130250930786133,
539
  "step": 225
540
  },
541
  {
542
  "epoch": 19.0,
543
+ "eval_loss": 3.347010850906372,
544
+ "eval_mean_token_accuracy": 0.4260312815507253,
545
+ "eval_runtime": 0.4239,
546
+ "eval_samples_per_second": 44.822,
547
+ "eval_steps_per_second": 7.077,
548
  "step": 228
549
  },
550
  {
551
  "epoch": 19.166666666666668,
552
+ "grad_norm": 0.6190609888103829,
553
+ "learning_rate": 1.5495089780708062e-05,
554
+ "loss": 3.2368,
555
+ "mean_token_accuracy": 0.4261363744735718,
556
  "step": 230
557
  },
558
  {
559
  "epoch": 19.583333333333332,
560
+ "grad_norm": 0.6450479176223197,
561
+ "learning_rate": 1.5249765803345602e-05,
562
+ "loss": 3.2506,
563
+ "mean_token_accuracy": 0.41713098287582395,
564
  "step": 235
565
  },
566
  {
567
  "epoch": 20.0,
568
+ "grad_norm": 0.9163763629667666,
569
+ "learning_rate": 1.5000000000000002e-05,
570
+ "loss": 3.1941,
571
+ "mean_token_accuracy": 0.41965908408164976,
572
  "step": 240
573
  },
574
  {
575
  "epoch": 20.0,
576
+ "eval_loss": 3.339346170425415,
577
+ "eval_mean_token_accuracy": 0.42223334312438965,
578
+ "eval_runtime": 0.5247,
579
+ "eval_samples_per_second": 36.208,
580
+ "eval_steps_per_second": 5.717,
581
  "step": 240
582
  },
583
  {
584
+ "epoch": 20.416666666666668,
585
+ "grad_norm": 0.6468751066404622,
586
+ "learning_rate": 1.4746003697476406e-05,
587
+ "loss": 3.2192,
588
+ "mean_token_accuracy": 0.4208211123943329,
589
+ "step": 245
590
+ },
591
+ {
592
+ "epoch": 20.833333333333332,
593
+ "grad_norm": 0.6683774245030316,
594
+ "learning_rate": 1.4487991802004625e-05,
595
+ "loss": 3.2221,
596
+ "mean_token_accuracy": 0.4196500062942505,
597
+ "step": 250
598
+ },
599
+ {
600
+ "epoch": 21.0,
601
+ "eval_loss": 3.331696033477783,
602
+ "eval_mean_token_accuracy": 0.4289926946163177,
603
+ "eval_runtime": 0.4271,
604
+ "eval_samples_per_second": 44.486,
605
+ "eval_steps_per_second": 7.024,
606
+ "step": 252
607
+ },
608
+ {
609
+ "epoch": 21.25,
610
+ "grad_norm": 0.5643106555296927,
611
+ "learning_rate": 1.4226182617406996e-05,
612
+ "loss": 3.2169,
613
+ "mean_token_accuracy": 0.41858096917470294,
614
+ "step": 255
615
+ },
616
+ {
617
+ "epoch": 21.666666666666668,
618
+ "grad_norm": 0.5977762587235801,
619
+ "learning_rate": 1.396079766039157e-05,
620
+ "loss": 3.2027,
621
+ "mean_token_accuracy": 0.4227761447429657,
622
+ "step": 260
623
+ },
624
+ {
625
+ "epoch": 22.0,
626
+ "eval_loss": 3.324859142303467,
627
+ "eval_mean_token_accuracy": 0.41909594195229666,
628
+ "eval_runtime": 0.4259,
629
+ "eval_samples_per_second": 44.615,
630
+ "eval_steps_per_second": 7.044,
631
+ "step": 264
632
+ },
633
+ {
634
+ "epoch": 22.083333333333332,
635
+ "grad_norm": 0.6213218140554986,
636
+ "learning_rate": 1.3692061473126845e-05,
637
+ "loss": 3.2305,
638
+ "mean_token_accuracy": 0.4119012653827667,
639
+ "step": 265
640
+ },
641
+ {
642
+ "epoch": 22.5,
643
+ "grad_norm": 0.6275730529051374,
644
+ "learning_rate": 1.342020143325669e-05,
645
+ "loss": 3.1752,
646
+ "mean_token_accuracy": 0.42231183052062987,
647
+ "step": 270
648
+ },
649
+ {
650
+ "epoch": 22.916666666666668,
651
+ "grad_norm": 0.5390571854463481,
652
+ "learning_rate": 1.3145447561516138e-05,
653
+ "loss": 3.2192,
654
+ "mean_token_accuracy": 0.4236361861228943,
655
+ "step": 275
656
+ },
657
+ {
658
+ "epoch": 23.0,
659
+ "eval_loss": 3.31854248046875,
660
+ "eval_mean_token_accuracy": 0.4187733605504036,
661
+ "eval_runtime": 0.4331,
662
+ "eval_samples_per_second": 43.866,
663
+ "eval_steps_per_second": 6.926,
664
+ "step": 276
665
+ },
666
+ {
667
+ "epoch": 23.333333333333332,
668
+ "grad_norm": 0.54125472321236,
669
+ "learning_rate": 1.2868032327110904e-05,
670
+ "loss": 3.2537,
671
+ "mean_token_accuracy": 0.41950757056474686,
672
+ "step": 280
673
+ },
674
+ {
675
+ "epoch": 23.75,
676
+ "grad_norm": 0.5722463822439342,
677
+ "learning_rate": 1.2588190451025209e-05,
678
+ "loss": 3.1885,
679
+ "mean_token_accuracy": 0.4205010116100311,
680
+ "step": 285
681
+ },
682
+ {
683
+ "epoch": 24.0,
684
+ "eval_loss": 3.3127944469451904,
685
+ "eval_mean_token_accuracy": 0.422769491871198,
686
+ "eval_runtime": 0.4219,
687
+ "eval_samples_per_second": 45.038,
688
+ "eval_steps_per_second": 7.111,
689
+ "step": 288
690
+ },
691
+ {
692
+ "epoch": 24.166666666666668,
693
+ "grad_norm": 0.679559216877849,
694
+ "learning_rate": 1.2306158707424402e-05,
695
+ "loss": 3.2156,
696
+ "mean_token_accuracy": 0.42124877870082855,
697
+ "step": 290
698
+ },
699
+ {
700
+ "epoch": 24.583333333333332,
701
+ "grad_norm": 0.5610092093025837,
702
+ "learning_rate": 1.2022175723320382e-05,
703
+ "loss": 3.1649,
704
+ "mean_token_accuracy": 0.42517107129096987,
705
+ "step": 295
706
+ },
707
+ {
708
+ "epoch": 25.0,
709
+ "grad_norm": 1.426121211640697,
710
+ "learning_rate": 1.1736481776669307e-05,
711
+ "loss": 3.2052,
712
+ "mean_token_accuracy": 0.42243062853813174,
713
+ "step": 300
714
+ },
715
+ {
716
+ "epoch": 25.0,
717
+ "eval_loss": 3.307049512863159,
718
+ "eval_mean_token_accuracy": 0.4235536555449168,
719
+ "eval_runtime": 0.4166,
720
+ "eval_samples_per_second": 45.602,
721
+ "eval_steps_per_second": 7.2,
722
+ "step": 300
723
+ },
724
+ {
725
+ "epoch": 25.416666666666668,
726
+ "grad_norm": 0.5665773108951218,
727
+ "learning_rate": 1.1449318593072468e-05,
728
+ "loss": 3.1675,
729
+ "mean_token_accuracy": 0.4225806474685669,
730
+ "step": 305
731
+ },
732
+ {
733
+ "epoch": 25.833333333333332,
734
+ "grad_norm": 0.6226920179957871,
735
+ "learning_rate": 1.1160929141252303e-05,
736
+ "loss": 3.192,
737
+ "mean_token_accuracy": 0.42431179285049436,
738
+ "step": 310
739
+ },
740
+ {
741
+ "epoch": 26.0,
742
+ "eval_loss": 3.3022124767303467,
743
+ "eval_mean_token_accuracy": 0.4199638843536377,
744
+ "eval_runtime": 0.4381,
745
+ "eval_samples_per_second": 43.371,
746
+ "eval_steps_per_second": 6.848,
747
+ "step": 312
748
+ },
749
+ {
750
+ "epoch": 26.25,
751
+ "grad_norm": 0.6087103966453086,
752
+ "learning_rate": 1.0871557427476585e-05,
753
+ "loss": 3.2605,
754
+ "mean_token_accuracy": 0.417011300722758,
755
+ "step": 315
756
+ },
757
+ {
758
+ "epoch": 26.666666666666668,
759
+ "grad_norm": 0.5698423662256255,
760
+ "learning_rate": 1.0581448289104759e-05,
761
+ "loss": 3.1541,
762
+ "mean_token_accuracy": 0.4270772337913513,
763
+ "step": 320
764
+ },
765
+ {
766
+ "epoch": 27.0,
767
+ "eval_loss": 3.297753095626831,
768
+ "eval_mean_token_accuracy": 0.42913591861724854,
769
+ "eval_runtime": 0.4251,
770
+ "eval_samples_per_second": 44.69,
771
+ "eval_steps_per_second": 7.056,
772
+ "step": 324
773
+ },
774
+ {
775
+ "epoch": 27.083333333333332,
776
+ "grad_norm": 0.6813246252702757,
777
+ "learning_rate": 1.0290847187431115e-05,
778
+ "loss": 3.1759,
779
+ "mean_token_accuracy": 0.419721394777298,
780
+ "step": 325
781
+ },
782
+ {
783
+ "epoch": 27.5,
784
+ "grad_norm": 0.5500989419947021,
785
+ "learning_rate": 1e-05,
786
+ "loss": 3.1638,
787
+ "mean_token_accuracy": 0.4259925663471222,
788
+ "step": 330
789
+ },
790
+ {
791
+ "epoch": 27.916666666666668,
792
+ "grad_norm": 0.5981580816134079,
793
+ "learning_rate": 9.709152812568886e-06,
794
+ "loss": 3.174,
795
+ "mean_token_accuracy": 0.4233870983123779,
796
+ "step": 335
797
+ },
798
+ {
799
+ "epoch": 28.0,
800
+ "eval_loss": 3.293494462966919,
801
+ "eval_mean_token_accuracy": 0.4238861948251724,
802
+ "eval_runtime": 0.4411,
803
+ "eval_samples_per_second": 43.07,
804
+ "eval_steps_per_second": 6.8,
805
+ "step": 336
806
+ },
807
+ {
808
+ "epoch": 28.333333333333332,
809
+ "grad_norm": 0.5930311679061705,
810
+ "learning_rate": 9.418551710895243e-06,
811
+ "loss": 3.1868,
812
+ "mean_token_accuracy": 0.4231025353074074,
813
+ "step": 340
814
+ },
815
+ {
816
+ "epoch": 28.75,
817
+ "grad_norm": 0.5754287350322016,
818
+ "learning_rate": 9.128442572523418e-06,
819
+ "loss": 3.1808,
820
+ "mean_token_accuracy": 0.4236803472042084,
821
+ "step": 345
822
+ },
823
+ {
824
+ "epoch": 29.0,
825
+ "eval_loss": 3.2898471355438232,
826
+ "eval_mean_token_accuracy": 0.4265649865070979,
827
+ "eval_runtime": 0.4268,
828
+ "eval_samples_per_second": 44.518,
829
+ "eval_steps_per_second": 7.029,
830
+ "step": 348
831
+ },
832
+ {
833
+ "epoch": 29.166666666666668,
834
+ "grad_norm": 0.604278267795617,
835
+ "learning_rate": 8.839070858747697e-06,
836
+ "loss": 3.2018,
837
+ "mean_token_accuracy": 0.41849951446056366,
838
+ "step": 350
839
+ },
840
+ {
841
+ "epoch": 29.583333333333332,
842
+ "grad_norm": 0.6525176961848282,
843
+ "learning_rate": 8.550681406927534e-06,
844
+ "loss": 3.1601,
845
+ "mean_token_accuracy": 0.42639296054840087,
846
+ "step": 355
847
+ },
848
+ {
849
+ "epoch": 30.0,
850
+ "grad_norm": 1.3214766985272957,
851
+ "learning_rate": 8.263518223330698e-06,
852
+ "loss": 3.2003,
853
+ "mean_token_accuracy": 0.4250500977039337,
854
+ "step": 360
855
+ },
856
+ {
857
+ "epoch": 30.0,
858
+ "eval_loss": 3.2868618965148926,
859
+ "eval_mean_token_accuracy": 0.42555957039197284,
860
+ "eval_runtime": 0.4245,
861
+ "eval_samples_per_second": 44.754,
862
+ "eval_steps_per_second": 7.066,
863
+ "step": 360
864
+ },
865
+ {
866
+ "epoch": 30.416666666666668,
867
+ "grad_norm": 0.5951289198915272,
868
+ "learning_rate": 7.977824276679623e-06,
869
+ "loss": 3.1575,
870
+ "mean_token_accuracy": 0.4253473997116089,
871
+ "step": 365
872
+ },
873
+ {
874
+ "epoch": 30.833333333333332,
875
+ "grad_norm": 0.6178093593706783,
876
+ "learning_rate": 7.6938412925756e-06,
877
+ "loss": 3.1917,
878
+ "mean_token_accuracy": 0.4213587462902069,
879
+ "step": 370
880
+ },
881
+ {
882
+ "epoch": 31.0,
883
+ "eval_loss": 3.2837605476379395,
884
+ "eval_mean_token_accuracy": 0.4344413161277771,
885
+ "eval_runtime": 0.437,
886
+ "eval_samples_per_second": 43.477,
887
+ "eval_steps_per_second": 6.865,
888
+ "step": 372
889
+ },
890
+ {
891
+ "epoch": 31.25,
892
+ "grad_norm": 0.7052320977702828,
893
+ "learning_rate": 7.411809548974792e-06,
894
+ "loss": 3.1153,
895
+ "mean_token_accuracy": 0.4197881321112315,
896
+ "step": 375
897
+ },
898
+ {
899
+ "epoch": 31.666666666666668,
900
+ "grad_norm": 0.5285913317420398,
901
+ "learning_rate": 7.131967672889101e-06,
902
+ "loss": 3.1552,
903
+ "mean_token_accuracy": 0.42627077698707583,
904
+ "step": 380
905
+ },
906
+ {
907
+ "epoch": 32.0,
908
+ "eval_loss": 3.2812047004699707,
909
+ "eval_mean_token_accuracy": 0.42681941390037537,
910
+ "eval_runtime": 0.449,
911
+ "eval_samples_per_second": 42.32,
912
+ "eval_steps_per_second": 6.682,
913
+ "step": 384
914
+ },
915
+ {
916
+ "epoch": 32.083333333333336,
917
+ "grad_norm": 0.5722095071093364,
918
+ "learning_rate": 6.854552438483866e-06,
919
+ "loss": 3.1829,
920
+ "mean_token_accuracy": 0.4199657738208771,
921
+ "step": 385
922
+ },
923
+ {
924
+ "epoch": 32.5,
925
+ "grad_norm": 0.7633904658788278,
926
+ "learning_rate": 6.579798566743314e-06,
927
+ "loss": 3.1684,
928
+ "mean_token_accuracy": 0.422702831029892,
929
+ "step": 390
930
+ },
931
+ {
932
+ "epoch": 32.916666666666664,
933
+ "grad_norm": 0.7552346932336533,
934
+ "learning_rate": 6.3079385268731575e-06,
935
+ "loss": 3.1554,
936
+ "mean_token_accuracy": 0.42810051441192626,
937
+ "step": 395
938
+ },
939
+ {
940
+ "epoch": 33.0,
941
+ "eval_loss": 3.279100179672241,
942
+ "eval_mean_token_accuracy": 0.4279579147696495,
943
+ "eval_runtime": 0.4241,
944
+ "eval_samples_per_second": 44.798,
945
+ "eval_steps_per_second": 7.073,
946
+ "step": 396
947
+ },
948
+ {
949
+ "epoch": 33.333333333333336,
950
+ "grad_norm": 0.6276366843863107,
951
+ "learning_rate": 6.039202339608432e-06,
952
+ "loss": 3.1644,
953
+ "mean_token_accuracy": 0.4224095791578293,
954
+ "step": 400
955
+ },
956
+ {
957
+ "epoch": 33.75,
958
+ "grad_norm": 0.6661697390297467,
959
+ "learning_rate": 5.773817382593008e-06,
960
+ "loss": 3.1175,
961
+ "mean_token_accuracy": 0.43118279576301577,
962
+ "step": 405
963
+ },
964
+ {
965
+ "epoch": 34.0,
966
+ "eval_loss": 3.2770273685455322,
967
+ "eval_mean_token_accuracy": 0.41977598269780475,
968
+ "eval_runtime": 0.4259,
969
+ "eval_samples_per_second": 44.612,
970
+ "eval_steps_per_second": 7.044,
971
+ "step": 408
972
+ },
973
+ {
974
+ "epoch": 34.166666666666664,
975
+ "grad_norm": 0.6396786596872565,
976
+ "learning_rate": 5.512008197995379e-06,
977
+ "loss": 3.2052,
978
+ "mean_token_accuracy": 0.42155425250530243,
979
+ "step": 410
980
+ },
981
+ {
982
+ "epoch": 34.583333333333336,
983
+ "grad_norm": 0.7372526034823661,
984
+ "learning_rate": 5.253996302523596e-06,
985
+ "loss": 3.1537,
986
+ "mean_token_accuracy": 0.42629474997520445,
987
+ "step": 415
988
+ },
989
+ {
990
+ "epoch": 35.0,
991
+ "grad_norm": 1.3769536069183301,
992
+ "learning_rate": 5.000000000000003e-06,
993
+ "loss": 3.152,
994
+ "mean_token_accuracy": 0.42785924077034,
995
+ "step": 420
996
+ },
997
+ {
998
+ "epoch": 35.0,
999
+ "eval_loss": 3.275233507156372,
1000
+ "eval_mean_token_accuracy": 0.4261179069677989,
1001
+ "eval_runtime": 0.4346,
1002
+ "eval_samples_per_second": 43.72,
1003
+ "eval_steps_per_second": 6.903,
1004
+ "step": 420
1005
+ },
1006
+ {
1007
+ "epoch": 35.416666666666664,
1008
+ "grad_norm": 0.607101085973905,
1009
+ "learning_rate": 4.7502341966544e-06,
1010
+ "loss": 3.157,
1011
+ "mean_token_accuracy": 0.4252443790435791,
1012
+ "step": 425
1013
+ },
1014
+ {
1015
+ "epoch": 35.833333333333336,
1016
+ "grad_norm": 0.6902217168837974,
1017
+ "learning_rate": 4.504910219291941e-06,
1018
+ "loss": 3.2004,
1019
+ "mean_token_accuracy": 0.42346973419189454,
1020
+ "step": 430
1021
+ },
1022
+ {
1023
+ "epoch": 36.0,
1024
+ "eval_loss": 3.2738254070281982,
1025
+ "eval_mean_token_accuracy": 0.43027973771095274,
1026
+ "eval_runtime": 0.4432,
1027
+ "eval_samples_per_second": 42.866,
1028
+ "eval_steps_per_second": 6.768,
1029
+ "step": 432
1030
+ },
1031
+ {
1032
+ "epoch": 36.25,
1033
+ "grad_norm": 0.6833076608186851,
1034
+ "learning_rate": 4.264235636489542e-06,
1035
+ "loss": 3.0659,
1036
+ "mean_token_accuracy": 0.426602840423584,
1037
+ "step": 435
1038
+ },
1039
+ {
1040
+ "epoch": 36.666666666666664,
1041
+ "grad_norm": 0.5918683696670577,
1042
+ "learning_rate": 4.028414082972141e-06,
1043
+ "loss": 3.1312,
1044
+ "mean_token_accuracy": 0.42512218952178954,
1045
+ "step": 440
1046
+ },
1047
+ {
1048
+ "epoch": 37.0,
1049
+ "eval_loss": 3.2725744247436523,
1050
+ "eval_mean_token_accuracy": 0.42391158853258404,
1051
+ "eval_runtime": 0.4318,
1052
+ "eval_samples_per_second": 44.001,
1053
+ "eval_steps_per_second": 6.947,
1054
+ "step": 444
1055
+ },
1056
+ {
1057
+ "epoch": 37.083333333333336,
1058
+ "grad_norm": 0.6190943636017486,
1059
+ "learning_rate": 3.797645087317401e-06,
1060
+ "loss": 3.1971,
1061
+ "mean_token_accuracy": 0.4199657738208771,
1062
+ "step": 445
1063
+ },
1064
+ {
1065
+ "epoch": 37.5,
1066
+ "grad_norm": 0.7187996022384058,
1067
+ "learning_rate": 3.5721239031346067e-06,
1068
+ "loss": 3.1919,
1069
+ "mean_token_accuracy": 0.42737048864364624,
1070
+ "step": 450
1071
+ },
1072
+ {
1073
+ "epoch": 37.916666666666664,
1074
+ "grad_norm": 0.5959621609292731,
1075
+ "learning_rate": 3.3520413438606215e-06,
1076
+ "loss": 3.122,
1077
+ "mean_token_accuracy": 0.425152450799942,
1078
+ "step": 455
1079
+ },
1080
+ {
1081
+ "epoch": 38.0,
1082
+ "eval_loss": 3.2714850902557373,
1083
+ "eval_mean_token_accuracy": 0.42657437175512314,
1084
+ "eval_runtime": 0.4313,
1085
+ "eval_samples_per_second": 44.056,
1086
+ "eval_steps_per_second": 6.956,
1087
+ "step": 456
1088
+ },
1089
+ {
1090
+ "epoch": 38.333333333333336,
1091
+ "grad_norm": 0.5510804324691496,
1092
+ "learning_rate": 3.1375836213126653e-06,
1093
+ "loss": 3.1652,
1094
+ "mean_token_accuracy": 0.4300769865512848,
1095
+ "step": 460
1096
+ },
1097
+ {
1098
+ "epoch": 38.75,
1099
+ "grad_norm": 0.7109787627150623,
1100
+ "learning_rate": 2.9289321881345257e-06,
1101
+ "loss": 3.0988,
1102
+ "mean_token_accuracy": 0.4267013967037201,
1103
+ "step": 465
1104
+ },
1105
+ {
1106
+ "epoch": 39.0,
1107
+ "eval_loss": 3.270362615585327,
1108
+ "eval_mean_token_accuracy": 0.4215492556492488,
1109
+ "eval_runtime": 0.4303,
1110
+ "eval_samples_per_second": 44.155,
1111
+ "eval_steps_per_second": 6.972,
1112
+ "step": 468
1113
+ },
1114
+ {
1115
+ "epoch": 39.166666666666664,
1116
+ "grad_norm": 0.774771996806958,
1117
+ "learning_rate": 2.726263584269513e-06,
1118
+ "loss": 3.1793,
1119
+ "mean_token_accuracy": 0.4258919805288315,
1120
+ "step": 470
1121
+ },
1122
+ {
1123
+ "epoch": 39.583333333333336,
1124
+ "grad_norm": 0.6131163287949375,
1125
+ "learning_rate": 2.529749287590042e-06,
1126
+ "loss": 3.1375,
1127
+ "mean_token_accuracy": 0.42778592705726626,
1128
+ "step": 475
1129
+ },
1130
+ {
1131
+ "epoch": 40.0,
1132
+ "grad_norm": 1.2153887550210514,
1133
+ "learning_rate": 2.339555568810221e-06,
1134
+ "loss": 3.145,
1135
+ "mean_token_accuracy": 0.42437560558319093,
1136
+ "step": 480
1137
+ },
1138
+ {
1139
+ "epoch": 40.0,
1140
+ "eval_loss": 3.2696785926818848,
1141
+ "eval_mean_token_accuracy": 0.4261993666489919,
1142
+ "eval_runtime": 0.429,
1143
+ "eval_samples_per_second": 44.291,
1144
+ "eval_steps_per_second": 6.993,
1145
+ "step": 480
1146
+ },
1147
+ {
1148
+ "epoch": 40.416666666666664,
1149
+ "grad_norm": 0.6720557667088608,
1150
+ "learning_rate": 2.155843350804243e-06,
1151
+ "loss": 3.1445,
1152
+ "mean_token_accuracy": 0.42810567617416384,
1153
+ "step": 485
1154
+ },
1155
+ {
1156
+ "epoch": 40.833333333333336,
1157
+ "grad_norm": 0.6250108698940896,
1158
+ "learning_rate": 1.9787680724495617e-06,
1159
+ "loss": 3.1776,
1160
+ "mean_token_accuracy": 0.42465786933898925,
1161
+ "step": 490
1162
+ },
1163
+ {
1164
+ "epoch": 41.0,
1165
+ "eval_loss": 3.2690887451171875,
1166
+ "eval_mean_token_accuracy": 0.4265896141529083,
1167
+ "eval_runtime": 0.4278,
1168
+ "eval_samples_per_second": 44.411,
1169
+ "eval_steps_per_second": 7.012,
1170
+ "step": 492
1171
+ },
1172
+ {
1173
+ "epoch": 41.25,
1174
+ "grad_norm": 0.5760551828751769,
1175
+ "learning_rate": 1.808479557110081e-06,
1176
+ "loss": 3.0955,
1177
+ "mean_token_accuracy": 0.42819322148958844,
1178
+ "step": 495
1179
+ },
1180
+ {
1181
+ "epoch": 41.666666666666664,
1182
+ "grad_norm": 0.6656339982260218,
1183
+ "learning_rate": 1.6451218858706374e-06,
1184
+ "loss": 3.1127,
1185
+ "mean_token_accuracy": 0.4301280200481415,
1186
+ "step": 500
1187
+ },
1188
+ {
1189
+ "epoch": 42.0,
1190
+ "eval_loss": 3.268610954284668,
1191
+ "eval_mean_token_accuracy": 0.42469198788915363,
1192
+ "eval_runtime": 0.4334,
1193
+ "eval_samples_per_second": 43.84,
1194
+ "eval_steps_per_second": 6.922,
1195
+ "step": 504
1196
+ },
1197
+ {
1198
+ "epoch": 42.083333333333336,
1199
+ "grad_norm": 0.7475978754963217,
1200
+ "learning_rate": 1.4888332756300027e-06,
1201
+ "loss": 3.1808,
1202
+ "mean_token_accuracy": 0.423631489276886,
1203
+ "step": 505
1204
+ },
1205
+ {
1206
+ "epoch": 42.5,
1207
+ "grad_norm": 0.6501237190023742,
1208
+ "learning_rate": 1.339745962155613e-06,
1209
+ "loss": 3.1527,
1210
+ "mean_token_accuracy": 0.4245112419128418,
1211
+ "step": 510
1212
+ },
1213
+ {
1214
+ "epoch": 42.916666666666664,
1215
+ "grad_norm": 0.62569750833363,
1216
+ "learning_rate": 1.1979860881988903e-06,
1217
+ "loss": 3.1386,
1218
+ "mean_token_accuracy": 0.428892993927002,
1219
+ "step": 515
1220
+ },
1221
+ {
1222
+ "epoch": 43.0,
1223
+ "eval_loss": 3.268239974975586,
1224
+ "eval_mean_token_accuracy": 0.42221880704164505,
1225
+ "eval_runtime": 0.43,
1226
+ "eval_samples_per_second": 44.182,
1227
+ "eval_steps_per_second": 6.976,
1228
+ "step": 516
1229
+ },
1230
+ {
1231
+ "epoch": 43.333333333333336,
1232
+ "grad_norm": 0.554243721179707,
1233
+ "learning_rate": 1.0636735967658785e-06,
1234
+ "loss": 3.1772,
1235
+ "mean_token_accuracy": 0.42421653121709824,
1236
+ "step": 520
1237
+ },
1238
+ {
1239
+ "epoch": 43.75,
1240
+ "grad_norm": 0.5721327568599583,
1241
+ "learning_rate": 9.369221296335007e-07,
1242
+ "loss": 3.1272,
1243
+ "mean_token_accuracy": 0.4287878811359406,
1244
+ "step": 525
1245
+ },
1246
+ {
1247
+ "epoch": 44.0,
1248
+ "eval_loss": 3.267944812774658,
1249
+ "eval_mean_token_accuracy": 0.4253371407588323,
1250
+ "eval_runtime": 0.4335,
1251
+ "eval_samples_per_second": 43.832,
1252
+ "eval_steps_per_second": 6.921,
1253
+ "step": 528
1254
+ },
1255
+ {
1256
+ "epoch": 44.166666666666664,
1257
+ "grad_norm": 0.7430429454372077,
1258
+ "learning_rate": 8.178389311972612e-07,
1259
+ "loss": 3.1615,
1260
+ "mean_token_accuracy": 0.4282746911048889,
1261
+ "step": 530
1262
+ },
1263
+ {
1264
+ "epoch": 44.583333333333336,
1265
+ "grad_norm": 0.7042749042309677,
1266
+ "learning_rate": 7.065247577317747e-07,
1267
+ "loss": 3.1264,
1268
+ "mean_token_accuracy": 0.42416911125183104,
1269
+ "step": 535
1270
+ },
1271
+ {
1272
+ "epoch": 45.0,
1273
+ "grad_norm": 1.2711783589705017,
1274
+ "learning_rate": 6.030737921409169e-07,
1275
+ "loss": 3.1803,
1276
+ "mean_token_accuracy": 0.42598907351493837,
1277
+ "step": 540
1278
+ },
1279
+ {
1280
+ "epoch": 45.0,
1281
+ "eval_loss": 3.2677195072174072,
1282
+ "eval_mean_token_accuracy": 0.42655404408772785,
1283
+ "eval_runtime": 0.4208,
1284
+ "eval_samples_per_second": 45.15,
1285
+ "eval_steps_per_second": 7.129,
1286
+ "step": 540
1287
+ },
1288
+ {
1289
+ "epoch": 45.416666666666664,
1290
+ "grad_norm": 0.5778581622717375,
1291
+ "learning_rate": 5.075735642696611e-07,
1292
+ "loss": 3.1695,
1293
+ "mean_token_accuracy": 0.42243402600288393,
1294
+ "step": 545
1295
+ },
1296
+ {
1297
+ "epoch": 45.833333333333336,
1298
+ "grad_norm": 0.6459085305134198,
1299
+ "learning_rate": 4.2010487684511105e-07,
1300
+ "loss": 3.1143,
1301
+ "mean_token_accuracy": 0.4312965631484985,
1302
+ "step": 550
1303
+ },
1304
+ {
1305
+ "epoch": 46.0,
1306
+ "eval_loss": 3.267563819885254,
1307
+ "eval_mean_token_accuracy": 0.42787768244743346,
1308
+ "eval_runtime": 0.4291,
1309
+ "eval_samples_per_second": 44.276,
1310
+ "eval_steps_per_second": 6.991,
1311
+ "step": 552
1312
+ },
1313
+ {
1314
+ "epoch": 46.25,
1315
+ "grad_norm": 0.6444220526276241,
1316
+ "learning_rate": 3.4074173710931804e-07,
1317
+ "loss": 3.1236,
1318
+ "mean_token_accuracy": 0.4287661810715993,
1319
+ "step": 555
1320
+ },
1321
+ {
1322
+ "epoch": 46.666666666666664,
1323
+ "grad_norm": 0.610353826593815,
1324
+ "learning_rate": 2.6955129420176193e-07,
1325
+ "loss": 3.1322,
1326
+ "mean_token_accuracy": 0.42734603881835936,
1327
+ "step": 560
1328
+ },
1329
+ {
1330
+ "epoch": 47.0,
1331
+ "eval_loss": 3.2675440311431885,
1332
+ "eval_mean_token_accuracy": 0.4225522152015141,
1333
+ "eval_runtime": 0.4219,
1334
+ "eval_samples_per_second": 45.036,
1335
+ "eval_steps_per_second": 7.111,
1336
+ "step": 564
1337
+ },
1338
+ {
1339
+ "epoch": 47.083333333333336,
1340
+ "grad_norm": 0.5779765132480464,
1341
+ "learning_rate": 2.0659378234448524e-07,
1342
+ "loss": 3.2209,
1343
+ "mean_token_accuracy": 0.41336753964424133,
1344
+ "step": 565
1345
+ },
1346
+ {
1347
+ "epoch": 47.5,
1348
+ "grad_norm": 0.6848925907095541,
1349
+ "learning_rate": 1.519224698779198e-07,
1350
+ "loss": 3.142,
1351
+ "mean_token_accuracy": 0.42521933317184446,
1352
+ "step": 570
1353
+ },
1354
+ {
1355
+ "epoch": 47.916666666666664,
1356
+ "grad_norm": 0.6437140874495073,
1357
+ "learning_rate": 1.055836141905553e-07,
1358
+ "loss": 3.1318,
1359
+ "mean_token_accuracy": 0.42976540327072144,
1360
+ "step": 575
1361
+ },
1362
+ {
1363
+ "epoch": 48.0,
1364
+ "eval_loss": 3.267465114593506,
1365
+ "eval_mean_token_accuracy": 0.42656926065683365,
1366
+ "eval_runtime": 0.4269,
1367
+ "eval_samples_per_second": 44.507,
1368
+ "eval_steps_per_second": 7.027,
1369
+ "step": 576
1370
+ },
1371
+ {
1372
+ "epoch": 48.333333333333336,
1373
+ "grad_norm": 0.7317817126043084,
1374
+ "learning_rate": 6.761642258056977e-08,
1375
+ "loss": 3.1684,
1376
+ "mean_token_accuracy": 0.43004219233989716,
1377
+ "step": 580
1378
+ },
1379
+ {
1380
+ "epoch": 48.75,
1381
+ "grad_norm": 0.7021129395579033,
1382
+ "learning_rate": 3.805301908254455e-08,
1383
+ "loss": 3.1435,
1384
+ "mean_token_accuracy": 0.4232893466949463,
1385
+ "step": 585
1386
+ },
1387
+ {
1388
+ "epoch": 49.0,
1389
+ "eval_loss": 3.267409563064575,
1390
+ "eval_mean_token_accuracy": 0.42759764691193897,
1391
+ "eval_runtime": 0.4339,
1392
+ "eval_samples_per_second": 43.788,
1393
+ "eval_steps_per_second": 6.914,
1394
+ "step": 588
1395
+ },
1396
+ {
1397
+ "epoch": 49.166666666666664,
1398
+ "grad_norm": 0.6452805374112162,
1399
+ "learning_rate": 1.6918417287318245e-08,
1400
+ "loss": 3.1535,
1401
+ "mean_token_accuracy": 0.43218475580215454,
1402
+ "step": 590
1403
+ },
1404
+ {
1405
+ "epoch": 49.583333333333336,
1406
+ "grad_norm": 0.6574552722398359,
1407
+ "learning_rate": 4.230499177994007e-09,
1408
+ "loss": 3.1514,
1409
+ "mean_token_accuracy": 0.42807917594909667,
1410
+ "step": 595
1411
+ },
1412
+ {
1413
+ "epoch": 50.0,
1414
+ "grad_norm": 1.3863477885366933,
1415
+ "learning_rate": 0.0,
1416
+ "loss": 3.1944,
1417
+ "mean_token_accuracy": 0.4211083292961121,
1418
+ "step": 600
1419
+ },
1420
+ {
1421
+ "epoch": 50.0,
1422
+ "eval_loss": 3.2674479484558105,
1423
+ "eval_mean_token_accuracy": 0.42677465081214905,
1424
+ "eval_runtime": 0.4251,
1425
+ "eval_samples_per_second": 44.694,
1426
+ "eval_steps_per_second": 7.057,
1427
+ "step": 600
1428
+ },
1429
+ {
1430
+ "epoch": 50.0,
1431
+ "step": 600,
1432
+ "total_flos": 184078172160000.0,
1433
+ "train_loss": 3.2382628750801086,
1434
+ "train_runtime": 305.3611,
1435
+ "train_samples_per_second": 14.737,
1436
+ "train_steps_per_second": 1.965
1437
  }
1438
  ],
1439
  "logging_steps": 5,
1440
+ "max_steps": 600,
1441
  "num_input_tokens_seen": 0,
1442
+ "num_train_epochs": 50,
1443
  "save_steps": 500,
1444
  "stateful_callbacks": {
1445
  "TrainerControl": {
 
1453
  "attributes": {}
1454
  }
1455
  },
1456
+ "total_flos": 184078172160000.0,
1457
  "train_batch_size": 8,
1458
  "trial_name": null,
1459
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cc6572bf040f8f4b9d9b41da1a82e4a0cef80f4d2912446b1ea044029db09e64
3
  size 7544
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9c239b6680cc72b75187bd68708a7121b3a7a5ffa31e0eeeaf4c66e0742042b
3
  size 7544