kanishka committed on
Commit
da74125
·
verified ·
1 Parent(s): 3bf5f22

End of training

Browse files
Files changed (5) hide show
  1. README.md +14 -2
  2. all_results.json +14 -14
  3. eval_results.json +8 -8
  4. train_results.json +7 -7
  5. trainer_state.json +313 -313
README.md CHANGED
@@ -2,11 +2,23 @@
2
  library_name: transformers
3
  tags:
4
  - generated_from_trainer
 
 
5
  metrics:
6
  - accuracy
7
  model-index:
8
  - name: opt-babylm2-rewritten-clean-spacy-earlystop-bpe_seed-42_1e-3
9
- results: []
 
 
 
 
 
 
 
 
 
 
10
  ---
11
 
12
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -14,7 +26,7 @@ should probably proofread and complete it, then remove this comment. -->
14
 
15
  # opt-babylm2-rewritten-clean-spacy-earlystop-bpe_seed-42_1e-3
16
 
17
- This model was trained from scratch on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
  - Loss: 2.6820
20
  - Accuracy: 0.4788
 
2
  library_name: transformers
3
  tags:
4
  - generated_from_trainer
5
+ datasets:
6
+ - kanishka/babylm2-rewritten-clean-spacy
7
  metrics:
8
  - accuracy
9
  model-index:
10
  - name: opt-babylm2-rewritten-clean-spacy-earlystop-bpe_seed-42_1e-3
11
+ results:
12
+ - task:
13
+ name: Causal Language Modeling
14
+ type: text-generation
15
+ dataset:
16
+ name: kanishka/babylm2-rewritten-clean-spacy
17
+ type: kanishka/babylm2-rewritten-clean-spacy
18
+ metrics:
19
+ - name: Accuracy
20
+ type: accuracy
21
+ value: 0.47877642614021604
22
  ---
23
 
24
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
26
 
27
  # opt-babylm2-rewritten-clean-spacy-earlystop-bpe_seed-42_1e-3
28
 
29
+ This model was trained from scratch on the kanishka/babylm2-rewritten-clean-spacy dataset.
30
  It achieves the following results on the evaluation set:
31
  - Loss: 2.6820
32
  - Accuracy: 0.4788
all_results.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
- "epoch": 19.998892396300604,
3
- "eval_accuracy": 0.47865689612852264,
4
- "eval_loss": 2.6880221366882324,
5
- "eval_runtime": 120.6441,
6
- "eval_samples": 60791,
7
- "eval_samples_per_second": 503.887,
8
- "eval_steps_per_second": 7.874,
9
- "perplexity": 14.702567474871172,
10
- "total_flos": 1.50966240067584e+18,
11
- "train_loss": 2.8030111154835997,
12
- "train_runtime": 43302.4979,
13
- "train_samples": 577799,
14
- "train_samples_per_second": 266.866,
15
- "train_steps_per_second": 1.042
16
  }
 
1
  {
2
+ "epoch": 20.0,
3
+ "eval_accuracy": 0.47877642614021604,
4
+ "eval_loss": 2.6820449829101562,
5
+ "eval_runtime": 71.9233,
6
+ "eval_samples": 60701,
7
+ "eval_samples_per_second": 843.969,
8
+ "eval_steps_per_second": 13.195,
9
+ "perplexity": 14.614950080315884,
10
+ "total_flos": 1.50902942072832e+18,
11
+ "train_loss": 2.805498681169875,
12
+ "train_runtime": 30524.4807,
13
+ "train_samples": 577526,
14
+ "train_samples_per_second": 378.402,
15
+ "train_steps_per_second": 1.478
16
  }
eval_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "epoch": 19.998892396300604,
3
- "eval_accuracy": 0.47865689612852264,
4
- "eval_loss": 2.6880221366882324,
5
- "eval_runtime": 120.6441,
6
- "eval_samples": 60791,
7
- "eval_samples_per_second": 503.887,
8
- "eval_steps_per_second": 7.874,
9
- "perplexity": 14.702567474871172
10
  }
 
1
  {
2
+ "epoch": 20.0,
3
+ "eval_accuracy": 0.47877642614021604,
4
+ "eval_loss": 2.6820449829101562,
5
+ "eval_runtime": 71.9233,
6
+ "eval_samples": 60701,
7
+ "eval_samples_per_second": 843.969,
8
+ "eval_steps_per_second": 13.195,
9
+ "perplexity": 14.614950080315884
10
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 19.998892396300604,
3
- "total_flos": 1.50966240067584e+18,
4
- "train_loss": 2.8030111154835997,
5
- "train_runtime": 43302.4979,
6
- "train_samples": 577799,
7
- "train_samples_per_second": 266.866,
8
- "train_steps_per_second": 1.042
9
  }
 
1
  {
2
+ "epoch": 20.0,
3
+ "total_flos": 1.50902942072832e+18,
4
+ "train_loss": 2.805498681169875,
5
+ "train_runtime": 30524.4807,
6
+ "train_samples": 577526,
7
+ "train_samples_per_second": 378.402,
8
+ "train_steps_per_second": 1.478
9
  }
trainer_state.json CHANGED
@@ -1,520 +1,520 @@
1
  {
2
- "best_metric": 2.6880221366882324,
3
- "best_model_checkpoint": "models/opt-babylm2-rewritten-clean-spacy-earlystop-bpe_seed-42_1e-3/checkpoint-45140",
4
- "epoch": 19.998892396300604,
5
  "eval_steps": 500,
6
- "global_step": 45140,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.4430414797585424,
13
- "grad_norm": 0.4268507957458496,
14
  "learning_rate": 3.125e-05,
15
- "loss": 5.5906,
16
  "step": 1000
17
  },
18
  {
19
- "epoch": 0.8860829595170848,
20
- "grad_norm": 0.5278797149658203,
21
  "learning_rate": 6.25e-05,
22
- "loss": 4.0959,
23
  "step": 2000
24
  },
25
  {
26
- "epoch": 0.9999446198150301,
27
- "eval_accuracy": 0.3612965154478169,
28
- "eval_loss": 3.8126308917999268,
29
- "eval_runtime": 121.9395,
30
- "eval_samples_per_second": 498.534,
31
- "eval_steps_per_second": 7.791,
32
- "step": 2257
33
  },
34
  {
35
- "epoch": 1.3291244392756272,
36
- "grad_norm": 0.5504615306854248,
37
  "learning_rate": 9.375e-05,
38
- "loss": 3.6988,
39
  "step": 3000
40
  },
41
  {
42
- "epoch": 1.7721659190341694,
43
- "grad_norm": 0.5281194448471069,
44
  "learning_rate": 0.000125,
45
- "loss": 3.4463,
46
  "step": 4000
47
  },
48
  {
49
- "epoch": 1.9998892396300603,
50
- "eval_accuracy": 0.4098802035001956,
51
- "eval_loss": 3.297177791595459,
52
- "eval_runtime": 122.3693,
53
- "eval_samples_per_second": 496.783,
54
- "eval_steps_per_second": 7.763,
55
- "step": 4514
56
  },
57
  {
58
- "epoch": 2.215207398792712,
59
- "grad_norm": 0.491211473941803,
60
  "learning_rate": 0.00015625,
61
- "loss": 3.2482,
62
  "step": 5000
63
  },
64
  {
65
- "epoch": 2.6582488785512544,
66
- "grad_norm": 0.5076552033424377,
67
  "learning_rate": 0.0001875,
68
- "loss": 3.1228,
69
  "step": 6000
70
  },
71
  {
72
- "epoch": 2.9998338594450904,
73
- "eval_accuracy": 0.4315480135894729,
74
- "eval_loss": 3.085052013397217,
75
- "eval_runtime": 122.1579,
76
- "eval_samples_per_second": 497.643,
77
- "eval_steps_per_second": 7.777,
78
- "step": 6771
79
  },
80
  {
81
- "epoch": 3.1012903583097966,
82
- "grad_norm": 0.4416440725326538,
83
  "learning_rate": 0.00021875,
84
- "loss": 3.0284,
85
  "step": 7000
86
  },
87
  {
88
- "epoch": 3.5443318380683393,
89
- "grad_norm": 0.37537524104118347,
90
  "learning_rate": 0.00025,
91
- "loss": 2.9593,
92
  "step": 8000
93
  },
94
  {
95
- "epoch": 3.9873733178268815,
96
- "grad_norm": 0.39330974221229553,
97
  "learning_rate": 0.00028125000000000003,
98
- "loss": 2.9166,
99
  "step": 9000
100
  },
101
  {
102
- "epoch": 3.9997784792601205,
103
- "eval_accuracy": 0.44179488643346004,
104
- "eval_loss": 2.980708599090576,
105
- "eval_runtime": 122.9667,
106
- "eval_samples_per_second": 494.37,
107
- "eval_steps_per_second": 7.726,
108
- "step": 9028
109
  },
110
  {
111
- "epoch": 4.430414797585424,
112
- "grad_norm": 0.370112806558609,
113
  "learning_rate": 0.0003125,
114
- "loss": 2.8563,
115
  "step": 10000
116
  },
117
  {
118
- "epoch": 4.8734562773439665,
119
- "grad_norm": 0.3323429524898529,
120
  "learning_rate": 0.00034375,
121
- "loss": 2.8402,
122
  "step": 11000
123
  },
124
  {
125
- "epoch": 4.999723099075151,
126
- "eval_accuracy": 0.44756373573100505,
127
- "eval_loss": 2.924882411956787,
128
- "eval_runtime": 122.7484,
129
- "eval_samples_per_second": 495.249,
130
- "eval_steps_per_second": 7.739,
131
- "step": 11285
132
  },
133
  {
134
- "epoch": 5.316497757102509,
135
- "grad_norm": 0.31359294056892395,
136
- "learning_rate": 0.00037496875000000003,
137
- "loss": 2.7944,
138
  "step": 12000
139
  },
140
  {
141
- "epoch": 5.759539236861051,
142
- "grad_norm": 0.29497697949409485,
143
- "learning_rate": 0.00040621875,
144
- "loss": 2.7832,
145
  "step": 13000
146
  },
147
  {
148
- "epoch": 5.999667718890181,
149
- "eval_accuracy": 0.45211594466544164,
150
- "eval_loss": 2.8850650787353516,
151
- "eval_runtime": 122.4548,
152
- "eval_samples_per_second": 496.436,
153
- "eval_steps_per_second": 7.758,
154
- "step": 13542
155
  },
156
  {
157
- "epoch": 6.202580716619593,
158
- "grad_norm": 0.30153268575668335,
159
- "learning_rate": 0.00043746875000000003,
160
- "loss": 2.754,
161
  "step": 14000
162
  },
163
  {
164
- "epoch": 6.645622196378136,
165
- "grad_norm": 0.29313963651657104,
166
  "learning_rate": 0.00046871875,
167
- "loss": 2.7377,
168
  "step": 15000
169
  },
170
  {
171
- "epoch": 6.999612338705211,
172
- "eval_accuracy": 0.4545553537497972,
173
- "eval_loss": 2.860243558883667,
174
- "eval_runtime": 122.3572,
175
- "eval_samples_per_second": 496.832,
176
- "eval_steps_per_second": 7.764,
177
- "step": 15799
178
  },
179
  {
180
- "epoch": 7.088663676136679,
181
- "grad_norm": 0.2767232358455658,
182
  "learning_rate": 0.00049996875,
183
- "loss": 2.7295,
184
  "step": 16000
185
  },
186
  {
187
- "epoch": 7.531705155895221,
188
- "grad_norm": 0.2507473826408386,
189
  "learning_rate": 0.0005311875000000001,
190
- "loss": 2.7014,
191
  "step": 17000
192
  },
193
  {
194
- "epoch": 7.974746635653763,
195
- "grad_norm": 0.2542831301689148,
196
- "learning_rate": 0.00056240625,
197
- "loss": 2.7101,
198
  "step": 18000
199
  },
200
  {
201
  "epoch": 8.0,
202
- "eval_accuracy": 0.4571935151649448,
203
- "eval_loss": 2.8389248847961426,
204
- "eval_runtime": 121.9399,
205
- "eval_samples_per_second": 498.532,
206
- "eval_steps_per_second": 7.791,
207
- "step": 18057
208
  },
209
  {
210
- "epoch": 8.417788115412305,
211
- "grad_norm": 0.2478516399860382,
212
  "learning_rate": 0.00059365625,
213
- "loss": 2.673,
214
  "step": 19000
215
  },
216
  {
217
- "epoch": 8.860829595170848,
218
- "grad_norm": 0.22448837757110596,
219
  "learning_rate": 0.00062490625,
220
- "loss": 2.684,
221
  "step": 20000
222
  },
223
  {
224
- "epoch": 8.99994461981503,
225
- "eval_accuracy": 0.4586443233179834,
226
- "eval_loss": 2.8260207176208496,
227
- "eval_runtime": 122.3992,
228
- "eval_samples_per_second": 496.662,
229
- "eval_steps_per_second": 7.761,
230
- "step": 20314
231
  },
232
  {
233
- "epoch": 9.30387107492939,
234
- "grad_norm": 0.22841599583625793,
235
- "learning_rate": 0.0006561562500000001,
236
  "loss": 2.6598,
237
  "step": 21000
238
  },
239
  {
240
- "epoch": 9.746912554687933,
241
- "grad_norm": 0.21708275377750397,
242
  "learning_rate": 0.0006873749999999999,
243
- "loss": 2.6654,
244
  "step": 22000
245
  },
246
  {
247
- "epoch": 9.999889239630061,
248
- "eval_accuracy": 0.45958170407706767,
249
- "eval_loss": 2.815507411956787,
250
- "eval_runtime": 120.8572,
251
- "eval_samples_per_second": 502.998,
252
- "eval_steps_per_second": 7.861,
253
- "step": 22571
254
  },
255
  {
256
- "epoch": 10.189954034446474,
257
- "grad_norm": 0.2034858614206314,
258
  "learning_rate": 0.000718625,
259
- "loss": 2.6505,
260
  "step": 23000
261
  },
262
  {
263
- "epoch": 10.632995514205017,
264
- "grad_norm": 0.20455513894557953,
265
- "learning_rate": 0.000749875,
266
- "loss": 2.6466,
267
  "step": 24000
268
  },
269
  {
270
- "epoch": 10.99983385944509,
271
- "eval_accuracy": 0.4604201924885037,
272
- "eval_loss": 2.807711601257324,
273
- "eval_runtime": 121.2532,
274
- "eval_samples_per_second": 501.356,
275
- "eval_steps_per_second": 7.835,
276
- "step": 24828
277
  },
278
  {
279
- "epoch": 11.07603699396356,
280
- "grad_norm": 0.20884878933429718,
281
- "learning_rate": 0.000781125,
282
- "loss": 2.6497,
283
  "step": 25000
284
  },
285
  {
286
- "epoch": 11.519078473722102,
287
- "grad_norm": 0.2016923576593399,
288
- "learning_rate": 0.00081234375,
289
- "loss": 2.6304,
290
  "step": 26000
291
  },
292
  {
293
- "epoch": 11.962119953480645,
294
- "grad_norm": 0.1992848664522171,
295
- "learning_rate": 0.00084359375,
296
- "loss": 2.6474,
297
  "step": 27000
298
  },
299
  {
300
- "epoch": 11.99977847926012,
301
- "eval_accuracy": 0.4614518854538904,
302
- "eval_loss": 2.802515983581543,
303
- "eval_runtime": 121.3375,
304
- "eval_samples_per_second": 501.007,
305
- "eval_steps_per_second": 7.829,
306
- "step": 27085
307
  },
308
  {
309
- "epoch": 12.405161433239186,
310
- "grad_norm": 0.19233956933021545,
311
- "learning_rate": 0.0008748125,
312
- "loss": 2.6163,
313
  "step": 28000
314
  },
315
  {
316
- "epoch": 12.84820291299773,
317
- "grad_norm": 0.18767118453979492,
318
  "learning_rate": 0.0009060312499999999,
319
- "loss": 2.6366,
320
  "step": 29000
321
  },
322
  {
323
- "epoch": 12.999723099075151,
324
- "eval_accuracy": 0.46189461094763445,
325
- "eval_loss": 2.7982876300811768,
326
- "eval_runtime": 121.2511,
327
- "eval_samples_per_second": 501.364,
328
- "eval_steps_per_second": 7.835,
329
- "step": 29342
330
  },
331
  {
332
- "epoch": 13.291244392756273,
333
- "grad_norm": 0.188876211643219,
334
- "learning_rate": 0.00093728125,
335
- "loss": 2.6143,
336
  "step": 30000
337
  },
338
  {
339
- "epoch": 13.734285872514814,
340
- "grad_norm": 0.17773953080177307,
341
- "learning_rate": 0.00096853125,
342
- "loss": 2.625,
343
  "step": 31000
344
  },
345
  {
346
- "epoch": 13.999667718890182,
347
- "eval_accuracy": 0.46255660264467685,
348
- "eval_loss": 2.792785406112671,
349
- "eval_runtime": 120.7502,
350
- "eval_samples_per_second": 503.444,
351
- "eval_steps_per_second": 7.867,
352
- "step": 31599
353
  },
354
  {
355
- "epoch": 14.177327352273357,
356
- "grad_norm": 0.19869054853916168,
357
- "learning_rate": 0.00099978125,
358
- "loss": 2.6187,
359
  "step": 32000
360
  },
361
  {
362
- "epoch": 14.620368832031899,
363
- "grad_norm": 0.17133094370365143,
364
- "learning_rate": 0.0009245053272450533,
365
- "loss": 2.6109,
366
  "step": 33000
367
  },
368
  {
369
- "epoch": 14.99961233870521,
370
- "eval_accuracy": 0.46537558287943165,
371
- "eval_loss": 2.768995523452759,
372
- "eval_runtime": 120.8755,
373
- "eval_samples_per_second": 502.922,
374
- "eval_steps_per_second": 7.859,
375
- "step": 33856
376
  },
377
  {
378
- "epoch": 15.063410311790442,
379
- "grad_norm": 0.1772530972957611,
380
- "learning_rate": 0.0008484018264840183,
381
- "loss": 2.5987,
382
  "step": 34000
383
  },
384
  {
385
- "epoch": 15.506451791548983,
386
- "grad_norm": 0.17091083526611328,
387
- "learning_rate": 0.0007723744292237444,
388
- "loss": 2.5615,
389
  "step": 35000
390
  },
391
  {
392
- "epoch": 15.949493271307526,
393
- "grad_norm": 0.17316913604736328,
394
- "learning_rate": 0.0006962709284627093,
395
- "loss": 2.5658,
396
  "step": 36000
397
  },
398
  {
399
  "epoch": 16.0,
400
- "eval_accuracy": 0.4686073564166006,
401
- "eval_loss": 2.744506359100342,
402
- "eval_runtime": 120.8269,
403
- "eval_samples_per_second": 503.125,
404
- "eval_steps_per_second": 7.862,
405
- "step": 36114
406
- },
407
- {
408
- "epoch": 16.39253475106607,
409
- "grad_norm": 0.19304046034812927,
410
- "learning_rate": 0.0006201674277016743,
411
- "loss": 2.5123,
412
  "step": 37000
413
  },
414
  {
415
- "epoch": 16.83557623082461,
416
- "grad_norm": 0.17993681132793427,
417
- "learning_rate": 0.0005440639269406394,
418
- "loss": 2.5185,
419
  "step": 38000
420
  },
421
  {
422
- "epoch": 16.99994461981503,
423
- "eval_accuracy": 0.47166972923301015,
424
- "eval_loss": 2.72279953956604,
425
- "eval_runtime": 121.0608,
426
- "eval_samples_per_second": 502.152,
427
- "eval_steps_per_second": 7.847,
428
- "step": 38371
429
  },
430
  {
431
- "epoch": 17.278617710583152,
432
- "grad_norm": 0.18638047575950623,
433
- "learning_rate": 0.0004680365296803653,
434
- "loss": 2.4713,
435
  "step": 39000
436
  },
437
  {
438
- "epoch": 17.721659190341697,
439
- "grad_norm": 0.19553589820861816,
440
- "learning_rate": 0.0003919330289193303,
441
- "loss": 2.4637,
442
  "step": 40000
443
  },
444
  {
445
- "epoch": 17.99988923963006,
446
- "eval_accuracy": 0.4746745599919493,
447
- "eval_loss": 2.7043166160583496,
448
- "eval_runtime": 121.0615,
449
- "eval_samples_per_second": 502.15,
450
- "eval_steps_per_second": 7.847,
451
- "step": 40628
452
  },
453
  {
454
- "epoch": 18.16470067010024,
455
- "grad_norm": 0.1946045607328415,
456
- "learning_rate": 0.00031590563165905634,
457
- "loss": 2.4315,
458
  "step": 41000
459
  },
460
  {
461
- "epoch": 18.60774214985878,
462
- "grad_norm": 0.1942271739244461,
463
- "learning_rate": 0.0002398021308980213,
464
- "loss": 2.3969,
465
  "step": 42000
466
  },
467
  {
468
- "epoch": 18.99983385944509,
469
- "eval_accuracy": 0.4773752951691443,
470
- "eval_loss": 2.689497947692871,
471
- "eval_runtime": 121.1389,
472
- "eval_samples_per_second": 501.829,
473
- "eval_steps_per_second": 7.842,
474
- "step": 42885
475
  },
476
  {
477
- "epoch": 19.050783629617325,
478
- "grad_norm": 0.1933259516954422,
479
- "learning_rate": 0.0001636986301369863,
480
- "loss": 2.3882,
481
  "step": 43000
482
  },
483
  {
484
- "epoch": 19.493825109375866,
485
- "grad_norm": 0.1976754367351532,
486
- "learning_rate": 8.75951293759513e-05,
487
- "loss": 2.3278,
488
  "step": 44000
489
  },
490
  {
491
- "epoch": 19.936866589134407,
492
- "grad_norm": 0.1973201334476471,
493
- "learning_rate": 1.1567732115677321e-05,
494
- "loss": 2.3245,
495
  "step": 45000
496
  },
497
  {
498
- "epoch": 19.998892396300604,
499
- "eval_accuracy": 0.47865689612852264,
500
- "eval_loss": 2.6880221366882324,
501
- "eval_runtime": 121.1537,
502
- "eval_samples_per_second": 501.768,
503
- "eval_steps_per_second": 7.841,
504
- "step": 45140
505
  },
506
  {
507
- "epoch": 19.998892396300604,
508
- "step": 45140,
509
- "total_flos": 1.50966240067584e+18,
510
- "train_loss": 2.8030111154835997,
511
- "train_runtime": 43302.4979,
512
- "train_samples_per_second": 266.866,
513
- "train_steps_per_second": 1.042
514
  }
515
  ],
516
  "logging_steps": 1000,
517
- "max_steps": 45140,
518
  "num_input_tokens_seen": 0,
519
  "num_train_epochs": 20,
520
  "save_steps": 500,
@@ -539,7 +539,7 @@
539
  "attributes": {}
540
  }
541
  },
542
- "total_flos": 1.50966240067584e+18,
543
  "train_batch_size": 32,
544
  "trial_name": null,
545
  "trial_params": null
 
1
  {
2
+ "best_metric": 2.6820449829101562,
3
+ "best_model_checkpoint": "models/opt-babylm2-rewritten-clean-spacy-earlystop-bpe_seed-42_1e-3/checkpoint-45120",
4
+ "epoch": 20.0,
5
  "eval_steps": 500,
6
+ "global_step": 45120,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.4432624113475177,
13
+ "grad_norm": 0.469483345746994,
14
  "learning_rate": 3.125e-05,
15
+ "loss": 5.5946,
16
  "step": 1000
17
  },
18
  {
19
+ "epoch": 0.8865248226950354,
20
+ "grad_norm": 0.6834925413131714,
21
  "learning_rate": 6.25e-05,
22
+ "loss": 4.101,
23
  "step": 2000
24
  },
25
  {
26
+ "epoch": 1.0,
27
+ "eval_accuracy": 0.36108013855119486,
28
+ "eval_loss": 3.813789129257202,
29
+ "eval_runtime": 74.5004,
30
+ "eval_samples_per_second": 814.774,
31
+ "eval_steps_per_second": 12.738,
32
+ "step": 2256
33
  },
34
  {
35
+ "epoch": 1.3297872340425532,
36
+ "grad_norm": 0.5719049572944641,
37
  "learning_rate": 9.375e-05,
38
+ "loss": 3.7004,
39
  "step": 3000
40
  },
41
  {
42
+ "epoch": 1.773049645390071,
43
+ "grad_norm": 0.5357337594032288,
44
  "learning_rate": 0.000125,
45
+ "loss": 3.445,
46
  "step": 4000
47
  },
48
  {
49
+ "epoch": 2.0,
50
+ "eval_accuracy": 0.4097865752122829,
51
+ "eval_loss": 3.29646635055542,
52
+ "eval_runtime": 74.2471,
53
+ "eval_samples_per_second": 817.554,
54
+ "eval_steps_per_second": 12.782,
55
+ "step": 4512
56
  },
57
  {
58
+ "epoch": 2.2163120567375887,
59
+ "grad_norm": 0.5515570640563965,
60
  "learning_rate": 0.00015625,
61
+ "loss": 3.2498,
62
  "step": 5000
63
  },
64
  {
65
+ "epoch": 2.6595744680851063,
66
+ "grad_norm": 0.5060694813728333,
67
  "learning_rate": 0.0001875,
68
+ "loss": 3.1279,
69
  "step": 6000
70
  },
71
  {
72
+ "epoch": 3.0,
73
+ "eval_accuracy": 0.43082928827286177,
74
+ "eval_loss": 3.0860400199890137,
75
+ "eval_runtime": 74.3001,
76
+ "eval_samples_per_second": 816.97,
77
+ "eval_steps_per_second": 12.773,
78
+ "step": 6768
79
  },
80
  {
81
+ "epoch": 3.102836879432624,
82
+ "grad_norm": 0.4440550208091736,
83
  "learning_rate": 0.00021875,
84
+ "loss": 3.0384,
85
  "step": 7000
86
  },
87
  {
88
+ "epoch": 3.546099290780142,
89
+ "grad_norm": 0.4073255956172943,
90
  "learning_rate": 0.00025,
91
+ "loss": 2.9651,
92
  "step": 8000
93
  },
94
  {
95
+ "epoch": 3.9893617021276597,
96
+ "grad_norm": 0.3835934102535248,
97
  "learning_rate": 0.00028125000000000003,
98
+ "loss": 2.9218,
99
  "step": 9000
100
  },
101
  {
102
+ "epoch": 4.0,
103
+ "eval_accuracy": 0.4411697193992669,
104
+ "eval_loss": 2.9823455810546875,
105
+ "eval_runtime": 73.4266,
106
+ "eval_samples_per_second": 826.69,
107
+ "eval_steps_per_second": 12.924,
108
+ "step": 9024
109
  },
110
  {
111
+ "epoch": 4.432624113475177,
112
+ "grad_norm": 0.35862067341804504,
113
  "learning_rate": 0.0003125,
114
+ "loss": 2.8625,
115
  "step": 10000
116
  },
117
  {
118
+ "epoch": 4.875886524822695,
119
+ "grad_norm": 0.33855435252189636,
120
  "learning_rate": 0.00034375,
121
+ "loss": 2.8441,
122
  "step": 11000
123
  },
124
  {
125
+ "epoch": 5.0,
126
+ "eval_accuracy": 0.44725845198790215,
127
+ "eval_loss": 2.9202077388763428,
128
+ "eval_runtime": 73.099,
129
+ "eval_samples_per_second": 830.394,
130
+ "eval_steps_per_second": 12.982,
131
+ "step": 11280
132
  },
133
  {
134
+ "epoch": 5.319148936170213,
135
+ "grad_norm": 0.3236384987831116,
136
+ "learning_rate": 0.000375,
137
+ "loss": 2.7995,
138
  "step": 12000
139
  },
140
  {
141
+ "epoch": 5.76241134751773,
142
+ "grad_norm": 0.3051661252975464,
143
+ "learning_rate": 0.00040625000000000004,
144
+ "loss": 2.7865,
145
  "step": 13000
146
  },
147
  {
148
+ "epoch": 6.0,
149
+ "eval_accuracy": 0.4512637482794966,
150
+ "eval_loss": 2.8827643394470215,
151
+ "eval_runtime": 72.7776,
152
+ "eval_samples_per_second": 834.061,
153
+ "eval_steps_per_second": 13.04,
154
+ "step": 13536
155
  },
156
  {
157
+ "epoch": 6.205673758865248,
158
+ "grad_norm": 0.29630789160728455,
159
+ "learning_rate": 0.0004375,
160
+ "loss": 2.759,
161
  "step": 14000
162
  },
163
  {
164
+ "epoch": 6.648936170212766,
165
+ "grad_norm": 0.27569055557250977,
166
  "learning_rate": 0.00046871875,
167
+ "loss": 2.7432,
168
  "step": 15000
169
  },
170
  {
171
+ "epoch": 7.0,
172
+ "eval_accuracy": 0.4541487994350967,
173
+ "eval_loss": 2.8589529991149902,
174
+ "eval_runtime": 72.8865,
175
+ "eval_samples_per_second": 832.815,
176
+ "eval_steps_per_second": 13.02,
177
+ "step": 15792
178
  },
179
  {
180
+ "epoch": 7.092198581560283,
181
+ "grad_norm": 0.2677430808544159,
182
  "learning_rate": 0.00049996875,
183
+ "loss": 2.7319,
184
  "step": 16000
185
  },
186
  {
187
+ "epoch": 7.535460992907802,
188
+ "grad_norm": 0.2510625123977661,
189
  "learning_rate": 0.0005311875000000001,
190
+ "loss": 2.7061,
191
  "step": 17000
192
  },
193
  {
194
+ "epoch": 7.9787234042553195,
195
+ "grad_norm": 0.23760418593883514,
196
+ "learning_rate": 0.0005624375,
197
+ "loss": 2.7146,
198
  "step": 18000
199
  },
200
  {
201
  "epoch": 8.0,
202
+ "eval_accuracy": 0.4565537086154539,
203
+ "eval_loss": 2.8377726078033447,
204
+ "eval_runtime": 72.7887,
205
+ "eval_samples_per_second": 833.934,
206
+ "eval_steps_per_second": 13.038,
207
+ "step": 18048
208
  },
209
  {
210
+ "epoch": 8.421985815602836,
211
+ "grad_norm": 0.23823712766170502,
212
  "learning_rate": 0.00059365625,
213
+ "loss": 2.675,
214
  "step": 19000
215
  },
216
  {
217
+ "epoch": 8.865248226950355,
218
+ "grad_norm": 0.23026619851589203,
219
  "learning_rate": 0.00062490625,
220
+ "loss": 2.6906,
221
  "step": 20000
222
  },
223
  {
224
+ "epoch": 9.0,
225
+ "eval_accuracy": 0.45826760614791046,
226
+ "eval_loss": 2.8225581645965576,
227
+ "eval_runtime": 72.5363,
228
+ "eval_samples_per_second": 836.836,
229
+ "eval_steps_per_second": 13.083,
230
+ "step": 20304
231
  },
232
  {
233
+ "epoch": 9.308510638297872,
234
+ "grad_norm": 0.22488652169704437,
235
+ "learning_rate": 0.000656125,
236
  "loss": 2.6598,
237
  "step": 21000
238
  },
239
  {
240
+ "epoch": 9.75177304964539,
241
+ "grad_norm": 0.20617271959781647,
242
  "learning_rate": 0.0006873749999999999,
243
+ "loss": 2.6681,
244
  "step": 22000
245
  },
246
  {
247
+ "epoch": 10.0,
248
+ "eval_accuracy": 0.459518546549771,
249
+ "eval_loss": 2.813441753387451,
250
+ "eval_runtime": 72.4442,
251
+ "eval_samples_per_second": 837.9,
252
+ "eval_steps_per_second": 13.1,
253
+ "step": 22560
254
  },
255
  {
256
+ "epoch": 10.195035460992909,
257
+ "grad_norm": 0.20597966015338898,
258
  "learning_rate": 0.000718625,
259
+ "loss": 2.6559,
260
  "step": 23000
261
  },
262
  {
263
+ "epoch": 10.638297872340425,
264
+ "grad_norm": 0.21323370933532715,
265
+ "learning_rate": 0.0007498437500000001,
266
+ "loss": 2.6498,
267
  "step": 24000
268
  },
269
  {
270
+ "epoch": 11.0,
271
+ "eval_accuracy": 0.4604723054276652,
272
+ "eval_loss": 2.8047826290130615,
273
+ "eval_runtime": 72.8612,
274
+ "eval_samples_per_second": 833.105,
275
+ "eval_steps_per_second": 13.025,
276
+ "step": 24816
277
  },
278
  {
279
+ "epoch": 11.081560283687944,
280
+ "grad_norm": 0.21533997356891632,
281
+ "learning_rate": 0.00078109375,
282
+ "loss": 2.6487,
283
  "step": 25000
284
  },
285
  {
286
+ "epoch": 11.52482269503546,
287
+ "grad_norm": 0.21542951464653015,
288
+ "learning_rate": 0.0008123125,
289
+ "loss": 2.6323,
290
  "step": 26000
291
  },
292
  {
293
+ "epoch": 11.96808510638298,
294
+ "grad_norm": 0.19053979218006134,
295
+ "learning_rate": 0.0008435625,
296
+ "loss": 2.6497,
297
  "step": 27000
298
  },
299
  {
300
+ "epoch": 12.0,
301
+ "eval_accuracy": 0.46156735473880167,
302
+ "eval_loss": 2.7969932556152344,
303
+ "eval_runtime": 72.5419,
304
+ "eval_samples_per_second": 836.771,
305
+ "eval_steps_per_second": 13.082,
306
+ "step": 27072
307
  },
308
  {
309
+ "epoch": 12.411347517730496,
310
+ "grad_norm": 0.18872858583927155,
311
+ "learning_rate": 0.00087478125,
312
+ "loss": 2.6167,
313
  "step": 28000
314
  },
315
  {
316
+ "epoch": 12.854609929078014,
317
+ "grad_norm": 0.1738893836736679,
318
  "learning_rate": 0.0009060312499999999,
319
+ "loss": 2.6375,
320
  "step": 29000
321
  },
322
  {
323
+ "epoch": 13.0,
324
+ "eval_accuracy": 0.4621078374843455,
325
+ "eval_loss": 2.7915232181549072,
326
+ "eval_runtime": 72.7216,
327
+ "eval_samples_per_second": 834.704,
328
+ "eval_steps_per_second": 13.05,
329
+ "step": 29328
330
  },
331
  {
332
+ "epoch": 13.297872340425531,
333
+ "grad_norm": 0.1877707690000534,
334
+ "learning_rate": 0.00093725,
335
+ "loss": 2.6153,
336
  "step": 30000
337
  },
338
  {
339
+ "epoch": 13.74113475177305,
340
+ "grad_norm": 0.186727836728096,
341
+ "learning_rate": 0.0009685000000000001,
342
+ "loss": 2.6278,
343
  "step": 31000
344
  },
345
  {
346
+ "epoch": 14.0,
347
+ "eval_accuracy": 0.462822946677559,
348
+ "eval_loss": 2.786189317703247,
349
+ "eval_runtime": 72.4588,
350
+ "eval_samples_per_second": 837.731,
351
+ "eval_steps_per_second": 13.097,
352
+ "step": 31584
353
  },
354
  {
355
+ "epoch": 14.184397163120567,
356
+ "grad_norm": 0.20270851254463196,
357
+ "learning_rate": 0.00099971875,
358
+ "loss": 2.6168,
359
  "step": 32000
360
  },
361
  {
362
+ "epoch": 14.627659574468085,
363
+ "grad_norm": 0.17972639203071594,
364
+ "learning_rate": 0.0009244664634146341,
365
+ "loss": 2.6102,
366
  "step": 33000
367
  },
368
  {
369
+ "epoch": 15.0,
370
+ "eval_accuracy": 0.46529976086578023,
371
+ "eval_loss": 2.763315200805664,
372
+ "eval_runtime": 72.4263,
373
+ "eval_samples_per_second": 838.107,
374
+ "eval_steps_per_second": 13.103,
375
+ "step": 33840
376
  },
377
  {
378
+ "epoch": 15.070921985815604,
379
+ "grad_norm": 0.17892582714557648,
380
+ "learning_rate": 0.0008483231707317073,
381
+ "loss": 2.6,
382
  "step": 34000
383
  },
384
  {
385
+ "epoch": 15.51418439716312,
386
+ "grad_norm": 0.16611941158771515,
387
+ "learning_rate": 0.0007721036585365854,
388
+ "loss": 2.5602,
389
  "step": 35000
390
  },
391
  {
392
+ "epoch": 15.957446808510639,
393
+ "grad_norm": 0.1676749587059021,
394
+ "learning_rate": 0.0006960365853658537,
395
+ "loss": 2.5668,
396
  "step": 36000
397
  },
398
  {
399
  "epoch": 16.0,
400
+ "eval_accuracy": 0.46856268478957125,
401
+ "eval_loss": 2.739426374435425,
402
+ "eval_runtime": 72.5666,
403
+ "eval_samples_per_second": 836.486,
404
+ "eval_steps_per_second": 13.078,
405
+ "step": 36096
406
+ },
407
+ {
408
+ "epoch": 16.400709219858157,
409
+ "grad_norm": 0.17558415234088898,
410
+ "learning_rate": 0.0006198170731707318,
411
+ "loss": 2.5115,
412
  "step": 37000
413
  },
414
  {
415
+ "epoch": 16.843971631205672,
416
+ "grad_norm": 0.1874464899301529,
417
+ "learning_rate": 0.0005435975609756098,
418
+ "loss": 2.5178,
419
  "step": 38000
420
  },
421
  {
422
+ "epoch": 17.0,
423
+ "eval_accuracy": 0.47171248592021775,
424
+ "eval_loss": 2.7182633876800537,
425
+ "eval_runtime": 72.699,
426
+ "eval_samples_per_second": 834.964,
427
+ "eval_steps_per_second": 13.054,
428
+ "step": 38352
429
  },
430
  {
431
+ "epoch": 17.28723404255319,
432
+ "grad_norm": 0.18552443385124207,
433
+ "learning_rate": 0.00046745426829268295,
434
+ "loss": 2.4731,
435
  "step": 39000
436
  },
437
  {
438
+ "epoch": 17.73049645390071,
439
+ "grad_norm": 0.1822243332862854,
440
+ "learning_rate": 0.000391234756097561,
441
+ "loss": 2.462,
442
  "step": 40000
443
  },
444
  {
445
+ "epoch": 18.0,
446
+ "eval_accuracy": 0.4747758459901975,
447
+ "eval_loss": 2.697719097137451,
448
+ "eval_runtime": 72.2911,
449
+ "eval_samples_per_second": 839.675,
450
+ "eval_steps_per_second": 13.127,
451
+ "step": 40608
452
  },
453
  {
454
+ "epoch": 18.173758865248228,
455
+ "grad_norm": 0.18331420421600342,
456
+ "learning_rate": 0.000315015243902439,
457
+ "loss": 2.4319,
458
  "step": 41000
459
  },
460
  {
461
+ "epoch": 18.617021276595743,
462
+ "grad_norm": 0.19214719533920288,
463
+ "learning_rate": 0.00023879573170731708,
464
+ "loss": 2.3974,
465
  "step": 42000
466
  },
467
  {
468
+ "epoch": 19.0,
469
+ "eval_accuracy": 0.4773433651479076,
470
+ "eval_loss": 2.683954954147339,
471
+ "eval_runtime": 72.4404,
472
+ "eval_samples_per_second": 837.944,
473
+ "eval_steps_per_second": 13.1,
474
+ "step": 42864
475
  },
476
  {
477
+ "epoch": 19.06028368794326,
478
+ "grad_norm": 0.19565586745738983,
479
+ "learning_rate": 0.00016265243902439025,
480
+ "loss": 2.3863,
481
  "step": 43000
482
  },
483
  {
484
+ "epoch": 19.50354609929078,
485
+ "grad_norm": 0.19951286911964417,
486
+ "learning_rate": 8.643292682926828e-05,
487
+ "loss": 2.3273,
488
  "step": 44000
489
  },
490
  {
491
+ "epoch": 19.9468085106383,
492
+ "grad_norm": 0.19867576658725739,
493
+ "learning_rate": 1.0213414634146342e-05,
494
+ "loss": 2.3259,
495
  "step": 45000
496
  },
497
  {
498
+ "epoch": 20.0,
499
+ "eval_accuracy": 0.47877642614021604,
500
+ "eval_loss": 2.6820449829101562,
501
+ "eval_runtime": 72.6976,
502
+ "eval_samples_per_second": 834.98,
503
+ "eval_steps_per_second": 13.054,
504
+ "step": 45120
505
  },
506
  {
507
+ "epoch": 20.0,
508
+ "step": 45120,
509
+ "total_flos": 1.50902942072832e+18,
510
+ "train_loss": 2.805498681169875,
511
+ "train_runtime": 30524.4807,
512
+ "train_samples_per_second": 378.402,
513
+ "train_steps_per_second": 1.478
514
  }
515
  ],
516
  "logging_steps": 1000,
517
+ "max_steps": 45120,
518
  "num_input_tokens_seen": 0,
519
  "num_train_epochs": 20,
520
  "save_steps": 500,
 
539
  "attributes": {}
540
  }
541
  },
542
+ "total_flos": 1.50902942072832e+18,
543
  "train_batch_size": 32,
544
  "trial_name": null,
545
  "trial_params": null