kanishka committed on
Commit 6e48671
1 Parent(s): bc2576c

End of training

README.md CHANGED
@@ -1,11 +1,23 @@
 ---
 tags:
 - generated_from_trainer
+datasets:
+- kanishka/babylm2-subset
 metrics:
 - accuracy
 model-index:
 - name: cria-babylm2-subset-default-1e-3
-  results: []
+  results:
+  - task:
+      name: Causal Language Modeling
+      type: text-generation
+    dataset:
+      name: kanishka/babylm2-subset
+      type: kanishka/babylm2-subset
+    metrics:
+    - name: Accuracy
+      type: accuracy
+      value: 0.5203706477236009
 ---
 
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -13,7 +25,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 # cria-babylm2-subset-default-1e-3
 
-This model was trained from scratch on an unknown dataset.
+This model was trained from scratch on the kanishka/babylm2-subset dataset.
 It achieves the following results on the evaluation set:
 - Loss: 2.6626
 - Accuracy: 0.5204
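
The card stops at the headline metrics and does not include a usage snippet. As a minimal sketch, the checkpoint should load with the standard transformers API; the repository id below is an assumption pieced together from the committer's namespace and the model name above, not something stated in the card.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Assumed repository id: committer namespace + model name from the card.
repo_id = "kanishka/cria-babylm2-subset-default-1e-3"

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id)

# Quick smoke test: greedy generation from a short prompt.
inputs = tokenizer("The child picked up the", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```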
all_results.json ADDED
@@ -0,0 +1,16 @@
+{
+    "epoch": 10.0,
+    "eval_accuracy": 0.5203706477236009,
+    "eval_loss": 2.6626083850860596,
+    "eval_runtime": 101.9412,
+    "eval_samples": 46868,
+    "eval_samples_per_second": 459.755,
+    "eval_steps_per_second": 7.19,
+    "perplexity": 14.333628001897594,
+    "total_flos": 6.171008476428288e+17,
+    "train_loss": 2.0168114449748256,
+    "train_runtime": 24063.5613,
+    "train_samples": 452524,
+    "train_samples_per_second": 188.054,
+    "train_steps_per_second": 5.877
+}
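
The "perplexity" entry is not an independent measurement: it is consistent with being the exponential of the mean token-level cross-entropy, i.e. of "eval_loss", which is the usual convention in the HF causal-LM example scripts. A quick check of the numbers above:

```python
import math

# Value reported in all_results.json
eval_loss = 2.6626083850860596

# Perplexity as the exponential of the mean cross-entropy loss.
perplexity = math.exp(eval_loss)
print(perplexity)  # ~14.3336, matching the reported "perplexity" field
```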
eval_results.json ADDED
@@ -0,0 +1,10 @@
+{
+    "epoch": 10.0,
+    "eval_accuracy": 0.5203706477236009,
+    "eval_loss": 2.6626083850860596,
+    "eval_runtime": 101.9412,
+    "eval_samples": 46868,
+    "eval_samples_per_second": 459.755,
+    "eval_steps_per_second": 7.19,
+    "perplexity": 14.333628001897594
+}
runs/Jul24_20-10-48_phyl-ling-p01.la.utexas.edu/events.out.tfevents.1721894287.phyl-ling-p01.la.utexas.edu.130506.1 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0fea37449358621fdf730edb3521f6f9e1b3d2a0b86759c96a3aa248d1ba938
+size 417
train_results.json ADDED
@@ -0,0 +1,9 @@
+{
+    "epoch": 10.0,
+    "total_flos": 6.171008476428288e+17,
+    "train_loss": 2.0168114449748256,
+    "train_runtime": 24063.5613,
+    "train_samples": 452524,
+    "train_samples_per_second": 188.054,
+    "train_steps_per_second": 5.877
+}
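
The throughput figures in train_results.json follow from the sample count, the number of epochs, the step count, and the wall-clock runtime. A small sanity-check sketch (the step count is taken from trainer_state.json below):

```python
train_samples = 452_524      # "train_samples"
num_epochs = 10.0            # "epoch"
train_runtime = 24_063.5613  # seconds
total_steps = 141_420        # "global_step" / "max_steps" in trainer_state.json

print(train_samples * num_epochs / train_runtime)  # ~188.05 samples/s
print(total_steps / train_runtime)                 # ~5.877 steps/s
```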
trainer_state.json ADDED
@@ -0,0 +1,1119 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 10.0,
5
+ "eval_steps": 500,
6
+ "global_step": 141420,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.07071135624381275,
13
+ "grad_norm": 1.5066859722137451,
14
+ "learning_rate": 3.125e-05,
15
+ "loss": 4.9729,
16
+ "step": 1000
17
+ },
18
+ {
19
+ "epoch": 0.1414227124876255,
20
+ "grad_norm": 0.9705550670623779,
21
+ "learning_rate": 6.25e-05,
22
+ "loss": 3.374,
23
+ "step": 2000
24
+ },
25
+ {
26
+ "epoch": 0.21213406873143828,
27
+ "grad_norm": 0.8380181789398193,
28
+ "learning_rate": 9.375e-05,
29
+ "loss": 3.0791,
30
+ "step": 3000
31
+ },
32
+ {
33
+ "epoch": 0.282845424975251,
34
+ "grad_norm": 0.7704618573188782,
35
+ "learning_rate": 0.000125,
36
+ "loss": 2.9038,
37
+ "step": 4000
38
+ },
39
+ {
40
+ "epoch": 0.3535567812190638,
41
+ "grad_norm": 0.675819993019104,
42
+ "learning_rate": 0.00015625,
43
+ "loss": 2.7896,
44
+ "step": 5000
45
+ },
46
+ {
47
+ "epoch": 0.42426813746287656,
48
+ "grad_norm": 0.6245794892311096,
49
+ "learning_rate": 0.0001875,
50
+ "loss": 2.6919,
51
+ "step": 6000
52
+ },
53
+ {
54
+ "epoch": 0.4949794937066893,
55
+ "grad_norm": 0.5709772109985352,
56
+ "learning_rate": 0.00021875,
57
+ "loss": 2.6272,
58
+ "step": 7000
59
+ },
60
+ {
61
+ "epoch": 0.565690849950502,
62
+ "grad_norm": 0.5335475206375122,
63
+ "learning_rate": 0.00025,
64
+ "loss": 2.5715,
65
+ "step": 8000
66
+ },
67
+ {
68
+ "epoch": 0.6364022061943148,
69
+ "grad_norm": 0.4924204349517822,
70
+ "learning_rate": 0.00028125000000000003,
71
+ "loss": 2.5384,
72
+ "step": 9000
73
+ },
74
+ {
75
+ "epoch": 0.7071135624381276,
76
+ "grad_norm": 0.49077439308166504,
77
+ "learning_rate": 0.0003125,
78
+ "loss": 2.5154,
79
+ "step": 10000
80
+ },
81
+ {
82
+ "epoch": 0.7778249186819404,
83
+ "grad_norm": 0.4304497241973877,
84
+ "learning_rate": 0.00034371875,
85
+ "loss": 2.4784,
86
+ "step": 11000
87
+ },
88
+ {
89
+ "epoch": 0.8485362749257531,
90
+ "grad_norm": 0.45384302735328674,
91
+ "learning_rate": 0.00037496875000000003,
92
+ "loss": 2.458,
93
+ "step": 12000
94
+ },
95
+ {
96
+ "epoch": 0.9192476311695659,
97
+ "grad_norm": 0.3978016674518585,
98
+ "learning_rate": 0.0004061875,
99
+ "loss": 2.4536,
100
+ "step": 13000
101
+ },
102
+ {
103
+ "epoch": 0.9899589874133786,
104
+ "grad_norm": 0.3981296718120575,
105
+ "learning_rate": 0.0004374375,
106
+ "loss": 2.4397,
107
+ "step": 14000
108
+ },
109
+ {
110
+ "epoch": 1.0,
111
+ "eval_accuracy": 0.49545158536152034,
112
+ "eval_loss": 2.6684181690216064,
113
+ "eval_runtime": 101.2526,
114
+ "eval_samples_per_second": 462.882,
115
+ "eval_steps_per_second": 7.239,
116
+ "step": 14142
117
+ },
118
+ {
119
+ "epoch": 1.0606703436571914,
120
+ "grad_norm": 0.36285269260406494,
121
+ "learning_rate": 0.00046865625,
122
+ "loss": 2.3887,
123
+ "step": 15000
124
+ },
125
+ {
126
+ "epoch": 1.131381699901004,
127
+ "grad_norm": 0.3568965494632721,
128
+ "learning_rate": 0.00049990625,
129
+ "loss": 2.3848,
130
+ "step": 16000
131
+ },
132
+ {
133
+ "epoch": 1.2020930561448169,
134
+ "grad_norm": 0.32918983697891235,
135
+ "learning_rate": 0.00053109375,
136
+ "loss": 2.3763,
137
+ "step": 17000
138
+ },
139
+ {
140
+ "epoch": 1.2728044123886297,
141
+ "grad_norm": 0.3192691206932068,
142
+ "learning_rate": 0.00056234375,
143
+ "loss": 2.3757,
144
+ "step": 18000
145
+ },
146
+ {
147
+ "epoch": 1.3435157686324424,
148
+ "grad_norm": 0.29217350482940674,
149
+ "learning_rate": 0.00059359375,
150
+ "loss": 2.3728,
151
+ "step": 19000
152
+ },
153
+ {
154
+ "epoch": 1.414227124876255,
155
+ "grad_norm": 0.2726396918296814,
156
+ "learning_rate": 0.0006248437500000001,
157
+ "loss": 2.3482,
158
+ "step": 20000
159
+ },
160
+ {
161
+ "epoch": 1.4849384811200679,
162
+ "grad_norm": 0.2647142708301544,
163
+ "learning_rate": 0.0006560625,
164
+ "loss": 2.361,
165
+ "step": 21000
166
+ },
167
+ {
168
+ "epoch": 1.5556498373638807,
169
+ "grad_norm": 0.24640022218227386,
170
+ "learning_rate": 0.00068728125,
171
+ "loss": 2.3414,
172
+ "step": 22000
173
+ },
174
+ {
175
+ "epoch": 1.6263611936076934,
176
+ "grad_norm": 0.2376652956008911,
177
+ "learning_rate": 0.00071853125,
178
+ "loss": 2.3469,
179
+ "step": 23000
180
+ },
181
+ {
182
+ "epoch": 1.697072549851506,
183
+ "grad_norm": 0.20667687058448792,
184
+ "learning_rate": 0.00074978125,
185
+ "loss": 2.3334,
186
+ "step": 24000
187
+ },
188
+ {
189
+ "epoch": 1.7677839060953189,
190
+ "grad_norm": 0.21862906217575073,
191
+ "learning_rate": 0.0007810312499999999,
192
+ "loss": 2.325,
193
+ "step": 25000
194
+ },
195
+ {
196
+ "epoch": 1.8384952623391317,
197
+ "grad_norm": 0.19700638949871063,
198
+ "learning_rate": 0.00081225,
199
+ "loss": 2.3169,
200
+ "step": 26000
201
+ },
202
+ {
203
+ "epoch": 1.9092066185829444,
204
+ "grad_norm": 0.19530941545963287,
205
+ "learning_rate": 0.00084346875,
206
+ "loss": 2.3085,
207
+ "step": 27000
208
+ },
209
+ {
210
+ "epoch": 1.979917974826757,
211
+ "grad_norm": 0.18496540188789368,
212
+ "learning_rate": 0.00087471875,
213
+ "loss": 2.3085,
214
+ "step": 28000
215
+ },
216
+ {
217
+ "epoch": 2.0,
218
+ "eval_accuracy": 0.5092843747934841,
219
+ "eval_loss": 2.5420279502868652,
220
+ "eval_runtime": 102.0041,
221
+ "eval_samples_per_second": 459.472,
222
+ "eval_steps_per_second": 7.186,
223
+ "step": 28284
224
+ },
225
+ {
226
+ "epoch": 2.05062933107057,
227
+ "grad_norm": 0.18070034682750702,
228
+ "learning_rate": 0.00090596875,
229
+ "loss": 2.2557,
230
+ "step": 29000
231
+ },
232
+ {
233
+ "epoch": 2.1213406873143827,
234
+ "grad_norm": 0.17614798247814178,
235
+ "learning_rate": 0.0009371875,
236
+ "loss": 2.248,
237
+ "step": 30000
238
+ },
239
+ {
240
+ "epoch": 2.1920520435581956,
241
+ "grad_norm": 0.18162938952445984,
242
+ "learning_rate": 0.0009684375,
243
+ "loss": 2.246,
244
+ "step": 31000
245
+ },
246
+ {
247
+ "epoch": 2.262763399802008,
248
+ "grad_norm": 0.16680462658405304,
249
+ "learning_rate": 0.0009996875,
250
+ "loss": 2.2398,
251
+ "step": 32000
252
+ },
253
+ {
254
+ "epoch": 2.333474756045821,
255
+ "grad_norm": 0.17343448102474213,
256
+ "learning_rate": 0.0009909614330104186,
257
+ "loss": 2.2332,
258
+ "step": 33000
259
+ },
260
+ {
261
+ "epoch": 2.4041861122896337,
262
+ "grad_norm": 0.15368333458900452,
263
+ "learning_rate": 0.0009818223359532078,
264
+ "loss": 2.2367,
265
+ "step": 34000
266
+ },
267
+ {
268
+ "epoch": 2.4748974685334466,
269
+ "grad_norm": 0.14444677531719208,
270
+ "learning_rate": 0.0009726832388959971,
271
+ "loss": 2.2277,
272
+ "step": 35000
273
+ },
274
+ {
275
+ "epoch": 2.5456088247772595,
276
+ "grad_norm": 0.16958372294902802,
277
+ "learning_rate": 0.0009635441418387864,
278
+ "loss": 2.2136,
279
+ "step": 36000
280
+ },
281
+ {
282
+ "epoch": 2.616320181021072,
283
+ "grad_norm": 0.15171754360198975,
284
+ "learning_rate": 0.0009544141838786328,
285
+ "loss": 2.2105,
286
+ "step": 37000
287
+ },
288
+ {
289
+ "epoch": 2.6870315372648848,
290
+ "grad_norm": 0.13588131964206696,
291
+ "learning_rate": 0.0009452750868214221,
292
+ "loss": 2.2056,
293
+ "step": 38000
294
+ },
295
+ {
296
+ "epoch": 2.7577428935086976,
297
+ "grad_norm": 0.13553854823112488,
298
+ "learning_rate": 0.0009361359897642113,
299
+ "loss": 2.1988,
300
+ "step": 39000
301
+ },
302
+ {
303
+ "epoch": 2.82845424975251,
304
+ "grad_norm": 0.15744280815124512,
305
+ "learning_rate": 0.0009269968927070006,
306
+ "loss": 2.1949,
307
+ "step": 40000
308
+ },
309
+ {
310
+ "epoch": 2.899165605996323,
311
+ "grad_norm": 0.1427813470363617,
312
+ "learning_rate": 0.000917866934746847,
313
+ "loss": 2.1875,
314
+ "step": 41000
315
+ },
316
+ {
317
+ "epoch": 2.9698769622401358,
318
+ "grad_norm": 0.14179003238677979,
319
+ "learning_rate": 0.0009087278376896363,
320
+ "loss": 2.19,
321
+ "step": 42000
322
+ },
323
+ {
324
+ "epoch": 3.0,
325
+ "eval_accuracy": 0.5214661161125094,
326
+ "eval_loss": 2.439739942550659,
327
+ "eval_runtime": 102.0725,
328
+ "eval_samples_per_second": 459.164,
329
+ "eval_steps_per_second": 7.181,
330
+ "step": 42426
331
+ },
332
+ {
333
+ "epoch": 3.0405883184839486,
334
+ "grad_norm": 0.1600356101989746,
335
+ "learning_rate": 0.0008995978797294828,
336
+ "loss": 2.13,
337
+ "step": 43000
338
+ },
339
+ {
340
+ "epoch": 3.1112996747277615,
341
+ "grad_norm": 0.16733036935329437,
342
+ "learning_rate": 0.0008904587826722719,
343
+ "loss": 2.0964,
344
+ "step": 44000
345
+ },
346
+ {
347
+ "epoch": 3.182011030971574,
348
+ "grad_norm": 0.15149937570095062,
349
+ "learning_rate": 0.0008813379638091756,
350
+ "loss": 2.0964,
351
+ "step": 45000
352
+ },
353
+ {
354
+ "epoch": 3.2527223872153868,
355
+ "grad_norm": 0.1375265121459961,
356
+ "learning_rate": 0.0008721988667519649,
357
+ "loss": 2.1021,
358
+ "step": 46000
359
+ },
360
+ {
361
+ "epoch": 3.3234337434591996,
362
+ "grad_norm": 0.13642068207263947,
363
+ "learning_rate": 0.0008630597696947542,
364
+ "loss": 2.1062,
365
+ "step": 47000
366
+ },
367
+ {
368
+ "epoch": 3.3941450997030125,
369
+ "grad_norm": 0.15942348539829254,
370
+ "learning_rate": 0.0008539206726375435,
371
+ "loss": 2.0943,
372
+ "step": 48000
373
+ },
374
+ {
375
+ "epoch": 3.464856455946825,
376
+ "grad_norm": 0.14231225848197937,
377
+ "learning_rate": 0.0008447815755803326,
378
+ "loss": 2.0968,
379
+ "step": 49000
380
+ },
381
+ {
382
+ "epoch": 3.5355678121906378,
383
+ "grad_norm": 0.13483628630638123,
384
+ "learning_rate": 0.0008356516176201791,
385
+ "loss": 2.0923,
386
+ "step": 50000
387
+ },
388
+ {
389
+ "epoch": 3.6062791684344506,
390
+ "grad_norm": 0.15377779304981232,
391
+ "learning_rate": 0.0008265125205629684,
392
+ "loss": 2.0929,
393
+ "step": 51000
394
+ },
395
+ {
396
+ "epoch": 3.6769905246782635,
397
+ "grad_norm": 0.13733841478824615,
398
+ "learning_rate": 0.0008173825626028149,
399
+ "loss": 2.0929,
400
+ "step": 52000
401
+ },
402
+ {
403
+ "epoch": 3.747701880922076,
404
+ "grad_norm": 0.13640180230140686,
405
+ "learning_rate": 0.0008082434655456042,
406
+ "loss": 2.0938,
407
+ "step": 53000
408
+ },
409
+ {
410
+ "epoch": 3.8184132371658888,
411
+ "grad_norm": 0.13909070193767548,
412
+ "learning_rate": 0.0007991135075854505,
413
+ "loss": 2.0907,
414
+ "step": 54000
415
+ },
416
+ {
417
+ "epoch": 3.8891245934097016,
418
+ "grad_norm": 0.1521981954574585,
419
+ "learning_rate": 0.0007899744105282398,
420
+ "loss": 2.0816,
421
+ "step": 55000
422
+ },
423
+ {
424
+ "epoch": 3.9598359496535145,
425
+ "grad_norm": 0.12255113571882248,
426
+ "learning_rate": 0.0007808444525680864,
427
+ "loss": 2.0865,
428
+ "step": 56000
429
+ },
430
+ {
431
+ "epoch": 4.0,
432
+ "eval_accuracy": 0.5276129432475146,
433
+ "eval_loss": 2.3943161964416504,
434
+ "eval_runtime": 104.7687,
435
+ "eval_samples_per_second": 447.347,
436
+ "eval_steps_per_second": 6.996,
437
+ "step": 56568
438
+ },
439
+ {
440
+ "epoch": 4.030547305897327,
441
+ "grad_norm": 0.1423817127943039,
442
+ "learning_rate": 0.0007717053555108755,
443
+ "loss": 2.0304,
444
+ "step": 57000
445
+ },
446
+ {
447
+ "epoch": 4.10125866214114,
448
+ "grad_norm": 0.13736553490161896,
449
+ "learning_rate": 0.0007625662584536648,
450
+ "loss": 1.9815,
451
+ "step": 58000
452
+ },
453
+ {
454
+ "epoch": 4.171970018384952,
455
+ "grad_norm": 0.1411396712064743,
456
+ "learning_rate": 0.0007534363004935113,
457
+ "loss": 1.9919,
458
+ "step": 59000
459
+ },
460
+ {
461
+ "epoch": 4.2426813746287655,
462
+ "grad_norm": 0.14484618604183197,
463
+ "learning_rate": 0.0007442972034363005,
464
+ "loss": 1.9915,
465
+ "step": 60000
466
+ },
467
+ {
468
+ "epoch": 4.313392730872578,
469
+ "grad_norm": 0.1606305092573166,
470
+ "learning_rate": 0.000735167245476147,
471
+ "loss": 1.9925,
472
+ "step": 61000
473
+ },
474
+ {
475
+ "epoch": 4.384104087116391,
476
+ "grad_norm": 0.15816234052181244,
477
+ "learning_rate": 0.0007260281484189363,
478
+ "loss": 1.9963,
479
+ "step": 62000
480
+ },
481
+ {
482
+ "epoch": 4.454815443360204,
483
+ "grad_norm": 0.14397823810577393,
484
+ "learning_rate": 0.0007168981904587826,
485
+ "loss": 1.9989,
486
+ "step": 63000
487
+ },
488
+ {
489
+ "epoch": 4.525526799604016,
490
+ "grad_norm": 0.15473702549934387,
491
+ "learning_rate": 0.0007077590934015719,
492
+ "loss": 1.9965,
493
+ "step": 64000
494
+ },
495
+ {
496
+ "epoch": 4.596238155847829,
497
+ "grad_norm": 0.14191265404224396,
498
+ "learning_rate": 0.0006986291354414184,
499
+ "loss": 2.0005,
500
+ "step": 65000
501
+ },
502
+ {
503
+ "epoch": 4.666949512091642,
504
+ "grad_norm": 0.15206751227378845,
505
+ "learning_rate": 0.0006894900383842077,
506
+ "loss": 2.0114,
507
+ "step": 66000
508
+ },
509
+ {
510
+ "epoch": 4.737660868335455,
511
+ "grad_norm": 0.18548937141895294,
512
+ "learning_rate": 0.0006803600804240542,
513
+ "loss": 2.0021,
514
+ "step": 67000
515
+ },
516
+ {
517
+ "epoch": 4.8083722245792675,
518
+ "grad_norm": 0.16364724934101105,
519
+ "learning_rate": 0.0006712209833668433,
520
+ "loss": 2.0093,
521
+ "step": 68000
522
+ },
523
+ {
524
+ "epoch": 4.87908358082308,
525
+ "grad_norm": 0.1373205929994583,
526
+ "learning_rate": 0.0006620818863096326,
527
+ "loss": 2.0073,
528
+ "step": 69000
529
+ },
530
+ {
531
+ "epoch": 4.949794937066893,
532
+ "grad_norm": 0.15305304527282715,
533
+ "learning_rate": 0.000652951928349479,
534
+ "loss": 1.9957,
535
+ "step": 70000
536
+ },
537
+ {
538
+ "epoch": 5.0,
539
+ "eval_accuracy": 0.5305311481637808,
540
+ "eval_loss": 2.3786160945892334,
541
+ "eval_runtime": 102.0738,
542
+ "eval_samples_per_second": 459.158,
543
+ "eval_steps_per_second": 7.181,
544
+ "step": 70710
545
+ },
546
+ {
547
+ "epoch": 5.020506293310706,
548
+ "grad_norm": 0.17954622209072113,
549
+ "learning_rate": 0.0006438128312922683,
550
+ "loss": 1.963,
551
+ "step": 71000
552
+ },
553
+ {
554
+ "epoch": 5.091217649554518,
555
+ "grad_norm": 0.17249706387519836,
556
+ "learning_rate": 0.0006346828733321149,
557
+ "loss": 1.8814,
558
+ "step": 72000
559
+ },
560
+ {
561
+ "epoch": 5.161929005798331,
562
+ "grad_norm": 0.16035763919353485,
563
+ "learning_rate": 0.000625543776274904,
564
+ "loss": 1.8888,
565
+ "step": 73000
566
+ },
567
+ {
568
+ "epoch": 5.232640362042144,
569
+ "grad_norm": 0.16601450741291046,
570
+ "learning_rate": 0.0006164046792176932,
571
+ "loss": 1.8945,
572
+ "step": 74000
573
+ },
574
+ {
575
+ "epoch": 5.303351718285957,
576
+ "grad_norm": 0.1559607982635498,
577
+ "learning_rate": 0.0006072747212575398,
578
+ "loss": 1.9005,
579
+ "step": 75000
580
+ },
581
+ {
582
+ "epoch": 5.3740630745297695,
583
+ "grad_norm": 0.1599714308977127,
584
+ "learning_rate": 0.000598135624200329,
585
+ "loss": 1.9056,
586
+ "step": 76000
587
+ },
588
+ {
589
+ "epoch": 5.444774430773582,
590
+ "grad_norm": 0.15538254380226135,
591
+ "learning_rate": 0.0005890056662401755,
592
+ "loss": 1.9091,
593
+ "step": 77000
594
+ },
595
+ {
596
+ "epoch": 5.515485787017395,
597
+ "grad_norm": 0.1645193099975586,
598
+ "learning_rate": 0.0005798665691829647,
599
+ "loss": 1.9138,
600
+ "step": 78000
601
+ },
602
+ {
603
+ "epoch": 5.586197143261208,
604
+ "grad_norm": 0.1560288369655609,
605
+ "learning_rate": 0.0005707366112228112,
606
+ "loss": 1.9276,
607
+ "step": 79000
608
+ },
609
+ {
610
+ "epoch": 5.65690849950502,
611
+ "grad_norm": 0.169467955827713,
612
+ "learning_rate": 0.0005615975141656004,
613
+ "loss": 1.9167,
614
+ "step": 80000
615
+ },
616
+ {
617
+ "epoch": 5.727619855748833,
618
+ "grad_norm": 0.18090558052062988,
619
+ "learning_rate": 0.0005524675562054469,
620
+ "loss": 1.9289,
621
+ "step": 81000
622
+ },
623
+ {
624
+ "epoch": 5.798331211992646,
625
+ "grad_norm": 0.16788819432258606,
626
+ "learning_rate": 0.0005433284591482362,
627
+ "loss": 1.9228,
628
+ "step": 82000
629
+ },
630
+ {
631
+ "epoch": 5.869042568236459,
632
+ "grad_norm": 0.15961690247058868,
633
+ "learning_rate": 0.0005341893620910255,
634
+ "loss": 1.9178,
635
+ "step": 83000
636
+ },
637
+ {
638
+ "epoch": 5.9397539244802715,
639
+ "grad_norm": 0.15657977759838104,
640
+ "learning_rate": 0.0005250594041308718,
641
+ "loss": 1.9161,
642
+ "step": 84000
643
+ },
644
+ {
645
+ "epoch": 6.0,
646
+ "eval_accuracy": 0.5312578351518911,
647
+ "eval_loss": 2.3910350799560547,
648
+ "eval_runtime": 102.0407,
649
+ "eval_samples_per_second": 459.307,
650
+ "eval_steps_per_second": 7.183,
651
+ "step": 84852
652
+ },
653
+ {
654
+ "epoch": 6.010465280724084,
655
+ "grad_norm": 0.15551112592220306,
656
+ "learning_rate": 0.0005159203070736611,
657
+ "loss": 1.9123,
658
+ "step": 85000
659
+ },
660
+ {
661
+ "epoch": 6.081176636967897,
662
+ "grad_norm": 0.18589554727077484,
663
+ "learning_rate": 0.0005067812100164504,
664
+ "loss": 1.7906,
665
+ "step": 86000
666
+ },
667
+ {
668
+ "epoch": 6.15188799321171,
669
+ "grad_norm": 0.16240116953849792,
670
+ "learning_rate": 0.0004976512520562968,
671
+ "loss": 1.805,
672
+ "step": 87000
673
+ },
674
+ {
675
+ "epoch": 6.222599349455523,
676
+ "grad_norm": 0.1752467155456543,
677
+ "learning_rate": 0.0004885121549990861,
678
+ "loss": 1.8147,
679
+ "step": 88000
680
+ },
681
+ {
682
+ "epoch": 6.293310705699335,
683
+ "grad_norm": 0.15973269939422607,
684
+ "learning_rate": 0.00047937305794187537,
685
+ "loss": 1.8063,
686
+ "step": 89000
687
+ },
688
+ {
689
+ "epoch": 6.364022061943148,
690
+ "grad_norm": 0.18358197808265686,
691
+ "learning_rate": 0.0004702430999817218,
692
+ "loss": 1.8182,
693
+ "step": 90000
694
+ },
695
+ {
696
+ "epoch": 6.434733418186961,
697
+ "grad_norm": 0.20550867915153503,
698
+ "learning_rate": 0.00046110400292451105,
699
+ "loss": 1.8251,
700
+ "step": 91000
701
+ },
702
+ {
703
+ "epoch": 6.5054447744307735,
704
+ "grad_norm": 0.18148034811019897,
705
+ "learning_rate": 0.0004519740449643575,
706
+ "loss": 1.8283,
707
+ "step": 92000
708
+ },
709
+ {
710
+ "epoch": 6.576156130674587,
711
+ "grad_norm": 0.1863207072019577,
712
+ "learning_rate": 0.0004428349479071468,
713
+ "loss": 1.834,
714
+ "step": 93000
715
+ },
716
+ {
717
+ "epoch": 6.646867486918399,
718
+ "grad_norm": 0.1836949586868286,
719
+ "learning_rate": 0.000433695850849936,
720
+ "loss": 1.8257,
721
+ "step": 94000
722
+ },
723
+ {
724
+ "epoch": 6.717578843162212,
725
+ "grad_norm": 0.18851223587989807,
726
+ "learning_rate": 0.00042456589288978247,
727
+ "loss": 1.8291,
728
+ "step": 95000
729
+ },
730
+ {
731
+ "epoch": 6.788290199406025,
732
+ "grad_norm": 0.16575908660888672,
733
+ "learning_rate": 0.00041542679583257176,
734
+ "loss": 1.8412,
735
+ "step": 96000
736
+ },
737
+ {
738
+ "epoch": 6.859001555649837,
739
+ "grad_norm": 0.1861979216337204,
740
+ "learning_rate": 0.000406287698775361,
741
+ "loss": 1.848,
742
+ "step": 97000
743
+ },
744
+ {
745
+ "epoch": 6.92971291189365,
746
+ "grad_norm": 0.1783532202243805,
747
+ "learning_rate": 0.00039714860171815024,
748
+ "loss": 1.8361,
749
+ "step": 98000
750
+ },
751
+ {
752
+ "epoch": 7.0,
753
+ "eval_accuracy": 0.5303533991815411,
754
+ "eval_loss": 2.4205334186553955,
755
+ "eval_runtime": 102.4141,
756
+ "eval_samples_per_second": 457.632,
757
+ "eval_steps_per_second": 7.157,
758
+ "step": 98994
759
+ },
760
+ {
761
+ "epoch": 7.000424268137463,
762
+ "grad_norm": 0.1907605677843094,
763
+ "learning_rate": 0.00038801864375799674,
764
+ "loss": 1.8413,
765
+ "step": 99000
766
+ },
767
+ {
768
+ "epoch": 7.0711356243812755,
769
+ "grad_norm": 0.21442489326000214,
770
+ "learning_rate": 0.0003788795467007859,
771
+ "loss": 1.6956,
772
+ "step": 100000
773
+ },
774
+ {
775
+ "epoch": 7.141846980625088,
776
+ "grad_norm": 0.19562986493110657,
777
+ "learning_rate": 0.0003697404496435752,
778
+ "loss": 1.7053,
779
+ "step": 101000
780
+ },
781
+ {
782
+ "epoch": 7.212558336868901,
783
+ "grad_norm": 0.23670311272144318,
784
+ "learning_rate": 0.00036060135258636445,
785
+ "loss": 1.7196,
786
+ "step": 102000
787
+ },
788
+ {
789
+ "epoch": 7.283269693112714,
790
+ "grad_norm": 0.19641369581222534,
791
+ "learning_rate": 0.00035148053372326815,
792
+ "loss": 1.719,
793
+ "step": 103000
794
+ },
795
+ {
796
+ "epoch": 7.353981049356527,
797
+ "grad_norm": 0.2086309790611267,
798
+ "learning_rate": 0.0003423414366660574,
799
+ "loss": 1.7279,
800
+ "step": 104000
801
+ },
802
+ {
803
+ "epoch": 7.424692405600339,
804
+ "grad_norm": 0.1947568953037262,
805
+ "learning_rate": 0.0003332023396088467,
806
+ "loss": 1.7389,
807
+ "step": 105000
808
+ },
809
+ {
810
+ "epoch": 7.495403761844152,
811
+ "grad_norm": 0.19536983966827393,
812
+ "learning_rate": 0.00032407238164869313,
813
+ "loss": 1.7428,
814
+ "step": 106000
815
+ },
816
+ {
817
+ "epoch": 7.566115118087965,
818
+ "grad_norm": 0.1872589886188507,
819
+ "learning_rate": 0.00031493328459148237,
820
+ "loss": 1.7463,
821
+ "step": 107000
822
+ },
823
+ {
824
+ "epoch": 7.6368264743317775,
825
+ "grad_norm": 0.22906361520290375,
826
+ "learning_rate": 0.0003057941875342716,
827
+ "loss": 1.7479,
828
+ "step": 108000
829
+ },
830
+ {
831
+ "epoch": 7.707537830575591,
832
+ "grad_norm": 0.19299902021884918,
833
+ "learning_rate": 0.0002966642295741181,
834
+ "loss": 1.7514,
835
+ "step": 109000
836
+ },
837
+ {
838
+ "epoch": 7.778249186819403,
839
+ "grad_norm": 0.19876809418201447,
840
+ "learning_rate": 0.00028752513251690734,
841
+ "loss": 1.7467,
842
+ "step": 110000
843
+ },
844
+ {
845
+ "epoch": 7.848960543063216,
846
+ "grad_norm": 0.22273430228233337,
847
+ "learning_rate": 0.0002783860354596966,
848
+ "loss": 1.76,
849
+ "step": 111000
850
+ },
851
+ {
852
+ "epoch": 7.919671899307029,
853
+ "grad_norm": 0.1979241669178009,
854
+ "learning_rate": 0.0002692560774995431,
855
+ "loss": 1.7547,
856
+ "step": 112000
857
+ },
858
+ {
859
+ "epoch": 7.990383255550841,
860
+ "grad_norm": 0.2099294811487198,
861
+ "learning_rate": 0.00026011698044233226,
862
+ "loss": 1.7477,
863
+ "step": 113000
864
+ },
865
+ {
866
+ "epoch": 8.0,
867
+ "eval_accuracy": 0.5282502161049046,
868
+ "eval_loss": 2.474827289581299,
869
+ "eval_runtime": 102.4954,
870
+ "eval_samples_per_second": 457.269,
871
+ "eval_steps_per_second": 7.152,
872
+ "step": 113136
873
+ },
874
+ {
875
+ "epoch": 8.061094611794655,
876
+ "grad_norm": 0.24672599136829376,
877
+ "learning_rate": 0.00025097788338512156,
878
+ "loss": 1.6197,
879
+ "step": 114000
880
+ },
881
+ {
882
+ "epoch": 8.131805968038467,
883
+ "grad_norm": 0.21202607452869415,
884
+ "learning_rate": 0.00024183878632791082,
885
+ "loss": 1.6192,
886
+ "step": 115000
887
+ },
888
+ {
889
+ "epoch": 8.20251732428228,
890
+ "grad_norm": 0.24981403350830078,
891
+ "learning_rate": 0.00023271796746481447,
892
+ "loss": 1.6329,
893
+ "step": 116000
894
+ },
895
+ {
896
+ "epoch": 8.273228680526092,
897
+ "grad_norm": 0.25290995836257935,
898
+ "learning_rate": 0.00022357887040760373,
899
+ "loss": 1.6386,
900
+ "step": 117000
901
+ },
902
+ {
903
+ "epoch": 8.343940036769904,
904
+ "grad_norm": 0.2473640739917755,
905
+ "learning_rate": 0.000214439773350393,
906
+ "loss": 1.6414,
907
+ "step": 118000
908
+ },
909
+ {
910
+ "epoch": 8.414651393013719,
911
+ "grad_norm": 0.20307676494121552,
912
+ "learning_rate": 0.00020530981539023944,
913
+ "loss": 1.6458,
914
+ "step": 119000
915
+ },
916
+ {
917
+ "epoch": 8.485362749257531,
918
+ "grad_norm": 0.21696613729000092,
919
+ "learning_rate": 0.0001961707183330287,
920
+ "loss": 1.6473,
921
+ "step": 120000
922
+ },
923
+ {
924
+ "epoch": 8.556074105501343,
925
+ "grad_norm": 0.23408186435699463,
926
+ "learning_rate": 0.00018703162127581797,
927
+ "loss": 1.656,
928
+ "step": 121000
929
+ },
930
+ {
931
+ "epoch": 8.626785461745156,
932
+ "grad_norm": 0.23058977723121643,
933
+ "learning_rate": 0.0001778925242186072,
934
+ "loss": 1.6578,
935
+ "step": 122000
936
+ },
937
+ {
938
+ "epoch": 8.697496817988968,
939
+ "grad_norm": 0.23317036032676697,
940
+ "learning_rate": 0.00016877170535551086,
941
+ "loss": 1.6516,
942
+ "step": 123000
943
+ },
944
+ {
945
+ "epoch": 8.768208174232782,
946
+ "grad_norm": 0.2361781746149063,
947
+ "learning_rate": 0.00015963260829830012,
948
+ "loss": 1.6525,
949
+ "step": 124000
950
+ },
951
+ {
952
+ "epoch": 8.838919530476595,
953
+ "grad_norm": 0.260776549577713,
954
+ "learning_rate": 0.00015049351124108936,
955
+ "loss": 1.6547,
956
+ "step": 125000
957
+ },
958
+ {
959
+ "epoch": 8.909630886720407,
960
+ "grad_norm": 0.2507932186126709,
961
+ "learning_rate": 0.00014136355328093583,
962
+ "loss": 1.6556,
963
+ "step": 126000
964
+ },
965
+ {
966
+ "epoch": 8.98034224296422,
967
+ "grad_norm": 0.2422228902578354,
968
+ "learning_rate": 0.0001322244562237251,
969
+ "loss": 1.6549,
970
+ "step": 127000
971
+ },
972
+ {
973
+ "epoch": 9.0,
974
+ "eval_accuracy": 0.5249380742803117,
975
+ "eval_loss": 2.5581541061401367,
976
+ "eval_runtime": 102.2383,
977
+ "eval_samples_per_second": 458.419,
978
+ "eval_steps_per_second": 7.17,
979
+ "step": 127278
980
+ },
981
+ {
982
+ "epoch": 9.051053599208032,
983
+ "grad_norm": 0.2604562044143677,
984
+ "learning_rate": 0.00012308535916651437,
985
+ "loss": 1.5675,
986
+ "step": 128000
987
+ },
988
+ {
989
+ "epoch": 9.121764955451846,
990
+ "grad_norm": 0.22102615237236023,
991
+ "learning_rate": 0.0001139462621093036,
992
+ "loss": 1.5337,
993
+ "step": 129000
994
+ },
995
+ {
996
+ "epoch": 9.192476311695659,
997
+ "grad_norm": 0.2960878014564514,
998
+ "learning_rate": 0.00010481630414915007,
999
+ "loss": 1.5556,
1000
+ "step": 130000
1001
+ },
1002
+ {
1003
+ "epoch": 9.263187667939471,
1004
+ "grad_norm": 0.22400617599487305,
1005
+ "learning_rate": 9.567720709193931e-05,
1006
+ "loss": 1.5491,
1007
+ "step": 131000
1008
+ },
1009
+ {
1010
+ "epoch": 9.333899024183284,
1011
+ "grad_norm": 0.24257275462150574,
1012
+ "learning_rate": 8.655638822884298e-05,
1013
+ "loss": 1.5502,
1014
+ "step": 132000
1015
+ },
1016
+ {
1017
+ "epoch": 9.404610380427096,
1018
+ "grad_norm": 0.24599485099315643,
1019
+ "learning_rate": 7.741729117163225e-05,
1020
+ "loss": 1.552,
1021
+ "step": 133000
1022
+ },
1023
+ {
1024
+ "epoch": 9.47532173667091,
1025
+ "grad_norm": 0.25757452845573425,
1026
+ "learning_rate": 6.82781941144215e-05,
1027
+ "loss": 1.5576,
1028
+ "step": 134000
1029
+ },
1030
+ {
1031
+ "epoch": 9.546033092914723,
1032
+ "grad_norm": 0.28276532888412476,
1033
+ "learning_rate": 5.914823615426796e-05,
1034
+ "loss": 1.5529,
1035
+ "step": 135000
1036
+ },
1037
+ {
1038
+ "epoch": 9.616744449158535,
1039
+ "grad_norm": 0.2369563728570938,
1040
+ "learning_rate": 5.000913909705721e-05,
1041
+ "loss": 1.5548,
1042
+ "step": 136000
1043
+ },
1044
+ {
1045
+ "epoch": 9.687455805402347,
1046
+ "grad_norm": 0.25778231024742126,
1047
+ "learning_rate": 4.0870042039846464e-05,
1048
+ "loss": 1.55,
1049
+ "step": 137000
1050
+ },
1051
+ {
1052
+ "epoch": 9.75816716164616,
1053
+ "grad_norm": 0.2770988941192627,
1054
+ "learning_rate": 3.173094498263571e-05,
1055
+ "loss": 1.559,
1056
+ "step": 138000
1057
+ },
1058
+ {
1059
+ "epoch": 9.828878517889972,
1060
+ "grad_norm": 0.2701665163040161,
1061
+ "learning_rate": 2.261012611953939e-05,
1062
+ "loss": 1.5579,
1063
+ "step": 139000
1064
+ },
1065
+ {
1066
+ "epoch": 9.899589874133786,
1067
+ "grad_norm": 0.2540683448314667,
1068
+ "learning_rate": 1.3471029062328641e-05,
1069
+ "loss": 1.5563,
1070
+ "step": 140000
1071
+ },
1072
+ {
1073
+ "epoch": 9.970301230377599,
1074
+ "grad_norm": 0.26811909675598145,
1075
+ "learning_rate": 4.341071102175106e-06,
1076
+ "loss": 1.5611,
1077
+ "step": 141000
1078
+ },
1079
+ {
1080
+ "epoch": 10.0,
1081
+ "eval_accuracy": 0.5203706477236009,
1082
+ "eval_loss": 2.6626083850860596,
1083
+ "eval_runtime": 102.3167,
1084
+ "eval_samples_per_second": 458.068,
1085
+ "eval_steps_per_second": 7.164,
1086
+ "step": 141420
1087
+ },
1088
+ {
1089
+ "epoch": 10.0,
1090
+ "step": 141420,
1091
+ "total_flos": 6.171008476428288e+17,
1092
+ "train_loss": 2.0168114449748256,
1093
+ "train_runtime": 24063.5613,
1094
+ "train_samples_per_second": 188.054,
1095
+ "train_steps_per_second": 5.877
1096
+ }
1097
+ ],
1098
+ "logging_steps": 1000,
1099
+ "max_steps": 141420,
1100
+ "num_input_tokens_seen": 0,
1101
+ "num_train_epochs": 10,
1102
+ "save_steps": 5000,
1103
+ "stateful_callbacks": {
1104
+ "TrainerControl": {
1105
+ "args": {
1106
+ "should_epoch_stop": false,
1107
+ "should_evaluate": false,
1108
+ "should_log": false,
1109
+ "should_save": true,
1110
+ "should_training_stop": true
1111
+ },
1112
+ "attributes": {}
1113
+ }
1114
+ },
1115
+ "total_flos": 6.171008476428288e+17,
1116
+ "train_batch_size": 32,
1117
+ "trial_name": null,
1118
+ "trial_params": null
1119
+ }
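
trainer_state.json carries the full step-level log above, which is unwieldy to read by hand. A short sketch of pulling out just the end-of-epoch evaluation rows after downloading the file (the local path is assumed):

```python
import json

# Assumed local path to the file added in this commit.
with open("trainer_state.json") as f:
    state = json.load(f)

# Entries containing "eval_loss" are the end-of-epoch evaluations; the
# remaining entries are step-level training logs (loss, learning_rate, grad_norm).
for entry in state["log_history"]:
    if "eval_loss" in entry:
        print(f'epoch {entry["epoch"]:>4.1f}  '
              f'eval_loss={entry["eval_loss"]:.4f}  '
              f'eval_accuracy={entry["eval_accuracy"]:.4f}')
```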