jcssafedep committed on
Commit
30a49b0
1 Parent(s): 57cb960

End of training

Browse files
Files changed (3) hide show
  1. all_results.json +6 -6
  2. train_results.json +6 -6
  3. trainer_state.json +442 -92
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 11.15,
3
- "train_loss": 0.6717062408447265,
4
- "train_runtime": 2130.1373,
5
- "train_samples": 897,
6
- "train_samples_per_second": 4.695,
7
- "train_steps_per_second": 4.695
8
  }
 
1
  {
2
+ "epoch": 10.0,
3
+ "train_loss": 0.22781734634577194,
4
+ "train_runtime": 7496.2711,
5
+ "train_samples": 3519,
6
+ "train_samples_per_second": 4.694,
7
+ "train_steps_per_second": 4.694
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 11.15,
3
- "train_loss": 0.6717062408447265,
4
- "train_runtime": 2130.1373,
5
- "train_samples": 897,
6
- "train_samples_per_second": 4.695,
7
- "train_steps_per_second": 4.695
8
  }
 
1
  {
2
+ "epoch": 10.0,
3
+ "train_loss": 0.22781734634577194,
4
+ "train_runtime": 7496.2711,
5
+ "train_samples": 3519,
6
+ "train_samples_per_second": 4.694,
7
+ "train_steps_per_second": 4.694
8
  }
trainer_state.json CHANGED
@@ -1,169 +1,519 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 11.148272017837236,
5
  "eval_steps": 500,
6
- "global_step": 10000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.56,
13
- "grad_norm": 4.136017322540283,
14
- "learning_rate": 4.75e-05,
15
- "loss": 1.4862,
16
  "step": 500
17
  },
18
  {
19
- "epoch": 1.11,
20
- "grad_norm": 3.5059304237365723,
21
- "learning_rate": 4.5e-05,
22
- "loss": 1.2182,
23
  "step": 1000
24
  },
25
  {
26
- "epoch": 1.67,
27
- "grad_norm": 3.1735312938690186,
28
- "learning_rate": 4.25e-05,
29
- "loss": 1.0549,
30
  "step": 1500
31
  },
32
  {
33
- "epoch": 2.23,
34
- "grad_norm": 3.1064469814300537,
35
- "learning_rate": 4e-05,
36
- "loss": 0.9527,
37
  "step": 2000
38
  },
39
  {
40
- "epoch": 2.79,
41
- "grad_norm": 2.8415098190307617,
42
- "learning_rate": 3.7500000000000003e-05,
43
- "loss": 0.8717,
44
  "step": 2500
45
  },
46
  {
47
- "epoch": 3.34,
48
- "grad_norm": 2.4496607780456543,
49
- "learning_rate": 3.5e-05,
50
- "loss": 0.783,
51
  "step": 3000
52
  },
53
  {
54
- "epoch": 3.9,
55
- "grad_norm": 2.9042794704437256,
56
- "learning_rate": 3.2500000000000004e-05,
57
- "loss": 0.7372,
58
  "step": 3500
59
  },
60
  {
61
- "epoch": 4.46,
62
- "grad_norm": 2.628190517425537,
63
- "learning_rate": 3e-05,
64
- "loss": 0.6509,
65
  "step": 4000
66
  },
67
  {
68
- "epoch": 5.02,
69
- "grad_norm": 2.9309325218200684,
70
- "learning_rate": 2.7500000000000004e-05,
71
- "loss": 0.6398,
72
  "step": 4500
73
  },
74
  {
75
- "epoch": 5.57,
76
- "grad_norm": 2.357445240020752,
77
- "learning_rate": 2.5e-05,
78
- "loss": 0.56,
79
  "step": 5000
80
  },
81
  {
82
- "epoch": 6.13,
83
- "grad_norm": 2.1420226097106934,
84
- "learning_rate": 2.25e-05,
85
- "loss": 0.5543,
86
  "step": 5500
87
  },
88
  {
89
- "epoch": 6.69,
90
- "grad_norm": 2.6394078731536865,
91
- "learning_rate": 2e-05,
92
- "loss": 0.5104,
93
  "step": 6000
94
  },
95
  {
96
- "epoch": 7.25,
97
- "grad_norm": 2.395305633544922,
98
- "learning_rate": 1.75e-05,
99
- "loss": 0.4832,
100
  "step": 6500
101
  },
102
  {
103
- "epoch": 7.8,
104
- "grad_norm": 2.048461675643921,
105
- "learning_rate": 1.5e-05,
106
- "loss": 0.4705,
107
  "step": 7000
108
  },
109
  {
110
- "epoch": 8.36,
111
- "grad_norm": 2.379493236541748,
112
- "learning_rate": 1.25e-05,
113
- "loss": 0.442,
114
  "step": 7500
115
  },
116
  {
117
- "epoch": 8.92,
118
- "grad_norm": 2.170802116394043,
119
- "learning_rate": 1e-05,
120
- "loss": 0.4264,
121
  "step": 8000
122
  },
123
  {
124
- "epoch": 9.48,
125
- "grad_norm": 2.277756690979004,
126
- "learning_rate": 7.5e-06,
127
- "loss": 0.405,
128
  "step": 8500
129
  },
130
  {
131
- "epoch": 10.03,
132
- "grad_norm": 2.3412973880767822,
133
- "learning_rate": 5e-06,
134
- "loss": 0.4069,
135
  "step": 9000
136
  },
137
  {
138
- "epoch": 10.59,
139
- "grad_norm": 2.220499038696289,
140
- "learning_rate": 2.5e-06,
141
- "loss": 0.3937,
142
  "step": 9500
143
  },
144
  {
145
- "epoch": 11.15,
146
- "grad_norm": 2.1124320030212402,
147
- "learning_rate": 0.0,
148
- "loss": 0.3872,
149
  "step": 10000
150
  },
151
  {
152
- "epoch": 11.15,
153
- "step": 10000,
154
- "total_flos": 5225840640000000.0,
155
- "train_loss": 0.6717062408447265,
156
- "train_runtime": 2130.1373,
157
- "train_samples_per_second": 4.695,
158
- "train_steps_per_second": 4.695
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  }
160
  ],
161
  "logging_steps": 500,
162
- "max_steps": 10000,
163
  "num_input_tokens_seen": 0,
164
- "num_train_epochs": 12,
165
  "save_steps": 1000,
166
- "total_flos": 5225840640000000.0,
167
  "train_batch_size": 1,
168
  "trial_name": null,
169
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 10.0,
5
  "eval_steps": 500,
6
+ "global_step": 35190,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.14,
13
+ "grad_norm": 2.325218677520752,
14
+ "learning_rate": 4.92895709008241e-05,
15
+ "loss": 0.8164,
16
  "step": 500
17
  },
18
  {
19
+ "epoch": 0.28,
20
+ "grad_norm": 2.017394781112671,
21
+ "learning_rate": 4.85791418016482e-05,
22
+ "loss": 0.6693,
23
  "step": 1000
24
  },
25
  {
26
+ "epoch": 0.43,
27
+ "grad_norm": 1.5872315168380737,
28
+ "learning_rate": 4.78687127024723e-05,
29
+ "loss": 0.6128,
30
  "step": 1500
31
  },
32
  {
33
+ "epoch": 0.57,
34
+ "grad_norm": 1.8339617252349854,
35
+ "learning_rate": 4.71582836032964e-05,
36
+ "loss": 0.5742,
37
  "step": 2000
38
  },
39
  {
40
+ "epoch": 0.71,
41
+ "grad_norm": 1.8466053009033203,
42
+ "learning_rate": 4.644785450412049e-05,
43
+ "loss": 0.5523,
44
  "step": 2500
45
  },
46
  {
47
+ "epoch": 0.85,
48
+ "grad_norm": 1.4517784118652344,
49
+ "learning_rate": 4.573742540494459e-05,
50
+ "loss": 0.5346,
51
  "step": 3000
52
  },
53
  {
54
+ "epoch": 0.99,
55
+ "grad_norm": 1.3925143480300903,
56
+ "learning_rate": 4.5026996305768686e-05,
57
+ "loss": 0.5028,
58
  "step": 3500
59
  },
60
  {
61
+ "epoch": 1.14,
62
+ "grad_norm": 1.220083236694336,
63
+ "learning_rate": 4.4316567206592784e-05,
64
+ "loss": 0.4355,
65
  "step": 4000
66
  },
67
  {
68
+ "epoch": 1.28,
69
+ "grad_norm": 1.5840811729431152,
70
+ "learning_rate": 4.360613810741688e-05,
71
+ "loss": 0.4228,
72
  "step": 4500
73
  },
74
  {
75
+ "epoch": 1.42,
76
+ "grad_norm": 1.6422966718673706,
77
+ "learning_rate": 4.289570900824098e-05,
78
+ "loss": 0.4009,
79
  "step": 5000
80
  },
81
  {
82
+ "epoch": 1.56,
83
+ "grad_norm": 1.7338138818740845,
84
+ "learning_rate": 4.218527990906508e-05,
85
+ "loss": 0.3916,
86
  "step": 5500
87
  },
88
  {
89
+ "epoch": 1.71,
90
+ "grad_norm": 1.0937657356262207,
91
+ "learning_rate": 4.147485080988918e-05,
92
+ "loss": 0.3851,
93
  "step": 6000
94
  },
95
  {
96
+ "epoch": 1.85,
97
+ "grad_norm": 1.1090480089187622,
98
+ "learning_rate": 4.076442171071328e-05,
99
+ "loss": 0.3672,
100
  "step": 6500
101
  },
102
  {
103
+ "epoch": 1.99,
104
+ "grad_norm": 1.5349233150482178,
105
+ "learning_rate": 4.005399261153737e-05,
106
+ "loss": 0.3659,
107
  "step": 7000
108
  },
109
  {
110
+ "epoch": 2.13,
111
+ "grad_norm": 1.6046255826950073,
112
+ "learning_rate": 3.934356351236147e-05,
113
+ "loss": 0.3117,
114
  "step": 7500
115
  },
116
  {
117
+ "epoch": 2.27,
118
+ "grad_norm": 1.3127567768096924,
119
+ "learning_rate": 3.8633134413185566e-05,
120
+ "loss": 0.3043,
121
  "step": 8000
122
  },
123
  {
124
+ "epoch": 2.42,
125
+ "grad_norm": 1.3286867141723633,
126
+ "learning_rate": 3.7922705314009665e-05,
127
+ "loss": 0.3029,
128
  "step": 8500
129
  },
130
  {
131
+ "epoch": 2.56,
132
+ "grad_norm": 0.8038182258605957,
133
+ "learning_rate": 3.721227621483376e-05,
134
+ "loss": 0.2934,
135
  "step": 9000
136
  },
137
  {
138
+ "epoch": 2.7,
139
+ "grad_norm": 1.097979187965393,
140
+ "learning_rate": 3.650184711565786e-05,
141
+ "loss": 0.2822,
142
  "step": 9500
143
  },
144
  {
145
+ "epoch": 2.84,
146
+ "grad_norm": 1.42654287815094,
147
+ "learning_rate": 3.579141801648196e-05,
148
+ "loss": 0.2768,
149
  "step": 10000
150
  },
151
  {
152
+ "epoch": 2.98,
153
+ "grad_norm": 1.3376920223236084,
154
+ "learning_rate": 3.508098891730606e-05,
155
+ "loss": 0.2733,
156
+ "step": 10500
157
+ },
158
+ {
159
+ "epoch": 3.13,
160
+ "grad_norm": 1.3901501893997192,
161
+ "learning_rate": 3.437055981813016e-05,
162
+ "loss": 0.2358,
163
+ "step": 11000
164
+ },
165
+ {
166
+ "epoch": 3.27,
167
+ "grad_norm": 0.9437574148178101,
168
+ "learning_rate": 3.366013071895425e-05,
169
+ "loss": 0.2347,
170
+ "step": 11500
171
+ },
172
+ {
173
+ "epoch": 3.41,
174
+ "grad_norm": 1.7849817276000977,
175
+ "learning_rate": 3.294970161977835e-05,
176
+ "loss": 0.2282,
177
+ "step": 12000
178
+ },
179
+ {
180
+ "epoch": 3.55,
181
+ "grad_norm": 1.3517547845840454,
182
+ "learning_rate": 3.2239272520602446e-05,
183
+ "loss": 0.2196,
184
+ "step": 12500
185
+ },
186
+ {
187
+ "epoch": 3.69,
188
+ "grad_norm": 1.3112218379974365,
189
+ "learning_rate": 3.152884342142654e-05,
190
+ "loss": 0.2196,
191
+ "step": 13000
192
+ },
193
+ {
194
+ "epoch": 3.84,
195
+ "grad_norm": 1.532402515411377,
196
+ "learning_rate": 3.081841432225064e-05,
197
+ "loss": 0.222,
198
+ "step": 13500
199
+ },
200
+ {
201
+ "epoch": 3.98,
202
+ "grad_norm": 1.0918519496917725,
203
+ "learning_rate": 3.0107985223074735e-05,
204
+ "loss": 0.217,
205
+ "step": 14000
206
+ },
207
+ {
208
+ "epoch": 4.12,
209
+ "grad_norm": 1.1216357946395874,
210
+ "learning_rate": 2.9397556123898834e-05,
211
+ "loss": 0.1902,
212
+ "step": 14500
213
+ },
214
+ {
215
+ "epoch": 4.26,
216
+ "grad_norm": 0.9181265234947205,
217
+ "learning_rate": 2.8687127024722932e-05,
218
+ "loss": 0.1854,
219
+ "step": 15000
220
+ },
221
+ {
222
+ "epoch": 4.4,
223
+ "grad_norm": 1.3160160779953003,
224
+ "learning_rate": 2.797669792554703e-05,
225
+ "loss": 0.1784,
226
+ "step": 15500
227
+ },
228
+ {
229
+ "epoch": 4.55,
230
+ "grad_norm": 1.198168396949768,
231
+ "learning_rate": 2.7266268826371126e-05,
232
+ "loss": 0.1812,
233
+ "step": 16000
234
+ },
235
+ {
236
+ "epoch": 4.69,
237
+ "grad_norm": 1.515939474105835,
238
+ "learning_rate": 2.6555839727195225e-05,
239
+ "loss": 0.1824,
240
+ "step": 16500
241
+ },
242
+ {
243
+ "epoch": 4.83,
244
+ "grad_norm": 1.1988515853881836,
245
+ "learning_rate": 2.5845410628019323e-05,
246
+ "loss": 0.178,
247
+ "step": 17000
248
+ },
249
+ {
250
+ "epoch": 4.97,
251
+ "grad_norm": 0.8168569803237915,
252
+ "learning_rate": 2.5134981528843422e-05,
253
+ "loss": 0.1758,
254
+ "step": 17500
255
+ },
256
+ {
257
+ "epoch": 5.12,
258
+ "grad_norm": 1.039239764213562,
259
+ "learning_rate": 2.442455242966752e-05,
260
+ "loss": 0.1517,
261
+ "step": 18000
262
+ },
263
+ {
264
+ "epoch": 5.26,
265
+ "grad_norm": 0.9721378087997437,
266
+ "learning_rate": 2.371412333049162e-05,
267
+ "loss": 0.1452,
268
+ "step": 18500
269
+ },
270
+ {
271
+ "epoch": 5.4,
272
+ "grad_norm": 1.0437672138214111,
273
+ "learning_rate": 2.3003694231315714e-05,
274
+ "loss": 0.1513,
275
+ "step": 19000
276
+ },
277
+ {
278
+ "epoch": 5.54,
279
+ "grad_norm": 1.620819091796875,
280
+ "learning_rate": 2.2293265132139813e-05,
281
+ "loss": 0.1539,
282
+ "step": 19500
283
+ },
284
+ {
285
+ "epoch": 5.68,
286
+ "grad_norm": 1.2683171033859253,
287
+ "learning_rate": 2.158283603296391e-05,
288
+ "loss": 0.152,
289
+ "step": 20000
290
+ },
291
+ {
292
+ "epoch": 5.83,
293
+ "grad_norm": 1.0495145320892334,
294
+ "learning_rate": 2.0872406933788007e-05,
295
+ "loss": 0.1435,
296
+ "step": 20500
297
+ },
298
+ {
299
+ "epoch": 5.97,
300
+ "grad_norm": 1.5612547397613525,
301
+ "learning_rate": 2.0161977834612105e-05,
302
+ "loss": 0.1471,
303
+ "step": 21000
304
+ },
305
+ {
306
+ "epoch": 6.11,
307
+ "grad_norm": 1.3628000020980835,
308
+ "learning_rate": 1.9451548735436204e-05,
309
+ "loss": 0.1349,
310
+ "step": 21500
311
+ },
312
+ {
313
+ "epoch": 6.25,
314
+ "grad_norm": 0.9959810376167297,
315
+ "learning_rate": 1.8741119636260302e-05,
316
+ "loss": 0.1229,
317
+ "step": 22000
318
+ },
319
+ {
320
+ "epoch": 6.39,
321
+ "grad_norm": 0.9821630120277405,
322
+ "learning_rate": 1.80306905370844e-05,
323
+ "loss": 0.1288,
324
+ "step": 22500
325
+ },
326
+ {
327
+ "epoch": 6.54,
328
+ "grad_norm": 1.5605982542037964,
329
+ "learning_rate": 1.7320261437908496e-05,
330
+ "loss": 0.1285,
331
+ "step": 23000
332
+ },
333
+ {
334
+ "epoch": 6.68,
335
+ "grad_norm": 1.138551115989685,
336
+ "learning_rate": 1.6609832338732595e-05,
337
+ "loss": 0.128,
338
+ "step": 23500
339
+ },
340
+ {
341
+ "epoch": 6.82,
342
+ "grad_norm": 0.9666039347648621,
343
+ "learning_rate": 1.5899403239556693e-05,
344
+ "loss": 0.127,
345
+ "step": 24000
346
+ },
347
+ {
348
+ "epoch": 6.96,
349
+ "grad_norm": 0.7725051641464233,
350
+ "learning_rate": 1.518897414038079e-05,
351
+ "loss": 0.1268,
352
+ "step": 24500
353
+ },
354
+ {
355
+ "epoch": 7.1,
356
+ "grad_norm": 0.8455806970596313,
357
+ "learning_rate": 1.4478545041204889e-05,
358
+ "loss": 0.1077,
359
+ "step": 25000
360
+ },
361
+ {
362
+ "epoch": 7.25,
363
+ "grad_norm": 0.9111607074737549,
364
+ "learning_rate": 1.3768115942028985e-05,
365
+ "loss": 0.1129,
366
+ "step": 25500
367
+ },
368
+ {
369
+ "epoch": 7.39,
370
+ "grad_norm": 1.432874083518982,
371
+ "learning_rate": 1.3057686842853084e-05,
372
+ "loss": 0.1105,
373
+ "step": 26000
374
+ },
375
+ {
376
+ "epoch": 7.53,
377
+ "grad_norm": 1.1109156608581543,
378
+ "learning_rate": 1.2347257743677183e-05,
379
+ "loss": 0.1151,
380
+ "step": 26500
381
+ },
382
+ {
383
+ "epoch": 7.67,
384
+ "grad_norm": 0.8510033488273621,
385
+ "learning_rate": 1.163682864450128e-05,
386
+ "loss": 0.1113,
387
+ "step": 27000
388
+ },
389
+ {
390
+ "epoch": 7.81,
391
+ "grad_norm": 0.6120481491088867,
392
+ "learning_rate": 1.0926399545325378e-05,
393
+ "loss": 0.1116,
394
+ "step": 27500
395
+ },
396
+ {
397
+ "epoch": 7.96,
398
+ "grad_norm": 0.8336113691329956,
399
+ "learning_rate": 1.0215970446149475e-05,
400
+ "loss": 0.112,
401
+ "step": 28000
402
+ },
403
+ {
404
+ "epoch": 8.1,
405
+ "grad_norm": 1.1527196168899536,
406
+ "learning_rate": 9.505541346973572e-06,
407
+ "loss": 0.0994,
408
+ "step": 28500
409
+ },
410
+ {
411
+ "epoch": 8.24,
412
+ "grad_norm": 0.9409528970718384,
413
+ "learning_rate": 8.79511224779767e-06,
414
+ "loss": 0.0999,
415
+ "step": 29000
416
+ },
417
+ {
418
+ "epoch": 8.38,
419
+ "grad_norm": 1.298326849937439,
420
+ "learning_rate": 8.084683148621767e-06,
421
+ "loss": 0.1007,
422
+ "step": 29500
423
+ },
424
+ {
425
+ "epoch": 8.53,
426
+ "grad_norm": 1.2974679470062256,
427
+ "learning_rate": 7.374254049445865e-06,
428
+ "loss": 0.1007,
429
+ "step": 30000
430
+ },
431
+ {
432
+ "epoch": 8.67,
433
+ "grad_norm": 1.1797147989273071,
434
+ "learning_rate": 6.6638249502699635e-06,
435
+ "loss": 0.1012,
436
+ "step": 30500
437
+ },
438
+ {
439
+ "epoch": 8.81,
440
+ "grad_norm": 1.3059791326522827,
441
+ "learning_rate": 5.953395851094061e-06,
442
+ "loss": 0.0987,
443
+ "step": 31000
444
+ },
445
+ {
446
+ "epoch": 8.95,
447
+ "grad_norm": 0.8831413388252258,
448
+ "learning_rate": 5.242966751918159e-06,
449
+ "loss": 0.0988,
450
+ "step": 31500
451
+ },
452
+ {
453
+ "epoch": 9.09,
454
+ "grad_norm": 0.7082082629203796,
455
+ "learning_rate": 4.532537652742257e-06,
456
+ "loss": 0.0961,
457
+ "step": 32000
458
+ },
459
+ {
460
+ "epoch": 9.24,
461
+ "grad_norm": 0.956742525100708,
462
+ "learning_rate": 3.822108553566354e-06,
463
+ "loss": 0.0911,
464
+ "step": 32500
465
+ },
466
+ {
467
+ "epoch": 9.38,
468
+ "grad_norm": 0.6049565672874451,
469
+ "learning_rate": 3.111679454390452e-06,
470
+ "loss": 0.0907,
471
+ "step": 33000
472
+ },
473
+ {
474
+ "epoch": 9.52,
475
+ "grad_norm": 1.5750799179077148,
476
+ "learning_rate": 2.40125035521455e-06,
477
+ "loss": 0.0957,
478
+ "step": 33500
479
+ },
480
+ {
481
+ "epoch": 9.66,
482
+ "grad_norm": 1.0990025997161865,
483
+ "learning_rate": 1.6908212560386474e-06,
484
+ "loss": 0.0913,
485
+ "step": 34000
486
+ },
487
+ {
488
+ "epoch": 9.8,
489
+ "grad_norm": 1.086700677871704,
490
+ "learning_rate": 9.80392156862745e-07,
491
+ "loss": 0.0937,
492
+ "step": 34500
493
+ },
494
+ {
495
+ "epoch": 9.95,
496
+ "grad_norm": 0.9267112612724304,
497
+ "learning_rate": 2.6996305768684286e-07,
498
+ "loss": 0.0941,
499
+ "step": 35000
500
+ },
501
+ {
502
+ "epoch": 10.0,
503
+ "step": 35190,
504
+ "total_flos": 1.838973321216e+16,
505
+ "train_loss": 0.22781734634577194,
506
+ "train_runtime": 7496.2711,
507
+ "train_samples_per_second": 4.694,
508
+ "train_steps_per_second": 4.694
509
  }
510
  ],
511
  "logging_steps": 500,
512
+ "max_steps": 35190,
513
  "num_input_tokens_seen": 0,
514
+ "num_train_epochs": 10,
515
  "save_steps": 1000,
516
+ "total_flos": 1.838973321216e+16,
517
  "train_batch_size": 1,
518
  "trial_name": null,
519
  "trial_params": null