{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.3657457189463597,
  "eval_steps": 500,
  "global_step": 50000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 7.314914378927195e-06,
      "grad_norm": 2702.876220703125,
      "learning_rate": 3.6573769292663306e-10,
      "loss": 185.8854,
      "step": 1
    },
    {
      "epoch": 0.007314914378927195,
      "grad_norm": 25910.705078125,
      "learning_rate": 3.6573769292663306e-07,
      "loss": 48.434,
      "step": 1000
    },
    {
      "epoch": 0.01462982875785439,
      "grad_norm": 3.9086620807647705,
      "learning_rate": 7.314753858532661e-07,
      "loss": 17.9804,
      "step": 2000
    },
    {
      "epoch": 0.021944743136781585,
      "grad_norm": 17.556304931640625,
      "learning_rate": 1.097213078779899e-06,
      "loss": 10.5262,
      "step": 3000
    },
    {
      "epoch": 0.02925965751570878,
      "grad_norm": 4689.71240234375,
      "learning_rate": 1.4629507717065323e-06,
      "loss": 11.6651,
      "step": 4000
    },
    {
      "epoch": 0.03657457189463598,
      "grad_norm": 0.0065016308799386024,
      "learning_rate": 1.8286884646331652e-06,
      "loss": 9.4746,
      "step": 5000
    },
    {
      "epoch": 0.04388948627356317,
      "grad_norm": 0.060599055141210556,
      "learning_rate": 2.194426157559798e-06,
      "loss": 7.1303,
      "step": 6000
    },
    {
      "epoch": 0.051204400652490364,
      "grad_norm": 1028.9453125,
      "learning_rate": 2.560163850486431e-06,
      "loss": 8.917,
      "step": 7000
    },
    {
      "epoch": 0.05851931503141756,
      "grad_norm": 5.463726043701172,
      "learning_rate": 2.9259015434130645e-06,
      "loss": 8.9339,
      "step": 8000
    },
    {
      "epoch": 0.06583422941034475,
      "grad_norm": 0.3307730257511139,
      "learning_rate": 3.2916392363396975e-06,
      "loss": 8.746,
      "step": 9000
    },
    {
      "epoch": 0.07314914378927195,
      "grad_norm": 0.003347629914060235,
      "learning_rate": 3.6573769292663304e-06,
      "loss": 9.4711,
      "step": 10000
    },
    {
      "epoch": 0.08046405816819914,
      "grad_norm": 0.006155295763164759,
      "learning_rate": 4.023114622192964e-06,
      "loss": 7.647,
      "step": 11000
    },
    {
      "epoch": 0.08777897254712634,
      "grad_norm": 0.0019895241130143404,
      "learning_rate": 4.388852315119596e-06,
      "loss": 5.7327,
      "step": 12000
    },
    {
      "epoch": 0.09509388692605353,
      "grad_norm": 0.004527593031525612,
      "learning_rate": 4.75459000804623e-06,
      "loss": 5.5751,
      "step": 13000
    },
    {
      "epoch": 0.10240880130498073,
      "grad_norm": 0.009128883481025696,
      "learning_rate": 4.986629929451543e-06,
      "loss": 8.6148,
      "step": 14000
    },
    {
      "epoch": 0.10972371568390792,
      "grad_norm": 0.012683026492595673,
      "learning_rate": 4.9459914171462015e-06,
      "loss": 8.5558,
      "step": 15000
    },
    {
      "epoch": 0.11703863006283512,
      "grad_norm": 0.0028167981654405594,
      "learning_rate": 4.90535290484086e-06,
      "loss": 6.0433,
      "step": 16000
    },
    {
      "epoch": 0.1243535444417623,
      "grad_norm": 0.0011900264071300626,
      "learning_rate": 4.864714392535519e-06,
      "loss": 6.9084,
      "step": 17000
    },
    {
      "epoch": 0.1316684588206895,
      "grad_norm": 0.025524910539388657,
      "learning_rate": 4.824075880230177e-06,
      "loss": 6.7333,
      "step": 18000
    },
    {
      "epoch": 0.1389833731996167,
      "grad_norm": 0.027619725093245506,
      "learning_rate": 4.783437367924835e-06,
      "loss": 4.1436,
      "step": 19000
    },
    {
      "epoch": 0.1462982875785439,
      "grad_norm": 0.012698939070105553,
      "learning_rate": 4.742798855619494e-06,
      "loss": 5.397,
      "step": 20000
    },
    {
      "epoch": 0.15361320195747108,
      "grad_norm": 0.031142177060246468,
      "learning_rate": 4.702160343314152e-06,
      "loss": 5.5156,
      "step": 21000
    },
    {
      "epoch": 0.16092811633639828,
      "grad_norm": 0.0004189134924672544,
      "learning_rate": 4.66152183100881e-06,
      "loss": 4.8633,
      "step": 22000
    },
    {
      "epoch": 0.16824303071532548,
      "grad_norm": 0.0103899035602808,
      "learning_rate": 4.620883318703469e-06,
      "loss": 7.2146,
      "step": 23000
    },
    {
      "epoch": 0.17555794509425268,
      "grad_norm": 1158.8017578125,
      "learning_rate": 4.580244806398127e-06,
      "loss": 5.7667,
      "step": 24000
    },
    {
      "epoch": 0.18287285947317986,
      "grad_norm": 0.03399639576673508,
      "learning_rate": 4.5396062940927856e-06,
      "loss": 5.0472,
      "step": 25000
    },
    {
      "epoch": 0.19018777385210706,
      "grad_norm": 0.017644532024860382,
      "learning_rate": 4.4989677817874446e-06,
      "loss": 4.8188,
      "step": 26000
    },
    {
      "epoch": 0.19750268823103426,
      "grad_norm": 0.00010079160711029544,
      "learning_rate": 4.4583292694821035e-06,
      "loss": 5.7598,
      "step": 27000
    },
    {
      "epoch": 0.20481760260996146,
      "grad_norm": 0.009618501178920269,
      "learning_rate": 4.417690757176762e-06,
      "loss": 4.683,
      "step": 28000
    },
    {
      "epoch": 0.21213251698888866,
      "grad_norm": 0.018396975472569466,
      "learning_rate": 4.377052244871421e-06,
      "loss": 5.7816,
      "step": 29000
    },
    {
      "epoch": 0.21944743136781583,
      "grad_norm": 0.026549218222498894,
      "learning_rate": 4.336413732566079e-06,
      "loss": 5.5149,
      "step": 30000
    },
    {
      "epoch": 0.22676234574674303,
      "grad_norm": 0.01402178592979908,
      "learning_rate": 4.295775220260737e-06,
      "loss": 6.1021,
      "step": 31000
    },
    {
      "epoch": 0.23407726012567023,
      "grad_norm": 2950.190185546875,
      "learning_rate": 4.255136707955396e-06,
      "loss": 5.1742,
      "step": 32000
    },
    {
      "epoch": 0.24139217450459743,
      "grad_norm": 0.01243713591247797,
      "learning_rate": 4.214498195650054e-06,
      "loss": 4.8856,
      "step": 33000
    },
    {
      "epoch": 0.2487070888835246,
      "grad_norm": 0.00121857482008636,
      "learning_rate": 4.173859683344712e-06,
      "loss": 4.0296,
      "step": 34000
    },
    {
      "epoch": 0.25602200326245184,
      "grad_norm": 0.021528728306293488,
      "learning_rate": 4.133221171039371e-06,
      "loss": 3.7989,
      "step": 35000
    },
    {
      "epoch": 0.263336917641379,
      "grad_norm": 0.067794568836689,
      "learning_rate": 4.092582658734029e-06,
      "loss": 4.8373,
      "step": 36000
    },
    {
      "epoch": 0.2706518320203062,
      "grad_norm": 83.66200256347656,
      "learning_rate": 4.0519441464286876e-06,
      "loss": 3.2441,
      "step": 37000
    },
    {
      "epoch": 0.2779667463992334,
      "grad_norm": 0.00217541866004467,
      "learning_rate": 4.0113056341233466e-06,
      "loss": 3.5578,
      "step": 38000
    },
    {
      "epoch": 0.2852816607781606,
      "grad_norm": 0.0008728219545446336,
      "learning_rate": 3.970667121818005e-06,
      "loss": 2.6644,
      "step": 39000
    },
    {
      "epoch": 0.2925965751570878,
      "grad_norm": 0.027021408081054688,
      "learning_rate": 3.930028609512664e-06,
      "loss": 3.7778,
      "step": 40000
    },
    {
      "epoch": 0.299911489536015,
      "grad_norm": 0.001312136766500771,
      "learning_rate": 3.889390097207322e-06,
      "loss": 4.2509,
      "step": 41000
    },
    {
      "epoch": 0.30722640391494216,
      "grad_norm": 0.011145360767841339,
      "learning_rate": 3.84875158490198e-06,
      "loss": 4.6084,
      "step": 42000
    },
    {
      "epoch": 0.3145413182938694,
      "grad_norm": 289.0566711425781,
      "learning_rate": 3.8081130725966386e-06,
      "loss": 3.3176,
      "step": 43000
    },
    {
      "epoch": 0.32185623267279656,
      "grad_norm": 0.006871068850159645,
      "learning_rate": 3.767474560291297e-06,
      "loss": 3.0108,
      "step": 44000
    },
    {
      "epoch": 0.32917114705172373,
      "grad_norm": 0.002771923318505287,
      "learning_rate": 3.7268360479859557e-06,
      "loss": 3.5366,
      "step": 45000
    },
    {
      "epoch": 0.33648606143065096,
      "grad_norm": 0.0030192858539521694,
      "learning_rate": 3.686197535680614e-06,
      "loss": 3.3997,
      "step": 46000
    },
    {
      "epoch": 0.34380097580957814,
      "grad_norm": 3926.616943359375,
      "learning_rate": 3.6455590233752724e-06,
      "loss": 3.5042,
      "step": 47000
    },
    {
      "epoch": 0.35111589018850536,
      "grad_norm": 0.000544128124602139,
      "learning_rate": 3.604920511069931e-06,
      "loss": 2.9953,
      "step": 48000
    },
    {
      "epoch": 0.35843080456743254,
      "grad_norm": 14144.9677734375,
      "learning_rate": 3.564281998764589e-06,
      "loss": 2.9742,
      "step": 49000
    },
    {
      "epoch": 0.3657457189463597,
      "grad_norm": 0.0021343908738344908,
      "learning_rate": 3.5236434864592477e-06,
      "loss": 3.4145,
      "step": 50000
    }
  ],
  "logging_steps": 1000,
  "max_steps": 136707,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 50000,
  "total_flos": 0.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}