{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 25,
  "global_step": 41,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04878048780487805,
      "grad_norm": 0.052720945328474045,
      "learning_rate": 2.9999999999999997e-05,
      "loss": 11.9203,
      "step": 1
    },
    {
      "epoch": 0.04878048780487805,
      "eval_loss": 11.916321754455566,
      "eval_runtime": 0.0625,
      "eval_samples_per_second": 800.599,
      "eval_steps_per_second": 32.024,
      "step": 1
    },
    {
      "epoch": 0.0975609756097561,
      "grad_norm": 0.05649564787745476,
      "learning_rate": 5.9999999999999995e-05,
      "loss": 11.9118,
      "step": 2
    },
    {
      "epoch": 0.14634146341463414,
      "grad_norm": 0.06007275730371475,
      "learning_rate": 8.999999999999999e-05,
      "loss": 11.91,
      "step": 3
    },
    {
      "epoch": 0.1951219512195122,
      "grad_norm": 0.06864983588457108,
      "learning_rate": 0.00011999999999999999,
      "loss": 11.908,
      "step": 4
    },
    {
      "epoch": 0.24390243902439024,
      "grad_norm": 0.13928121328353882,
      "learning_rate": 0.00015,
      "loss": 11.888,
      "step": 5
    },
    {
      "epoch": 0.2926829268292683,
      "grad_norm": 0.050554897636175156,
      "learning_rate": 0.00017999999999999998,
      "loss": 11.9175,
      "step": 6
    },
    {
      "epoch": 0.34146341463414637,
      "grad_norm": 0.051585853099823,
      "learning_rate": 0.00020999999999999998,
      "loss": 11.9123,
      "step": 7
    },
    {
      "epoch": 0.3902439024390244,
      "grad_norm": 0.06403446942567825,
      "learning_rate": 0.00023999999999999998,
      "loss": 11.9104,
      "step": 8
    },
    {
      "epoch": 0.43902439024390244,
      "grad_norm": 0.07250206172466278,
      "learning_rate": 0.00027,
      "loss": 11.9101,
      "step": 9
    },
    {
      "epoch": 0.4878048780487805,
      "grad_norm": 0.12581613659858704,
      "learning_rate": 0.0003,
      "loss": 11.8811,
      "step": 10
    },
    {
      "epoch": 0.5365853658536586,
      "grad_norm": 0.051407769322395325,
      "learning_rate": 0.00029923039850878423,
      "loss": 11.9158,
      "step": 11
    },
    {
      "epoch": 0.5853658536585366,
      "grad_norm": 0.053157199174165726,
      "learning_rate": 0.00029692949118787415,
      "loss": 11.9121,
      "step": 12
    },
    {
      "epoch": 0.6341463414634146,
      "grad_norm": 0.064723901450634,
      "learning_rate": 0.0002931208884600073,
      "loss": 11.9034,
      "step": 13
    },
    {
      "epoch": 0.6829268292682927,
      "grad_norm": 0.07741189002990723,
      "learning_rate": 0.00028784367174303454,
      "loss": 11.8984,
      "step": 14
    },
    {
      "epoch": 0.7317073170731707,
      "grad_norm": 0.15558002889156342,
      "learning_rate": 0.0002811519924216873,
      "loss": 11.8842,
      "step": 15
    },
    {
      "epoch": 0.7804878048780488,
      "grad_norm": 0.0634838119149208,
      "learning_rate": 0.00027311451618109144,
      "loss": 11.914,
      "step": 16
    },
    {
      "epoch": 0.8292682926829268,
      "grad_norm": 0.07950323820114136,
      "learning_rate": 0.0002638137184039186,
      "loss": 11.9015,
      "step": 17
    },
    {
      "epoch": 0.8780487804878049,
      "grad_norm": 0.08885859698057175,
      "learning_rate": 0.000253345037861353,
      "loss": 11.9076,
      "step": 18
    },
    {
      "epoch": 0.926829268292683,
      "grad_norm": 0.10019578784704208,
      "learning_rate": 0.00024181589738214943,
      "loss": 11.8991,
      "step": 19
    },
    {
      "epoch": 0.975609756097561,
      "grad_norm": 0.17562973499298096,
      "learning_rate": 0.00022934460154904433,
      "loss": 11.8815,
      "step": 20
    },
    {
      "epoch": 1.024390243902439,
      "grad_norm": 0.1393955796957016,
      "learning_rate": 0.00021605912273364513,
      "loss": 17.8661,
      "step": 21
    },
    {
      "epoch": 1.0731707317073171,
      "grad_norm": 0.08931300789117813,
      "learning_rate": 0.000202095787926723,
      "loss": 11.9524,
      "step": 22
    },
    {
      "epoch": 1.1219512195121952,
      "grad_norm": 0.10830606520175934,
      "learning_rate": 0.00018759787983880805,
      "loss": 11.8534,
      "step": 23
    },
    {
      "epoch": 1.170731707317073,
      "grad_norm": 0.11832421272993088,
      "learning_rate": 0.0001727141666256865,
      "loss": 11.853,
      "step": 24
    },
    {
      "epoch": 1.2195121951219512,
      "grad_norm": 0.15067139267921448,
      "learning_rate": 0.0001575973753258069,
      "loss": 11.9391,
      "step": 25
    },
    {
      "epoch": 1.2195121951219512,
      "eval_loss": 11.908763885498047,
      "eval_runtime": 0.0607,
      "eval_samples_per_second": 824.1,
      "eval_steps_per_second": 32.964,
      "step": 25
    },
    {
      "epoch": 1.2682926829268293,
      "grad_norm": 0.16227374970912933,
      "learning_rate": 0.0001424026246741931,
      "loss": 11.866,
      "step": 26
    },
    {
      "epoch": 1.3170731707317074,
      "grad_norm": 0.08508593589067459,
      "learning_rate": 0.00012728583337431353,
      "loss": 11.9516,
      "step": 27
    },
    {
      "epoch": 1.3658536585365852,
      "grad_norm": 0.11364945769309998,
      "learning_rate": 0.0001124021201611919,
      "loss": 11.873,
      "step": 28
    },
    {
      "epoch": 1.4146341463414633,
      "grad_norm": 0.1271531879901886,
      "learning_rate": 9.790421207327697e-05,
      "loss": 11.8994,
      "step": 29
    },
    {
      "epoch": 1.4634146341463414,
      "grad_norm": 0.1775551587343216,
      "learning_rate": 8.394087726635483e-05,
      "loss": 11.8827,
      "step": 30
    },
    {
      "epoch": 1.5121951219512195,
      "grad_norm": 0.19221873581409454,
      "learning_rate": 7.065539845095567e-05,
      "loss": 11.9097,
      "step": 31
    },
    {
      "epoch": 1.5609756097560976,
      "grad_norm": 0.10446513444185257,
      "learning_rate": 5.818410261785056e-05,
      "loss": 11.8609,
      "step": 32
    },
    {
      "epoch": 1.6097560975609757,
      "grad_norm": 0.11987953633069992,
      "learning_rate": 4.6654962138647004e-05,
      "loss": 11.946,
      "step": 33
    },
    {
      "epoch": 1.6585365853658538,
      "grad_norm": 0.13893361389636993,
      "learning_rate": 3.618628159608137e-05,
      "loss": 11.8953,
      "step": 34
    },
    {
      "epoch": 1.7073170731707317,
      "grad_norm": 0.16340382397174835,
      "learning_rate": 2.6885483818908586e-05,
      "loss": 11.8647,
      "step": 35
    },
    {
      "epoch": 1.7560975609756098,
      "grad_norm": 0.18076759576797485,
      "learning_rate": 1.8848007578312686e-05,
      "loss": 11.8385,
      "step": 36
    },
    {
      "epoch": 1.8048780487804879,
      "grad_norm": 0.12096209079027176,
      "learning_rate": 1.215632825696541e-05,
      "loss": 11.9756,
      "step": 37
    },
    {
      "epoch": 1.8536585365853657,
      "grad_norm": 0.12150277197360992,
      "learning_rate": 6.879111539992676e-06,
      "loss": 11.852,
      "step": 38
    },
    {
      "epoch": 1.9024390243902438,
      "grad_norm": 0.12834501266479492,
      "learning_rate": 3.070508812125827e-06,
      "loss": 11.9213,
      "step": 39
    },
    {
      "epoch": 1.951219512195122,
      "grad_norm": 0.1679176688194275,
      "learning_rate": 7.696014912157267e-07,
      "loss": 11.8859,
      "step": 40
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.23847155272960663,
      "learning_rate": 0.0,
      "loss": 17.8301,
      "step": 41
    }
  ],
  "logging_steps": 1,
  "max_steps": 41,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 50,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 1,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 261190189056.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}