codestylist committed on
Commit
9b5f408
1 Parent(s): 036a5cf

Upload trainer_state.json

Browse files
Files changed (1) hide show
  1. trainer_state.json +537 -101
trainer_state.json CHANGED
@@ -1,184 +1,620 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 3.9087947882736156,
5
- "global_step": 12000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
- "epoch": 0.16,
12
- "learning_rate": 0.0019185667752442998,
13
- "loss": 2.9941,
14
  "step": 500
15
  },
16
  {
17
- "epoch": 0.33,
18
- "learning_rate": 0.0018371335504885993,
19
- "loss": 1.5024,
20
  "step": 1000
21
  },
22
  {
23
- "epoch": 0.49,
24
- "learning_rate": 0.001755700325732899,
25
- "loss": 0.0902,
26
  "step": 1500
27
  },
28
  {
29
- "epoch": 0.65,
30
- "learning_rate": 0.0016742671009771987,
31
- "loss": 0.0504,
32
  "step": 2000
33
  },
34
  {
35
- "epoch": 0.81,
36
- "learning_rate": 0.0015928338762214984,
37
- "loss": 0.0412,
38
  "step": 2500
39
  },
40
  {
41
- "epoch": 0.98,
42
- "learning_rate": 0.0015114006514657982,
43
- "loss": 0.0373,
44
  "step": 3000
45
  },
46
  {
47
- "epoch": 1.0,
48
- "eval_loss": 0.026404375210404396,
49
- "eval_runtime": 68.798,
50
- "eval_samples_per_second": 178.45,
51
- "eval_steps_per_second": 11.163,
52
- "step": 3070
53
- },
54
- {
55
- "epoch": 1.14,
56
- "learning_rate": 0.0014299674267100977,
57
- "loss": 0.0346,
58
  "step": 3500
59
  },
60
  {
61
- "epoch": 1.3,
62
- "learning_rate": 0.0013485342019543974,
63
- "loss": 0.0301,
64
  "step": 4000
65
  },
66
  {
67
- "epoch": 1.47,
68
- "learning_rate": 0.001267100977198697,
69
- "loss": 0.0323,
70
  "step": 4500
71
  },
72
  {
73
- "epoch": 1.63,
74
- "learning_rate": 0.0011856677524429969,
75
- "loss": 0.0305,
76
  "step": 5000
77
  },
78
  {
79
- "epoch": 1.79,
80
- "learning_rate": 0.0011042345276872966,
81
- "loss": 0.0287,
82
  "step": 5500
83
  },
84
  {
85
- "epoch": 1.95,
86
- "learning_rate": 0.001022801302931596,
87
- "loss": 0.0287,
88
  "step": 6000
89
  },
90
  {
91
- "epoch": 2.0,
92
- "eval_loss": 0.023602882400155067,
93
- "eval_runtime": 68.7368,
94
- "eval_samples_per_second": 178.609,
95
- "eval_steps_per_second": 11.173,
96
- "step": 6140
97
- },
98
- {
99
- "epoch": 2.12,
100
- "learning_rate": 0.0009413680781758957,
101
- "loss": 0.0261,
102
  "step": 6500
103
  },
104
  {
105
- "epoch": 2.28,
106
- "learning_rate": 0.0008599348534201955,
107
- "loss": 0.0255,
108
  "step": 7000
109
  },
110
  {
111
- "epoch": 2.44,
112
- "learning_rate": 0.0007785016286644952,
113
- "loss": 0.025,
114
  "step": 7500
115
  },
116
  {
117
- "epoch": 2.61,
118
- "learning_rate": 0.0006970684039087948,
119
- "loss": 0.0247,
120
  "step": 8000
121
  },
122
  {
123
- "epoch": 2.77,
124
- "learning_rate": 0.0006156351791530945,
125
- "loss": 0.023,
126
  "step": 8500
127
  },
128
  {
129
- "epoch": 2.93,
130
- "learning_rate": 0.0005342019543973941,
131
- "loss": 0.0218,
132
  "step": 9000
133
  },
134
  {
135
- "epoch": 3.0,
136
- "eval_loss": 0.02037286013364792,
137
- "eval_runtime": 68.7895,
138
- "eval_samples_per_second": 178.472,
139
- "eval_steps_per_second": 11.165,
140
- "step": 9210
141
- },
142
- {
143
- "epoch": 3.09,
144
- "learning_rate": 0.0004527687296416938,
145
- "loss": 0.0198,
146
  "step": 9500
147
  },
148
  {
149
- "epoch": 3.26,
150
- "learning_rate": 0.0003713355048859935,
151
- "loss": 0.0187,
152
  "step": 10000
153
  },
154
  {
155
- "epoch": 3.42,
156
- "learning_rate": 0.0002899022801302932,
157
- "loss": 0.0181,
158
  "step": 10500
159
  },
160
  {
161
- "epoch": 3.58,
162
- "learning_rate": 0.00020846905537459285,
163
- "loss": 0.0174,
164
  "step": 11000
165
  },
166
  {
167
- "epoch": 3.75,
168
- "learning_rate": 0.00012703583061889252,
169
- "loss": 0.0166,
170
  "step": 11500
171
  },
172
  {
173
- "epoch": 3.91,
174
- "learning_rate": 4.5602605863192184e-05,
175
- "loss": 0.0167,
176
  "step": 12000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  }
178
  ],
179
- "max_steps": 12280,
180
- "num_train_epochs": 4,
181
- "total_flos": 2.5979941526962176e+16,
182
  "trial_name": null,
183
  "trial_params": null
184
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.9982255399865387,
5
+ "global_step": 49000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
+ "epoch": 0.03,
12
+ "learning_rate": 0.001979603907891248,
13
+ "loss": 0.0528,
14
  "step": 500
15
  },
16
  {
17
+ "epoch": 0.06,
18
+ "learning_rate": 0.0019592078157824964,
19
+ "loss": 0.0543,
20
  "step": 1000
21
  },
22
  {
23
+ "epoch": 0.09,
24
+ "learning_rate": 0.0019388117236737441,
25
+ "loss": 0.0531,
26
  "step": 1500
27
  },
28
  {
29
+ "epoch": 0.12,
30
+ "learning_rate": 0.001918415631564992,
31
+ "loss": 0.0534,
32
  "step": 2000
33
  },
34
  {
35
+ "epoch": 0.15,
36
+ "learning_rate": 0.0018980195394562402,
37
+ "loss": 0.0528,
38
  "step": 2500
39
  },
40
  {
41
+ "epoch": 0.18,
42
+ "learning_rate": 0.0018776234473474882,
43
+ "loss": 0.052,
44
  "step": 3000
45
  },
46
  {
47
+ "epoch": 0.21,
48
+ "learning_rate": 0.0018572273552387361,
49
+ "loss": 0.0518,
 
 
 
 
 
 
 
 
50
  "step": 3500
51
  },
52
  {
53
+ "epoch": 0.24,
54
+ "learning_rate": 0.0018368312631299843,
55
+ "loss": 0.0509,
56
  "step": 4000
57
  },
58
  {
59
+ "epoch": 0.28,
60
+ "learning_rate": 0.0018164351710212323,
61
+ "loss": 0.0507,
62
  "step": 4500
63
  },
64
  {
65
+ "epoch": 0.31,
66
+ "learning_rate": 0.0017960390789124802,
67
+ "loss": 0.049,
68
  "step": 5000
69
  },
70
  {
71
+ "epoch": 0.34,
72
+ "learning_rate": 0.0017756429868037286,
73
+ "loss": 0.0487,
74
  "step": 5500
75
  },
76
  {
77
+ "epoch": 0.37,
78
+ "learning_rate": 0.0017552468946949765,
79
+ "loss": 0.0484,
80
  "step": 6000
81
  },
82
  {
83
+ "epoch": 0.4,
84
+ "learning_rate": 0.0017348508025862245,
85
+ "loss": 0.0498,
 
 
 
 
 
 
 
 
86
  "step": 6500
87
  },
88
  {
89
+ "epoch": 0.43,
90
+ "learning_rate": 0.0017144547104774727,
91
+ "loss": 0.0507,
92
  "step": 7000
93
  },
94
  {
95
+ "epoch": 0.46,
96
+ "learning_rate": 0.0016940586183687206,
97
+ "loss": 0.0467,
98
  "step": 7500
99
  },
100
  {
101
+ "epoch": 0.49,
102
+ "learning_rate": 0.0016736625262599688,
103
+ "loss": 0.0469,
104
  "step": 8000
105
  },
106
  {
107
+ "epoch": 0.52,
108
+ "learning_rate": 0.0016532664341512167,
109
+ "loss": 0.0473,
110
  "step": 8500
111
  },
112
  {
113
+ "epoch": 0.55,
114
+ "learning_rate": 0.0016328703420424647,
115
+ "loss": 0.047,
116
  "step": 9000
117
  },
118
  {
119
+ "epoch": 0.58,
120
+ "learning_rate": 0.0016124742499337129,
121
+ "loss": 0.0465,
 
 
 
 
 
 
 
 
122
  "step": 9500
123
  },
124
  {
125
+ "epoch": 0.61,
126
+ "learning_rate": 0.0015920781578249608,
127
+ "loss": 0.0461,
128
  "step": 10000
129
  },
130
  {
131
+ "epoch": 0.64,
132
+ "learning_rate": 0.0015716820657162088,
133
+ "loss": 0.045,
134
  "step": 10500
135
  },
136
  {
137
+ "epoch": 0.67,
138
+ "learning_rate": 0.001551285973607457,
139
+ "loss": 0.0468,
140
  "step": 11000
141
  },
142
  {
143
+ "epoch": 0.7,
144
+ "learning_rate": 0.0015308898814987049,
145
+ "loss": 0.0445,
146
  "step": 11500
147
  },
148
  {
149
+ "epoch": 0.73,
150
+ "learning_rate": 0.0015104937893899528,
151
+ "loss": 0.0443,
152
  "step": 12000
153
+ },
154
+ {
155
+ "epoch": 0.76,
156
+ "learning_rate": 0.001490097697281201,
157
+ "loss": 0.0439,
158
+ "step": 12500
159
+ },
160
+ {
161
+ "epoch": 0.8,
162
+ "learning_rate": 0.001469701605172449,
163
+ "loss": 0.0437,
164
+ "step": 13000
165
+ },
166
+ {
167
+ "epoch": 0.83,
168
+ "learning_rate": 0.001449305513063697,
169
+ "loss": 0.0432,
170
+ "step": 13500
171
+ },
172
+ {
173
+ "epoch": 0.86,
174
+ "learning_rate": 0.001428909420954945,
175
+ "loss": 0.0434,
176
+ "step": 14000
177
+ },
178
+ {
179
+ "epoch": 0.89,
180
+ "learning_rate": 0.001408513328846193,
181
+ "loss": 0.044,
182
+ "step": 14500
183
+ },
184
+ {
185
+ "epoch": 0.92,
186
+ "learning_rate": 0.001388117236737441,
187
+ "loss": 0.0433,
188
+ "step": 15000
189
+ },
190
+ {
191
+ "epoch": 0.95,
192
+ "learning_rate": 0.0013677211446286891,
193
+ "loss": 0.0427,
194
+ "step": 15500
195
+ },
196
+ {
197
+ "epoch": 0.98,
198
+ "learning_rate": 0.001347325052519937,
199
+ "loss": 0.043,
200
+ "step": 16000
201
+ },
202
+ {
203
+ "epoch": 1.0,
204
+ "eval_loss": 0.03870956972241402,
205
+ "eval_runtime": 667.1114,
206
+ "eval_samples_per_second": 146.987,
207
+ "eval_steps_per_second": 6.125,
208
+ "step": 16343
209
+ },
210
+ {
211
+ "epoch": 1.01,
212
+ "learning_rate": 0.0013269289604111853,
213
+ "loss": 0.0404,
214
+ "step": 16500
215
+ },
216
+ {
217
+ "epoch": 1.04,
218
+ "learning_rate": 0.0013065328683024334,
219
+ "loss": 0.037,
220
+ "step": 17000
221
+ },
222
+ {
223
+ "epoch": 1.07,
224
+ "learning_rate": 0.0012861367761936814,
225
+ "loss": 0.0371,
226
+ "step": 17500
227
+ },
228
+ {
229
+ "epoch": 1.1,
230
+ "learning_rate": 0.0012657406840849293,
231
+ "loss": 0.0372,
232
+ "step": 18000
233
+ },
234
+ {
235
+ "epoch": 1.13,
236
+ "learning_rate": 0.0012453445919761775,
237
+ "loss": 0.0386,
238
+ "step": 18500
239
+ },
240
+ {
241
+ "epoch": 1.16,
242
+ "learning_rate": 0.0012249484998674255,
243
+ "loss": 0.0377,
244
+ "step": 19000
245
+ },
246
+ {
247
+ "epoch": 1.19,
248
+ "learning_rate": 0.0012045524077586734,
249
+ "loss": 0.0385,
250
+ "step": 19500
251
+ },
252
+ {
253
+ "epoch": 1.22,
254
+ "learning_rate": 0.0011841563156499216,
255
+ "loss": 0.0377,
256
+ "step": 20000
257
+ },
258
+ {
259
+ "epoch": 1.25,
260
+ "learning_rate": 0.0011637602235411695,
261
+ "loss": 0.0378,
262
+ "step": 20500
263
+ },
264
+ {
265
+ "epoch": 1.28,
266
+ "learning_rate": 0.0011433641314324175,
267
+ "loss": 0.0364,
268
+ "step": 21000
269
+ },
270
+ {
271
+ "epoch": 1.32,
272
+ "learning_rate": 0.0011229680393236656,
273
+ "loss": 0.0372,
274
+ "step": 21500
275
+ },
276
+ {
277
+ "epoch": 1.35,
278
+ "learning_rate": 0.0011025719472149136,
279
+ "loss": 0.0372,
280
+ "step": 22000
281
+ },
282
+ {
283
+ "epoch": 1.38,
284
+ "learning_rate": 0.0010821758551061618,
285
+ "loss": 0.0387,
286
+ "step": 22500
287
+ },
288
+ {
289
+ "epoch": 1.41,
290
+ "learning_rate": 0.0010617797629974097,
291
+ "loss": 0.0387,
292
+ "step": 23000
293
+ },
294
+ {
295
+ "epoch": 1.44,
296
+ "learning_rate": 0.0010413836708886577,
297
+ "loss": 0.0387,
298
+ "step": 23500
299
+ },
300
+ {
301
+ "epoch": 1.47,
302
+ "learning_rate": 0.0010209875787799058,
303
+ "loss": 0.0363,
304
+ "step": 24000
305
+ },
306
+ {
307
+ "epoch": 1.5,
308
+ "learning_rate": 0.0010005914866711538,
309
+ "loss": 0.0385,
310
+ "step": 24500
311
+ },
312
+ {
313
+ "epoch": 1.53,
314
+ "learning_rate": 0.0009801953945624017,
315
+ "loss": 0.0387,
316
+ "step": 25000
317
+ },
318
+ {
319
+ "epoch": 1.56,
320
+ "learning_rate": 0.0009597993024536499,
321
+ "loss": 0.0381,
322
+ "step": 25500
323
+ },
324
+ {
325
+ "epoch": 1.59,
326
+ "learning_rate": 0.000939403210344898,
327
+ "loss": 0.039,
328
+ "step": 26000
329
+ },
330
+ {
331
+ "epoch": 1.62,
332
+ "learning_rate": 0.000919007118236146,
333
+ "loss": 0.0369,
334
+ "step": 26500
335
+ },
336
+ {
337
+ "epoch": 1.65,
338
+ "learning_rate": 0.000898611026127394,
339
+ "loss": 0.0372,
340
+ "step": 27000
341
+ },
342
+ {
343
+ "epoch": 1.68,
344
+ "learning_rate": 0.000878214934018642,
345
+ "loss": 0.0379,
346
+ "step": 27500
347
+ },
348
+ {
349
+ "epoch": 1.71,
350
+ "learning_rate": 0.0008578188419098901,
351
+ "loss": 0.0367,
352
+ "step": 28000
353
+ },
354
+ {
355
+ "epoch": 1.74,
356
+ "learning_rate": 0.0008374227498011381,
357
+ "loss": 0.0361,
358
+ "step": 28500
359
+ },
360
+ {
361
+ "epoch": 1.77,
362
+ "learning_rate": 0.0008170266576923861,
363
+ "loss": 0.0364,
364
+ "step": 29000
365
+ },
366
+ {
367
+ "epoch": 1.81,
368
+ "learning_rate": 0.0007966305655836342,
369
+ "loss": 0.0361,
370
+ "step": 29500
371
+ },
372
+ {
373
+ "epoch": 1.84,
374
+ "learning_rate": 0.0007762344734748822,
375
+ "loss": 0.0359,
376
+ "step": 30000
377
+ },
378
+ {
379
+ "epoch": 1.87,
380
+ "learning_rate": 0.0007558383813661303,
381
+ "loss": 0.0357,
382
+ "step": 30500
383
+ },
384
+ {
385
+ "epoch": 1.9,
386
+ "learning_rate": 0.0007354422892573784,
387
+ "loss": 0.0358,
388
+ "step": 31000
389
+ },
390
+ {
391
+ "epoch": 1.93,
392
+ "learning_rate": 0.0007150461971486263,
393
+ "loss": 0.0357,
394
+ "step": 31500
395
+ },
396
+ {
397
+ "epoch": 1.96,
398
+ "learning_rate": 0.0006946501050398744,
399
+ "loss": 0.0352,
400
+ "step": 32000
401
+ },
402
+ {
403
+ "epoch": 1.99,
404
+ "learning_rate": 0.0006742540129311224,
405
+ "loss": 0.0351,
406
+ "step": 32500
407
+ },
408
+ {
409
+ "epoch": 2.0,
410
+ "eval_loss": 0.03284740820527077,
411
+ "eval_runtime": 666.0258,
412
+ "eval_samples_per_second": 147.227,
413
+ "eval_steps_per_second": 6.135,
414
+ "step": 32686
415
+ },
416
+ {
417
+ "epoch": 2.02,
418
+ "learning_rate": 0.0006538579208223705,
419
+ "loss": 0.0325,
420
+ "step": 33000
421
+ },
422
+ {
423
+ "epoch": 2.05,
424
+ "learning_rate": 0.0006334618287136184,
425
+ "loss": 0.0314,
426
+ "step": 33500
427
+ },
428
+ {
429
+ "epoch": 2.08,
430
+ "learning_rate": 0.0006130657366048665,
431
+ "loss": 0.0308,
432
+ "step": 34000
433
+ },
434
+ {
435
+ "epoch": 2.11,
436
+ "learning_rate": 0.0005926696444961146,
437
+ "loss": 0.0315,
438
+ "step": 34500
439
+ },
440
+ {
441
+ "epoch": 2.14,
442
+ "learning_rate": 0.0005722735523873625,
443
+ "loss": 0.0308,
444
+ "step": 35000
445
+ },
446
+ {
447
+ "epoch": 2.17,
448
+ "learning_rate": 0.0005518774602786107,
449
+ "loss": 0.0306,
450
+ "step": 35500
451
+ },
452
+ {
453
+ "epoch": 2.2,
454
+ "learning_rate": 0.0005314813681698587,
455
+ "loss": 0.0309,
456
+ "step": 36000
457
+ },
458
+ {
459
+ "epoch": 2.23,
460
+ "learning_rate": 0.0005110852760611067,
461
+ "loss": 0.0304,
462
+ "step": 36500
463
+ },
464
+ {
465
+ "epoch": 2.26,
466
+ "learning_rate": 0.0004906891839523547,
467
+ "loss": 0.0306,
468
+ "step": 37000
469
+ },
470
+ {
471
+ "epoch": 2.29,
472
+ "learning_rate": 0.00047029309184360276,
473
+ "loss": 0.0307,
474
+ "step": 37500
475
+ },
476
+ {
477
+ "epoch": 2.33,
478
+ "learning_rate": 0.0004498969997348508,
479
+ "loss": 0.0296,
480
+ "step": 38000
481
+ },
482
+ {
483
+ "epoch": 2.36,
484
+ "learning_rate": 0.0004295009076260988,
485
+ "loss": 0.0302,
486
+ "step": 38500
487
+ },
488
+ {
489
+ "epoch": 2.39,
490
+ "learning_rate": 0.0004091048155173469,
491
+ "loss": 0.0299,
492
+ "step": 39000
493
+ },
494
+ {
495
+ "epoch": 2.42,
496
+ "learning_rate": 0.00038870872340859494,
497
+ "loss": 0.0297,
498
+ "step": 39500
499
+ },
500
+ {
501
+ "epoch": 2.45,
502
+ "learning_rate": 0.00036831263129984295,
503
+ "loss": 0.0292,
504
+ "step": 40000
505
+ },
506
+ {
507
+ "epoch": 2.48,
508
+ "learning_rate": 0.000347916539191091,
509
+ "loss": 0.0302,
510
+ "step": 40500
511
+ },
512
+ {
513
+ "epoch": 2.51,
514
+ "learning_rate": 0.000327520447082339,
515
+ "loss": 0.0295,
516
+ "step": 41000
517
+ },
518
+ {
519
+ "epoch": 2.54,
520
+ "learning_rate": 0.0003071243549735871,
521
+ "loss": 0.0295,
522
+ "step": 41500
523
+ },
524
+ {
525
+ "epoch": 2.57,
526
+ "learning_rate": 0.00028672826286483513,
527
+ "loss": 0.0289,
528
+ "step": 42000
529
+ },
530
+ {
531
+ "epoch": 2.6,
532
+ "learning_rate": 0.00026633217075608314,
533
+ "loss": 0.0297,
534
+ "step": 42500
535
+ },
536
+ {
537
+ "epoch": 2.63,
538
+ "learning_rate": 0.00024593607864733115,
539
+ "loss": 0.0291,
540
+ "step": 43000
541
+ },
542
+ {
543
+ "epoch": 2.66,
544
+ "learning_rate": 0.0002255399865385792,
545
+ "loss": 0.0288,
546
+ "step": 43500
547
+ },
548
+ {
549
+ "epoch": 2.69,
550
+ "learning_rate": 0.00020514389442982727,
551
+ "loss": 0.0278,
552
+ "step": 44000
553
+ },
554
+ {
555
+ "epoch": 2.72,
556
+ "learning_rate": 0.0001847478023210753,
557
+ "loss": 0.0284,
558
+ "step": 44500
559
+ },
560
+ {
561
+ "epoch": 2.75,
562
+ "learning_rate": 0.0001643517102123233,
563
+ "loss": 0.0285,
564
+ "step": 45000
565
+ },
566
+ {
567
+ "epoch": 2.78,
568
+ "learning_rate": 0.00014395561810357136,
569
+ "loss": 0.0278,
570
+ "step": 45500
571
+ },
572
+ {
573
+ "epoch": 2.81,
574
+ "learning_rate": 0.0001235595259948194,
575
+ "loss": 0.0286,
576
+ "step": 46000
577
+ },
578
+ {
579
+ "epoch": 2.85,
580
+ "learning_rate": 0.00010316343388606743,
581
+ "loss": 0.028,
582
+ "step": 46500
583
+ },
584
+ {
585
+ "epoch": 2.88,
586
+ "learning_rate": 8.276734177731548e-05,
587
+ "loss": 0.0279,
588
+ "step": 47000
589
+ },
590
+ {
591
+ "epoch": 2.91,
592
+ "learning_rate": 6.23712496685635e-05,
593
+ "loss": 0.0286,
594
+ "step": 47500
595
+ },
596
+ {
597
+ "epoch": 2.94,
598
+ "learning_rate": 4.197515755981154e-05,
599
+ "loss": 0.0278,
600
+ "step": 48000
601
+ },
602
+ {
603
+ "epoch": 2.97,
604
+ "learning_rate": 2.157906545105958e-05,
605
+ "loss": 0.0279,
606
+ "step": 48500
607
+ },
608
+ {
609
+ "epoch": 3.0,
610
+ "learning_rate": 1.1829733423076138e-06,
611
+ "loss": 0.0277,
612
+ "step": 49000
613
  }
614
  ],
615
+ "max_steps": 49029,
616
+ "num_train_epochs": 3,
617
+ "total_flos": 1.591600637458514e+17,
618
  "trial_name": null,
619
  "trial_params": null
620
  }