phosseini commited on
Commit
fb83b10
1 Parent(s): 3745abf

Upload trainer_state.json

Browse files
Files changed (1) hide show
  1. trainer_state.json +434 -0
trainer_state.json ADDED
@@ -0,0 +1,434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.8793088793754578,
3
+ "best_model_checkpoint": "models/checkpoints/checkpoint-5500",
4
+ "epoch": 0.4509582863585118,
5
+ "global_step": 5500,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.01,
12
+ "learning_rate": 1.998360118071499e-05,
13
+ "loss": 1.518,
14
+ "step": 100
15
+ },
16
+ {
17
+ "epoch": 0.02,
18
+ "learning_rate": 1.9967202361429978e-05,
19
+ "loss": 1.2325,
20
+ "step": 200
21
+ },
22
+ {
23
+ "epoch": 0.02,
24
+ "learning_rate": 1.9950803542144966e-05,
25
+ "loss": 1.1664,
26
+ "step": 300
27
+ },
28
+ {
29
+ "epoch": 0.03,
30
+ "learning_rate": 1.9934404722859955e-05,
31
+ "loss": 1.1746,
32
+ "step": 400
33
+ },
34
+ {
35
+ "epoch": 0.04,
36
+ "learning_rate": 1.9918005903574943e-05,
37
+ "loss": 1.1251,
38
+ "step": 500
39
+ },
40
+ {
41
+ "epoch": 0.04,
42
+ "eval_loss": 0.964438259601593,
43
+ "eval_runtime": 267.7236,
44
+ "eval_samples_per_second": 255.906,
45
+ "eval_steps_per_second": 15.994,
46
+ "step": 500
47
+ },
48
+ {
49
+ "epoch": 0.05,
50
+ "learning_rate": 1.9901607084289934e-05,
51
+ "loss": 1.1041,
52
+ "step": 600
53
+ },
54
+ {
55
+ "epoch": 0.06,
56
+ "learning_rate": 1.988520826500492e-05,
57
+ "loss": 1.0817,
58
+ "step": 700
59
+ },
60
+ {
61
+ "epoch": 0.07,
62
+ "learning_rate": 1.986880944571991e-05,
63
+ "loss": 1.0594,
64
+ "step": 800
65
+ },
66
+ {
67
+ "epoch": 0.07,
68
+ "learning_rate": 1.98524106264349e-05,
69
+ "loss": 1.0642,
70
+ "step": 900
71
+ },
72
+ {
73
+ "epoch": 0.08,
74
+ "learning_rate": 1.9836011807149887e-05,
75
+ "loss": 1.0453,
76
+ "step": 1000
77
+ },
78
+ {
79
+ "epoch": 0.08,
80
+ "eval_loss": 0.9285051226615906,
81
+ "eval_runtime": 253.7626,
82
+ "eval_samples_per_second": 269.985,
83
+ "eval_steps_per_second": 16.874,
84
+ "step": 1000
85
+ },
86
+ {
87
+ "epoch": 0.09,
88
+ "learning_rate": 1.9819612987864875e-05,
89
+ "loss": 1.0486,
90
+ "step": 1100
91
+ },
92
+ {
93
+ "epoch": 0.1,
94
+ "learning_rate": 1.9803214168579864e-05,
95
+ "loss": 1.0549,
96
+ "step": 1200
97
+ },
98
+ {
99
+ "epoch": 0.11,
100
+ "learning_rate": 1.9786815349294852e-05,
101
+ "loss": 1.0639,
102
+ "step": 1300
103
+ },
104
+ {
105
+ "epoch": 0.11,
106
+ "learning_rate": 1.977041653000984e-05,
107
+ "loss": 1.0354,
108
+ "step": 1400
109
+ },
110
+ {
111
+ "epoch": 0.12,
112
+ "learning_rate": 1.9754017710724828e-05,
113
+ "loss": 1.0393,
114
+ "step": 1500
115
+ },
116
+ {
117
+ "epoch": 0.12,
118
+ "eval_loss": 0.9202534556388855,
119
+ "eval_runtime": 264.8239,
120
+ "eval_samples_per_second": 258.708,
121
+ "eval_steps_per_second": 16.169,
122
+ "step": 1500
123
+ },
124
+ {
125
+ "epoch": 0.13,
126
+ "learning_rate": 1.9737618891439817e-05,
127
+ "loss": 1.0381,
128
+ "step": 1600
129
+ },
130
+ {
131
+ "epoch": 0.14,
132
+ "learning_rate": 1.9721220072154808e-05,
133
+ "loss": 1.0672,
134
+ "step": 1700
135
+ },
136
+ {
137
+ "epoch": 0.15,
138
+ "learning_rate": 1.9704821252869793e-05,
139
+ "loss": 1.0335,
140
+ "step": 1800
141
+ },
142
+ {
143
+ "epoch": 0.16,
144
+ "learning_rate": 1.9688422433584785e-05,
145
+ "loss": 1.0239,
146
+ "step": 1900
147
+ },
148
+ {
149
+ "epoch": 0.16,
150
+ "learning_rate": 1.9672023614299773e-05,
151
+ "loss": 1.0012,
152
+ "step": 2000
153
+ },
154
+ {
155
+ "epoch": 0.16,
156
+ "eval_loss": 0.9098692536354065,
157
+ "eval_runtime": 264.5198,
158
+ "eval_samples_per_second": 259.005,
159
+ "eval_steps_per_second": 16.188,
160
+ "step": 2000
161
+ },
162
+ {
163
+ "epoch": 0.17,
164
+ "learning_rate": 1.965562479501476e-05,
165
+ "loss": 1.0146,
166
+ "step": 2100
167
+ },
168
+ {
169
+ "epoch": 0.18,
170
+ "learning_rate": 1.963922597572975e-05,
171
+ "loss": 1.0502,
172
+ "step": 2200
173
+ },
174
+ {
175
+ "epoch": 0.19,
176
+ "learning_rate": 1.9622827156444737e-05,
177
+ "loss": 1.0062,
178
+ "step": 2300
179
+ },
180
+ {
181
+ "epoch": 0.2,
182
+ "learning_rate": 1.9606428337159726e-05,
183
+ "loss": 1.0144,
184
+ "step": 2400
185
+ },
186
+ {
187
+ "epoch": 0.2,
188
+ "learning_rate": 1.9590029517874714e-05,
189
+ "loss": 0.9826,
190
+ "step": 2500
191
+ },
192
+ {
193
+ "epoch": 0.2,
194
+ "eval_loss": 0.9166184067726135,
195
+ "eval_runtime": 256.5893,
196
+ "eval_samples_per_second": 267.01,
197
+ "eval_steps_per_second": 16.688,
198
+ "step": 2500
199
+ },
200
+ {
201
+ "epoch": 0.21,
202
+ "learning_rate": 1.9573630698589702e-05,
203
+ "loss": 1.0269,
204
+ "step": 2600
205
+ },
206
+ {
207
+ "epoch": 0.22,
208
+ "learning_rate": 1.9557231879304694e-05,
209
+ "loss": 0.9898,
210
+ "step": 2700
211
+ },
212
+ {
213
+ "epoch": 0.23,
214
+ "learning_rate": 1.954083306001968e-05,
215
+ "loss": 1.0248,
216
+ "step": 2800
217
+ },
218
+ {
219
+ "epoch": 0.24,
220
+ "learning_rate": 1.952443424073467e-05,
221
+ "loss": 0.9996,
222
+ "step": 2900
223
+ },
224
+ {
225
+ "epoch": 0.25,
226
+ "learning_rate": 1.950803542144966e-05,
227
+ "loss": 0.9595,
228
+ "step": 3000
229
+ },
230
+ {
231
+ "epoch": 0.25,
232
+ "eval_loss": 0.9191610217094421,
233
+ "eval_runtime": 272.1388,
234
+ "eval_samples_per_second": 251.754,
235
+ "eval_steps_per_second": 15.735,
236
+ "step": 3000
237
+ },
238
+ {
239
+ "epoch": 0.25,
240
+ "learning_rate": 1.9491636602164647e-05,
241
+ "loss": 0.9873,
242
+ "step": 3100
243
+ },
244
+ {
245
+ "epoch": 0.26,
246
+ "learning_rate": 1.9475237782879635e-05,
247
+ "loss": 0.9667,
248
+ "step": 3200
249
+ },
250
+ {
251
+ "epoch": 0.27,
252
+ "learning_rate": 1.9458838963594623e-05,
253
+ "loss": 0.9666,
254
+ "step": 3300
255
+ },
256
+ {
257
+ "epoch": 0.28,
258
+ "learning_rate": 1.944244014430961e-05,
259
+ "loss": 1.0102,
260
+ "step": 3400
261
+ },
262
+ {
263
+ "epoch": 0.29,
264
+ "learning_rate": 1.94260413250246e-05,
265
+ "loss": 0.9511,
266
+ "step": 3500
267
+ },
268
+ {
269
+ "epoch": 0.29,
270
+ "eval_loss": 0.9013209342956543,
271
+ "eval_runtime": 267.0529,
272
+ "eval_samples_per_second": 256.548,
273
+ "eval_steps_per_second": 16.034,
274
+ "step": 3500
275
+ },
276
+ {
277
+ "epoch": 0.3,
278
+ "learning_rate": 1.9409642505739588e-05,
279
+ "loss": 0.9927,
280
+ "step": 3600
281
+ },
282
+ {
283
+ "epoch": 0.3,
284
+ "learning_rate": 1.9393243686454576e-05,
285
+ "loss": 0.9507,
286
+ "step": 3700
287
+ },
288
+ {
289
+ "epoch": 0.31,
290
+ "learning_rate": 1.9376844867169567e-05,
291
+ "loss": 1.0049,
292
+ "step": 3800
293
+ },
294
+ {
295
+ "epoch": 0.32,
296
+ "learning_rate": 1.9360446047884552e-05,
297
+ "loss": 0.9385,
298
+ "step": 3900
299
+ },
300
+ {
301
+ "epoch": 0.33,
302
+ "learning_rate": 1.9344047228599544e-05,
303
+ "loss": 0.9644,
304
+ "step": 4000
305
+ },
306
+ {
307
+ "epoch": 0.33,
308
+ "eval_loss": 0.8957135081291199,
309
+ "eval_runtime": 260.7988,
310
+ "eval_samples_per_second": 262.701,
311
+ "eval_steps_per_second": 16.419,
312
+ "step": 4000
313
+ },
314
+ {
315
+ "epoch": 0.34,
316
+ "learning_rate": 1.932764840931453e-05,
317
+ "loss": 0.9293,
318
+ "step": 4100
319
+ },
320
+ {
321
+ "epoch": 0.34,
322
+ "learning_rate": 1.931124959002952e-05,
323
+ "loss": 0.9298,
324
+ "step": 4200
325
+ },
326
+ {
327
+ "epoch": 0.35,
328
+ "learning_rate": 1.929485077074451e-05,
329
+ "loss": 0.9218,
330
+ "step": 4300
331
+ },
332
+ {
333
+ "epoch": 0.36,
334
+ "learning_rate": 1.9278451951459497e-05,
335
+ "loss": 0.9523,
336
+ "step": 4400
337
+ },
338
+ {
339
+ "epoch": 0.37,
340
+ "learning_rate": 1.9262053132174485e-05,
341
+ "loss": 0.9079,
342
+ "step": 4500
343
+ },
344
+ {
345
+ "epoch": 0.37,
346
+ "eval_loss": 0.9050074815750122,
347
+ "eval_runtime": 264.4439,
348
+ "eval_samples_per_second": 259.079,
349
+ "eval_steps_per_second": 16.192,
350
+ "step": 4500
351
+ },
352
+ {
353
+ "epoch": 0.38,
354
+ "learning_rate": 1.9245654312889473e-05,
355
+ "loss": 0.9437,
356
+ "step": 4600
357
+ },
358
+ {
359
+ "epoch": 0.39,
360
+ "learning_rate": 1.922925549360446e-05,
361
+ "loss": 0.9395,
362
+ "step": 4700
363
+ },
364
+ {
365
+ "epoch": 0.39,
366
+ "learning_rate": 1.921285667431945e-05,
367
+ "loss": 0.969,
368
+ "step": 4800
369
+ },
370
+ {
371
+ "epoch": 0.4,
372
+ "learning_rate": 1.9196457855034438e-05,
373
+ "loss": 0.926,
374
+ "step": 4900
375
+ },
376
+ {
377
+ "epoch": 0.41,
378
+ "learning_rate": 1.9180059035749426e-05,
379
+ "loss": 0.9178,
380
+ "step": 5000
381
+ },
382
+ {
383
+ "epoch": 0.41,
384
+ "eval_loss": 0.8956226706504822,
385
+ "eval_runtime": 259.9134,
386
+ "eval_samples_per_second": 263.595,
387
+ "eval_steps_per_second": 16.475,
388
+ "step": 5000
389
+ },
390
+ {
391
+ "epoch": 0.42,
392
+ "learning_rate": 1.9163660216464418e-05,
393
+ "loss": 0.9697,
394
+ "step": 5100
395
+ },
396
+ {
397
+ "epoch": 0.43,
398
+ "learning_rate": 1.9147261397179403e-05,
399
+ "loss": 0.9448,
400
+ "step": 5200
401
+ },
402
+ {
403
+ "epoch": 0.43,
404
+ "learning_rate": 1.9130862577894394e-05,
405
+ "loss": 0.9118,
406
+ "step": 5300
407
+ },
408
+ {
409
+ "epoch": 0.44,
410
+ "learning_rate": 1.911446375860938e-05,
411
+ "loss": 0.9321,
412
+ "step": 5400
413
+ },
414
+ {
415
+ "epoch": 0.45,
416
+ "learning_rate": 1.909806493932437e-05,
417
+ "loss": 0.9289,
418
+ "step": 5500
419
+ },
420
+ {
421
+ "epoch": 0.45,
422
+ "eval_loss": 0.8793088793754578,
423
+ "eval_runtime": 261.4397,
424
+ "eval_samples_per_second": 262.057,
425
+ "eval_steps_per_second": 16.379,
426
+ "step": 5500
427
+ }
428
+ ],
429
+ "max_steps": 121960,
430
+ "num_train_epochs": 10,
431
+ "total_flos": 1.922426903232e+16,
432
+ "trial_name": null,
433
+ "trial_params": null
434
+ }