yyx123 committed on
Commit
f9d96e7
1 Parent(s): f363f0a

Model save

Browse files
Files changed (5) hide show
  1. README.md +11 -25
  2. all_results.json +9 -9
  3. eval_results.json +5 -5
  4. train_results.json +5 -5
  5. trainer_state.json +339 -1383
README.md CHANGED
@@ -2,13 +2,9 @@
2
  license: other
3
  library_name: peft
4
  tags:
5
- - alignment-handbook
6
- - generated_from_trainer
7
  - trl
8
  - sft
9
  - generated_from_trainer
10
- datasets:
11
- - ruozhiba
12
  base_model: 01-ai/Yi-6B
13
  model-index:
14
  - name: Yi-6B-ruozhiba-5e-4-50
@@ -20,9 +16,9 @@ should probably proofread and complete it, then remove this comment. -->
20
 
21
  # Yi-6B-ruozhiba-5e-4-50
22
 
23
- This model is a fine-tuned version of [01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B) on the ruozhiba dataset.
24
  It achieves the following results on the evaluation set:
25
- - Loss: 4.1158
26
 
27
  ## Model description
28
 
@@ -54,25 +50,15 @@ The following hyperparameters were used during training:
54
 
55
  | Training Loss | Epoch | Step | Validation Loss |
56
  |:-------------:|:-----:|:----:|:---------------:|
57
- | 1.8768 | 1.0 | 55 | 1.8434 |
58
- | 1.5109 | 2.0 | 110 | 1.8907 |
59
- | 1.1215 | 3.0 | 165 | 2.1649 |
60
- | 0.4876 | 4.0 | 220 | 2.6900 |
61
- | 0.2916 | 5.0 | 275 | 2.8212 |
62
- | 0.1983 | 6.0 | 330 | 2.8984 |
63
- | 0.1199 | 7.0 | 385 | 2.9498 |
64
- | 0.0743 | 8.0 | 440 | 3.2507 |
65
- | 0.062 | 9.0 | 495 | 3.1474 |
66
- | 0.0404 | 10.0 | 550 | 3.3666 |
67
- | 0.038 | 11.0 | 605 | 3.3349 |
68
- | 0.0374 | 12.0 | 660 | 3.4456 |
69
- | 0.0384 | 13.0 | 715 | 3.4822 |
70
- | 0.0408 | 14.0 | 770 | 3.4718 |
71
- | 0.0347 | 15.0 | 825 | 3.5028 |
72
- | 0.0377 | 16.0 | 880 | 3.5218 |
73
- | 0.0395 | 17.0 | 935 | 3.5320 |
74
- | 0.0408 | 18.0 | 990 | 3.5371 |
75
- | 0.0468 | 19.0 | 1045 | 3.5391 |
76
 
77
 
78
  ### Framework versions
 
2
  license: other
3
  library_name: peft
4
  tags:
 
 
5
  - trl
6
  - sft
7
  - generated_from_trainer
 
 
8
  base_model: 01-ai/Yi-6B
9
  model-index:
10
  - name: Yi-6B-ruozhiba-5e-4-50
 
16
 
17
  # Yi-6B-ruozhiba-5e-4-50
18
 
19
+ This model is a fine-tuned version of [01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B) on the None dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 3.4886
22
 
23
  ## Model description
24
 
 
50
 
51
  | Training Loss | Epoch | Step | Validation Loss |
52
  |:-------------:|:-----:|:----:|:---------------:|
53
+ | 1.5916 | 2.0 | 110 | 2.0382 |
54
+ | 0.9956 | 3.0 | 165 | 2.4359 |
55
+ | 0.5198 | 4.0 | 220 | 2.9536 |
56
+ | 0.2296 | 5.0 | 275 | 3.0199 |
57
+ | 0.1444 | 6.0 | 330 | 3.2190 |
58
+ | 0.1129 | 7.0 | 385 | 3.3571 |
59
+ | 0.1048 | 8.0 | 440 | 3.4553 |
60
+ | 0.1008 | 9.0 | 495 | 3.4835 |
61
+ | 0.0938 | 10.0 | 550 | 3.4886 |
 
 
 
 
 
 
 
 
 
 
62
 
63
 
64
  ### Framework versions
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 20.0,
3
- "eval_loss": 4.1158246994018555,
4
- "eval_runtime": 6.5886,
5
  "eval_samples": 23,
6
- "eval_samples_per_second": 3.491,
7
- "eval_steps_per_second": 0.911,
8
- "train_loss": 0.0,
9
- "train_runtime": 11.9759,
10
  "train_samples": 217,
11
- "train_samples_per_second": 181.197,
12
- "train_steps_per_second": 45.925
13
  }
 
1
  {
2
+ "epoch": 10.0,
3
+ "eval_loss": 3.4885787963867188,
4
+ "eval_runtime": 4.9367,
5
  "eval_samples": 23,
6
+ "eval_samples_per_second": 4.659,
7
+ "eval_steps_per_second": 1.215,
8
+ "train_loss": 0.3880799013376236,
9
+ "train_runtime": 22779.6065,
10
  "train_samples": 217,
11
+ "train_samples_per_second": 0.095,
12
+ "train_steps_per_second": 0.024
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 20.0,
3
- "eval_loss": 4.1158246994018555,
4
- "eval_runtime": 6.5886,
5
  "eval_samples": 23,
6
- "eval_samples_per_second": 3.491,
7
- "eval_steps_per_second": 0.911
8
  }
 
1
  {
2
+ "epoch": 10.0,
3
+ "eval_loss": 3.4885787963867188,
4
+ "eval_runtime": 4.9367,
5
  "eval_samples": 23,
6
+ "eval_samples_per_second": 4.659,
7
+ "eval_steps_per_second": 1.215
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 20.0,
3
- "train_loss": 0.0,
4
- "train_runtime": 11.9759,
5
  "train_samples": 217,
6
- "train_samples_per_second": 181.197,
7
- "train_steps_per_second": 45.925
8
  }
 
1
  {
2
+ "epoch": 10.0,
3
+ "train_loss": 0.3880799013376236,
4
+ "train_runtime": 22779.6065,
5
  "train_samples": 217,
6
+ "train_samples_per_second": 0.095,
7
+ "train_steps_per_second": 0.024
8
  }
trainer_state.json CHANGED
@@ -1,2019 +1,975 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 20.0,
5
  "eval_steps": 500,
6
- "global_step": 1100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.02,
13
- "learning_rate": 4.5454545454545455e-06,
14
- "loss": 2.3833,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 0.07,
19
- "learning_rate": 1.8181818181818182e-05,
20
- "loss": 2.4734,
21
  "step": 4
22
  },
23
  {
24
  "epoch": 0.15,
25
- "learning_rate": 3.6363636363636364e-05,
26
- "loss": 2.2655,
27
  "step": 8
28
  },
29
  {
30
  "epoch": 0.22,
31
- "learning_rate": 5.4545454545454546e-05,
32
- "loss": 2.2091,
33
  "step": 12
34
  },
35
  {
36
  "epoch": 0.29,
37
- "learning_rate": 7.272727272727273e-05,
38
- "loss": 2.1358,
39
  "step": 16
40
  },
41
  {
42
  "epoch": 0.36,
43
- "learning_rate": 9.090909090909092e-05,
44
- "loss": 2.0997,
45
  "step": 20
46
  },
47
  {
48
  "epoch": 0.44,
49
- "learning_rate": 0.00010909090909090909,
50
- "loss": 1.931,
51
  "step": 24
52
  },
53
  {
54
  "epoch": 0.51,
55
- "learning_rate": 0.00012727272727272725,
56
- "loss": 2.0453,
57
  "step": 28
58
  },
59
  {
60
  "epoch": 0.58,
61
- "learning_rate": 0.00014545454545454546,
62
- "loss": 1.9392,
63
  "step": 32
64
  },
65
  {
66
  "epoch": 0.65,
67
- "learning_rate": 0.00016363636363636363,
68
- "loss": 1.8909,
69
  "step": 36
70
  },
71
  {
72
  "epoch": 0.73,
73
- "learning_rate": 0.00018181818181818183,
74
- "loss": 1.7724,
75
  "step": 40
76
  },
77
  {
78
  "epoch": 0.8,
79
- "learning_rate": 0.0002,
80
- "loss": 1.854,
81
  "step": 44
82
  },
83
  {
84
  "epoch": 0.87,
85
- "learning_rate": 0.00021818181818181818,
86
- "loss": 1.9588,
87
  "step": 48
88
  },
89
  {
90
  "epoch": 0.95,
91
- "learning_rate": 0.00023636363636363636,
92
- "loss": 1.8768,
93
  "step": 52
94
  },
95
- {
96
- "epoch": 1.0,
97
- "gpt4_scores": 0.67,
98
- "step": 55
99
- },
100
- {
101
- "epoch": 1.0,
102
- "std": 0.12413702106946178,
103
- "step": 55
104
- },
105
- {
106
- "epoch": 1.0,
107
- "eval_loss": 1.84342360496521,
108
- "eval_runtime": 4.9467,
109
- "eval_samples_per_second": 4.65,
110
- "eval_steps_per_second": 1.213,
111
- "step": 55
112
- },
113
  {
114
  "epoch": 1.02,
115
- "learning_rate": 0.0002545454545454545,
116
- "loss": 1.7451,
117
  "step": 56
118
  },
119
  {
120
  "epoch": 1.09,
121
- "learning_rate": 0.00027272727272727274,
122
- "loss": 1.7394,
123
  "step": 60
124
  },
125
  {
126
  "epoch": 1.16,
127
- "learning_rate": 0.0002909090909090909,
128
- "loss": 1.6971,
129
  "step": 64
130
  },
131
  {
132
  "epoch": 1.24,
133
- "learning_rate": 0.0003090909090909091,
134
- "loss": 1.6419,
135
  "step": 68
136
  },
137
  {
138
  "epoch": 1.31,
139
- "learning_rate": 0.00032727272727272726,
140
- "loss": 1.65,
141
  "step": 72
142
  },
143
  {
144
  "epoch": 1.38,
145
- "learning_rate": 0.00034545454545454544,
146
- "loss": 1.594,
147
  "step": 76
148
  },
149
  {
150
  "epoch": 1.45,
151
- "learning_rate": 0.00036363636363636367,
152
- "loss": 1.5985,
153
  "step": 80
154
  },
155
  {
156
  "epoch": 1.53,
157
- "learning_rate": 0.00038181818181818184,
158
- "loss": 1.719,
159
  "step": 84
160
  },
161
  {
162
  "epoch": 1.6,
163
- "learning_rate": 0.0004,
164
- "loss": 1.655,
165
  "step": 88
166
  },
167
  {
168
  "epoch": 1.67,
169
- "learning_rate": 0.00041818181818181814,
170
- "loss": 1.6052,
171
  "step": 92
172
  },
173
  {
174
  "epoch": 1.75,
175
- "learning_rate": 0.00043636363636363637,
176
- "loss": 1.6288,
177
  "step": 96
178
  },
179
  {
180
  "epoch": 1.82,
181
- "learning_rate": 0.00045454545454545455,
182
- "loss": 1.6272,
183
  "step": 100
184
  },
185
  {
186
  "epoch": 1.89,
187
- "learning_rate": 0.0004727272727272727,
188
- "loss": 1.6124,
189
  "step": 104
190
  },
191
  {
192
  "epoch": 1.96,
193
- "learning_rate": 0.0004909090909090909,
194
- "loss": 1.5109,
195
  "step": 108
196
  },
197
  {
198
  "epoch": 2.0,
199
- "gpt4_scores": 0.675,
200
- "step": 110
201
- },
202
- {
203
- "epoch": 2.0,
204
- "std": 0.11688669727560959,
205
  "step": 110
206
  },
207
  {
208
  "epoch": 2.0,
209
- "eval_loss": 1.89065682888031,
210
- "eval_runtime": 4.9546,
211
- "eval_samples_per_second": 4.642,
212
- "eval_steps_per_second": 1.211,
213
  "step": 110
214
  },
215
  {
216
  "epoch": 2.04,
217
- "learning_rate": 0.0004999949650182266,
218
- "loss": 1.3323,
219
  "step": 112
220
  },
221
  {
222
  "epoch": 2.11,
223
- "learning_rate": 0.0004999546863808815,
224
- "loss": 1.0715,
225
  "step": 116
226
  },
227
  {
228
  "epoch": 2.18,
229
- "learning_rate": 0.0004998741355957963,
230
- "loss": 0.8812,
231
  "step": 120
232
  },
233
  {
234
  "epoch": 2.25,
235
- "learning_rate": 0.0004997533256411359,
236
- "loss": 0.9464,
237
  "step": 124
238
  },
239
  {
240
  "epoch": 2.33,
241
- "learning_rate": 0.0004995922759815339,
242
- "loss": 0.9049,
243
  "step": 128
244
  },
245
  {
246
  "epoch": 2.4,
247
- "learning_rate": 0.0004993910125649561,
248
- "loss": 0.9246,
249
  "step": 132
250
  },
251
  {
252
  "epoch": 2.47,
253
- "learning_rate": 0.0004991495678185201,
254
- "loss": 1.0853,
255
  "step": 136
256
  },
257
  {
258
  "epoch": 2.55,
259
- "learning_rate": 0.0004988679806432712,
260
- "loss": 1.1289,
261
  "step": 140
262
  },
263
  {
264
  "epoch": 2.62,
265
- "learning_rate": 0.0004985462964079136,
266
- "loss": 0.9598,
267
  "step": 144
268
  },
269
  {
270
  "epoch": 2.69,
271
- "learning_rate": 0.0004981845669415021,
272
- "loss": 0.8825,
273
  "step": 148
274
  },
275
  {
276
  "epoch": 2.76,
277
- "learning_rate": 0.0004977828505250904,
278
- "loss": 0.985,
279
  "step": 152
280
  },
281
  {
282
  "epoch": 2.84,
283
- "learning_rate": 0.0004973412118823412,
284
- "loss": 0.9869,
285
  "step": 156
286
  },
287
  {
288
  "epoch": 2.91,
289
- "learning_rate": 0.0004968597221690986,
290
- "loss": 0.985,
291
  "step": 160
292
  },
293
  {
294
  "epoch": 2.98,
295
- "learning_rate": 0.0004963384589619233,
296
- "loss": 1.1215,
297
  "step": 164
298
  },
299
  {
300
  "epoch": 3.0,
301
- "gpt4_scores": 0.67,
302
- "step": 165
303
- },
304
- {
305
- "epoch": 3.0,
306
- "std": 0.11920570456148481,
307
  "step": 165
308
  },
309
  {
310
  "epoch": 3.0,
311
- "eval_loss": 2.164942741394043,
312
- "eval_runtime": 4.9498,
313
- "eval_samples_per_second": 4.647,
314
- "eval_steps_per_second": 1.212,
315
  "step": 165
316
  },
317
  {
318
  "epoch": 3.05,
319
- "learning_rate": 0.0004957775062455933,
320
- "loss": 0.8362,
321
  "step": 168
322
  },
323
  {
324
  "epoch": 3.13,
325
- "learning_rate": 0.0004951769543995731,
326
- "loss": 0.3812,
327
  "step": 172
328
  },
329
  {
330
  "epoch": 3.2,
331
- "learning_rate": 0.0004945369001834514,
332
- "loss": 0.4813,
333
  "step": 176
334
  },
335
  {
336
  "epoch": 3.27,
337
- "learning_rate": 0.0004938574467213517,
338
- "loss": 0.4693,
339
  "step": 180
340
  },
341
  {
342
  "epoch": 3.35,
343
- "learning_rate": 0.0004931387034853173,
344
- "loss": 0.447,
345
  "step": 184
346
  },
347
  {
348
  "epoch": 3.42,
349
- "learning_rate": 0.0004923807862776728,
350
- "loss": 0.6197,
351
  "step": 188
352
  },
353
  {
354
  "epoch": 3.49,
355
- "learning_rate": 0.0004915838172123671,
356
- "loss": 0.5158,
357
  "step": 192
358
  },
359
  {
360
  "epoch": 3.56,
361
- "learning_rate": 0.0004907479246952981,
362
- "loss": 0.5785,
363
  "step": 196
364
  },
365
  {
366
  "epoch": 3.64,
367
- "learning_rate": 0.0004898732434036243,
368
- "loss": 0.5258,
369
  "step": 200
370
  },
371
  {
372
  "epoch": 3.71,
373
- "learning_rate": 0.0004889599142640663,
374
- "loss": 0.4577,
375
  "step": 204
376
  },
377
  {
378
  "epoch": 3.78,
379
- "learning_rate": 0.0004880080844302004,
380
- "loss": 0.5965,
381
  "step": 208
382
  },
383
  {
384
  "epoch": 3.85,
385
- "learning_rate": 0.0004870179072587499,
386
- "loss": 0.5373,
387
  "step": 212
388
  },
389
  {
390
  "epoch": 3.93,
391
- "learning_rate": 0.0004859895422848767,
392
- "loss": 0.4731,
393
  "step": 216
394
  },
395
  {
396
  "epoch": 4.0,
397
- "learning_rate": 0.0004849231551964771,
398
- "loss": 0.4876,
399
- "step": 220
400
- },
401
- {
402
- "epoch": 4.0,
403
- "gpt4_scores": 0.61,
404
  "step": 220
405
  },
406
  {
407
  "epoch": 4.0,
408
- "std": 0.12445882853377657,
 
409
  "step": 220
410
  },
411
  {
412
  "epoch": 4.0,
413
- "eval_loss": 2.6900253295898438,
414
- "eval_runtime": 4.9381,
415
- "eval_samples_per_second": 4.658,
416
- "eval_steps_per_second": 1.215,
417
  "step": 220
418
  },
419
  {
420
  "epoch": 4.07,
421
- "learning_rate": 0.00048381891780748665,
422
- "loss": 0.288,
423
  "step": 224
424
  },
425
  {
426
  "epoch": 4.15,
427
- "learning_rate": 0.00048267700803019775,
428
- "loss": 0.2556,
429
  "step": 228
430
  },
431
  {
432
  "epoch": 4.22,
433
- "learning_rate": 0.0004814976098465951,
434
- "loss": 0.2897,
435
  "step": 232
436
  },
437
  {
438
  "epoch": 4.29,
439
- "learning_rate": 0.00048028091327871256,
440
- "loss": 0.2375,
441
  "step": 236
442
  },
443
  {
444
  "epoch": 4.36,
445
- "learning_rate": 0.0004790271143580174,
446
- "loss": 0.2367,
447
  "step": 240
448
  },
449
  {
450
  "epoch": 4.44,
451
- "learning_rate": 0.00047773641509382626,
452
- "loss": 0.2878,
453
  "step": 244
454
  },
455
  {
456
  "epoch": 4.51,
457
- "learning_rate": 0.0004764090234407577,
458
- "loss": 0.3138,
459
  "step": 248
460
  },
461
  {
462
  "epoch": 4.58,
463
- "learning_rate": 0.00047504515326522696,
464
- "loss": 0.277,
465
  "step": 252
466
  },
467
  {
468
  "epoch": 4.65,
469
- "learning_rate": 0.0004736450243109884,
470
- "loss": 0.2624,
471
  "step": 256
472
  },
473
  {
474
  "epoch": 4.73,
475
- "learning_rate": 0.0004722088621637309,
476
- "loss": 0.2888,
477
  "step": 260
478
  },
479
  {
480
  "epoch": 4.8,
481
- "learning_rate": 0.00047073689821473173,
482
- "loss": 0.2256,
483
  "step": 264
484
  },
485
  {
486
  "epoch": 4.87,
487
- "learning_rate": 0.00046922936962357577,
488
- "loss": 0.2243,
489
  "step": 268
490
  },
491
  {
492
  "epoch": 4.95,
493
- "learning_rate": 0.00046768651927994433,
494
- "loss": 0.2916,
495
  "step": 272
496
  },
497
  {
498
  "epoch": 5.0,
499
- "gpt4_scores": 0.63,
500
- "step": 275
501
- },
502
- {
503
- "epoch": 5.0,
504
- "std": 0.12573782247199924,
505
  "step": 275
506
  },
507
  {
508
  "epoch": 5.0,
509
- "eval_loss": 2.821211099624634,
510
- "eval_runtime": 4.9351,
511
- "eval_samples_per_second": 4.661,
512
- "eval_steps_per_second": 1.216,
513
  "step": 275
514
  },
515
  {
516
  "epoch": 5.02,
517
- "learning_rate": 0.0004661085957644817,
518
- "loss": 0.233,
519
  "step": 276
520
  },
521
  {
522
  "epoch": 5.09,
523
- "learning_rate": 0.0004644958533087443,
524
- "loss": 0.1537,
525
  "step": 280
526
  },
527
  {
528
  "epoch": 5.16,
529
- "learning_rate": 0.0004628485517542392,
530
- "loss": 0.1743,
531
  "step": 284
532
  },
533
  {
534
  "epoch": 5.24,
535
- "learning_rate": 0.0004611669565105596,
536
- "loss": 0.1827,
537
  "step": 288
538
  },
539
  {
540
  "epoch": 5.31,
541
- "learning_rate": 0.00045945133851262184,
542
- "loss": 0.1928,
543
  "step": 292
544
  },
545
  {
546
  "epoch": 5.38,
547
- "learning_rate": 0.00045770197417701366,
548
- "loss": 0.1718,
549
  "step": 296
550
  },
551
  {
552
  "epoch": 5.45,
553
- "learning_rate": 0.0004559191453574582,
554
- "loss": 0.166,
555
  "step": 300
556
  },
557
  {
558
  "epoch": 5.53,
559
- "learning_rate": 0.00045410313929940244,
560
- "loss": 0.1484,
561
  "step": 304
562
  },
563
  {
564
  "epoch": 5.6,
565
- "learning_rate": 0.0004522542485937369,
566
- "loss": 0.1521,
567
  "step": 308
568
  },
569
  {
570
  "epoch": 5.67,
571
- "learning_rate": 0.00045037277112965383,
572
- "loss": 0.195,
573
  "step": 312
574
  },
575
  {
576
  "epoch": 5.75,
577
- "learning_rate": 0.0004484590100466523,
578
- "loss": 0.153,
579
  "step": 316
580
  },
581
  {
582
  "epoch": 5.82,
583
- "learning_rate": 0.0004465132736856969,
584
- "loss": 0.1751,
585
  "step": 320
586
  },
587
  {
588
  "epoch": 5.89,
589
- "learning_rate": 0.0004445358755395382,
590
- "loss": 0.2153,
591
  "step": 324
592
  },
593
  {
594
  "epoch": 5.96,
595
- "learning_rate": 0.00044252713420220394,
596
- "loss": 0.1983,
597
  "step": 328
598
  },
599
  {
600
  "epoch": 6.0,
601
- "gpt4_scores": 0.7500000000000001,
602
- "step": 330
603
- },
604
- {
605
- "epoch": 6.0,
606
- "std": 0.0972111104761179,
607
  "step": 330
608
  },
609
  {
610
  "epoch": 6.0,
611
- "eval_loss": 2.8983585834503174,
612
- "eval_runtime": 4.9684,
613
- "eval_samples_per_second": 4.629,
614
- "eval_steps_per_second": 1.208,
615
  "step": 330
616
  },
617
  {
618
  "epoch": 6.04,
619
- "learning_rate": 0.0004404873733176677,
620
- "loss": 0.1353,
621
  "step": 332
622
  },
623
  {
624
  "epoch": 6.11,
625
- "learning_rate": 0.00043841692152770415,
626
- "loss": 0.102,
627
  "step": 336
628
  },
629
  {
630
  "epoch": 6.18,
631
- "learning_rate": 0.0004363161124189387,
632
- "loss": 0.1055,
633
  "step": 340
634
  },
635
  {
636
  "epoch": 6.25,
637
- "learning_rate": 0.00043418528446910123,
638
- "loss": 0.0998,
639
  "step": 344
640
  },
641
  {
642
  "epoch": 6.33,
643
- "learning_rate": 0.00043202478099249104,
644
- "loss": 0.1168,
645
  "step": 348
646
  },
647
  {
648
  "epoch": 6.4,
649
- "learning_rate": 0.0004298349500846628,
650
- "loss": 0.1156,
651
  "step": 352
652
  },
653
  {
654
  "epoch": 6.47,
655
- "learning_rate": 0.00042761614456634226,
656
- "loss": 0.1345,
657
  "step": 356
658
  },
659
  {
660
  "epoch": 6.55,
661
- "learning_rate": 0.00042536872192658034,
662
- "loss": 0.0968,
663
  "step": 360
664
  },
665
  {
666
  "epoch": 6.62,
667
- "learning_rate": 0.0004230930442651557,
668
- "loss": 0.1179,
669
  "step": 364
670
  },
671
  {
672
  "epoch": 6.69,
673
- "learning_rate": 0.00042078947823423365,
674
- "loss": 0.0959,
675
  "step": 368
676
  },
677
  {
678
  "epoch": 6.76,
679
- "learning_rate": 0.00041845839497929203,
680
- "loss": 0.1078,
681
  "step": 372
682
  },
683
  {
684
  "epoch": 6.84,
685
- "learning_rate": 0.0004161001700793231,
686
- "loss": 0.1175,
687
  "step": 376
688
  },
689
  {
690
  "epoch": 6.91,
691
- "learning_rate": 0.0004137151834863213,
692
- "loss": 0.1095,
693
  "step": 380
694
  },
695
  {
696
  "epoch": 6.98,
697
- "learning_rate": 0.00041130381946406574,
698
- "loss": 0.1199,
699
  "step": 384
700
  },
701
  {
702
  "epoch": 7.0,
703
- "gpt4_scores": 0.49000000000000005,
704
- "step": 385
705
- },
706
- {
707
- "epoch": 7.0,
708
- "std": 0.12525973016097391,
709
  "step": 385
710
  },
711
  {
712
  "epoch": 7.0,
713
- "eval_loss": 2.949794292449951,
714
- "eval_runtime": 4.9646,
715
- "eval_samples_per_second": 4.633,
716
- "eval_steps_per_second": 1.209,
717
  "step": 385
718
  },
719
  {
720
  "epoch": 7.05,
721
- "learning_rate": 0.0004088664665262091,
722
- "loss": 0.076,
723
  "step": 388
724
  },
725
  {
726
  "epoch": 7.13,
727
- "learning_rate": 0.0004064035173736804,
728
- "loss": 0.0709,
729
  "step": 392
730
  },
731
  {
732
  "epoch": 7.2,
733
- "learning_rate": 0.00040391536883141455,
734
- "loss": 0.073,
735
  "step": 396
736
  },
737
  {
738
  "epoch": 7.27,
739
- "learning_rate": 0.00040140242178441667,
740
- "loss": 0.0819,
741
  "step": 400
742
  },
743
  {
744
  "epoch": 7.35,
745
- "learning_rate": 0.000398865081113172,
746
- "loss": 0.0643,
747
  "step": 404
748
  },
749
  {
750
  "epoch": 7.42,
751
- "learning_rate": 0.0003963037556284129,
752
- "loss": 0.1603,
753
  "step": 408
754
  },
755
  {
756
  "epoch": 7.49,
757
- "learning_rate": 0.0003937188580052518,
758
- "loss": 0.119,
759
  "step": 412
760
  },
761
  {
762
  "epoch": 7.56,
763
- "learning_rate": 0.0003911108047166924,
764
- "loss": 0.0754,
765
  "step": 416
766
  },
767
  {
768
  "epoch": 7.64,
769
- "learning_rate": 0.0003884800159665276,
770
- "loss": 0.0622,
771
  "step": 420
772
  },
773
  {
774
  "epoch": 7.71,
775
- "learning_rate": 0.00038582691562163827,
776
- "loss": 0.0711,
777
  "step": 424
778
  },
779
  {
780
  "epoch": 7.78,
781
- "learning_rate": 0.00038315193114369994,
782
- "loss": 0.0643,
783
  "step": 428
784
  },
785
  {
786
  "epoch": 7.85,
787
- "learning_rate": 0.0003804554935203115,
788
- "loss": 0.0672,
789
  "step": 432
790
  },
791
  {
792
  "epoch": 7.93,
793
- "learning_rate": 0.00037773803719555514,
794
- "loss": 0.0688,
795
  "step": 436
796
  },
797
  {
798
  "epoch": 8.0,
799
- "learning_rate": 0.000375,
800
- "loss": 0.0743,
801
- "step": 440
802
- },
803
- {
804
- "epoch": 8.0,
805
- "gpt4_scores": 0.51,
806
  "step": 440
807
  },
808
  {
809
  "epoch": 8.0,
810
- "std": 0.12763228431709586,
 
811
  "step": 440
812
  },
813
  {
814
  "epoch": 8.0,
815
- "eval_loss": 3.250723123550415,
816
- "eval_runtime": 4.9396,
817
- "eval_samples_per_second": 4.656,
818
- "eval_steps_per_second": 1.215,
819
  "step": 440
820
  },
821
  {
822
  "epoch": 8.07,
823
- "learning_rate": 0.00037224182308015974,
824
- "loss": 0.0528,
825
  "step": 444
826
  },
827
  {
828
  "epoch": 8.15,
829
- "learning_rate": 0.0003694639508274158,
830
- "loss": 0.0502,
831
  "step": 448
832
  },
833
  {
834
  "epoch": 8.22,
835
- "learning_rate": 0.00036666683080641843,
836
- "loss": 0.0659,
837
  "step": 452
838
  },
839
  {
840
  "epoch": 8.29,
841
- "learning_rate": 0.0003638509136829758,
842
- "loss": 0.065,
843
  "step": 456
844
  },
845
  {
846
  "epoch": 8.36,
847
- "learning_rate": 0.00036101665315144355,
848
- "loss": 0.0524,
849
  "step": 460
850
  },
851
  {
852
  "epoch": 8.44,
853
- "learning_rate": 0.00035816450586162706,
854
- "loss": 0.0565,
855
  "step": 464
856
  },
857
  {
858
  "epoch": 8.51,
859
- "learning_rate": 0.00035529493134520666,
860
- "loss": 0.0432,
861
  "step": 468
862
  },
863
  {
864
  "epoch": 8.58,
865
- "learning_rate": 0.00035240839194169884,
866
- "loss": 0.0582,
867
  "step": 472
868
  },
869
  {
870
  "epoch": 8.65,
871
- "learning_rate": 0.0003495053527239656,
872
- "loss": 0.0507,
873
  "step": 476
874
  },
875
  {
876
  "epoch": 8.73,
877
- "learning_rate": 0.00034658628142328216,
878
- "loss": 0.0571,
879
  "step": 480
880
  },
881
  {
882
  "epoch": 8.8,
883
- "learning_rate": 0.00034365164835397803,
884
- "loss": 0.0925,
885
  "step": 484
886
  },
887
  {
888
  "epoch": 8.87,
889
- "learning_rate": 0.00034070192633766023,
890
- "loss": 0.0601,
891
  "step": 488
892
  },
893
  {
894
  "epoch": 8.95,
895
- "learning_rate": 0.00033773759062703394,
896
- "loss": 0.062,
897
  "step": 492
898
  },
899
  {
900
  "epoch": 9.0,
901
- "gpt4_scores": 0.53,
902
- "step": 495
903
- },
904
- {
905
- "epoch": 9.0,
906
- "std": 0.1192057045614848,
907
  "step": 495
908
  },
909
  {
910
  "epoch": 9.0,
911
- "eval_loss": 3.1474077701568604,
912
- "eval_runtime": 4.9704,
913
- "eval_samples_per_second": 4.627,
914
- "eval_steps_per_second": 1.207,
915
  "step": 495
916
  },
917
  {
918
  "epoch": 9.02,
919
- "learning_rate": 0.0003347591188293301,
920
- "loss": 0.0535,
921
  "step": 496
922
  },
923
  {
924
  "epoch": 9.09,
925
- "learning_rate": 0.00033176699082935546,
926
- "loss": 0.0484,
927
  "step": 500
928
  },
929
  {
930
  "epoch": 9.16,
931
- "learning_rate": 0.00032876168871217323,
932
- "loss": 0.0541,
933
  "step": 504
934
  },
935
  {
936
  "epoch": 9.24,
937
- "learning_rate": 0.00032574369668543187,
938
- "loss": 0.0438,
939
  "step": 508
940
  },
941
  {
942
  "epoch": 9.31,
943
- "learning_rate": 0.00032271350100134975,
944
- "loss": 0.0451,
945
  "step": 512
946
  },
947
  {
948
  "epoch": 9.38,
949
- "learning_rate": 0.00031967158987837195,
950
- "loss": 0.0403,
951
  "step": 516
952
  },
953
  {
954
  "epoch": 9.45,
955
- "learning_rate": 0.0003166184534225087,
956
- "loss": 0.0517,
957
  "step": 520
958
  },
959
  {
960
  "epoch": 9.53,
961
- "learning_rate": 0.0003135545835483718,
962
- "loss": 0.0498,
963
  "step": 524
964
  },
965
  {
966
  "epoch": 9.6,
967
- "learning_rate": 0.0003104804738999169,
968
- "loss": 0.0516,
969
  "step": 528
970
  },
971
  {
972
  "epoch": 9.67,
973
- "learning_rate": 0.00030739661977091025,
974
- "loss": 0.0487,
975
  "step": 532
976
  },
977
  {
978
  "epoch": 9.75,
979
- "learning_rate": 0.00030430351802512693,
980
- "loss": 0.045,
981
  "step": 536
982
  },
983
  {
984
  "epoch": 9.82,
985
- "learning_rate": 0.0003012016670162977,
986
- "loss": 0.0397,
987
  "step": 540
988
  },
989
  {
990
  "epoch": 9.89,
991
- "learning_rate": 0.00029809156650781527,
992
- "loss": 0.0484,
993
  "step": 544
994
  },
995
  {
996
  "epoch": 9.96,
997
- "learning_rate": 0.0002949737175922135,
998
- "loss": 0.0404,
999
  "step": 548
1000
  },
1001
  {
1002
  "epoch": 10.0,
1003
- "gpt4_scores": 0.6749999999999999,
 
1004
  "step": 550
1005
  },
1006
  {
1007
  "epoch": 10.0,
1008
- "std": 0.11602801385872293,
 
 
 
1009
  "step": 550
1010
  },
1011
  {
1012
  "epoch": 10.0,
1013
- "eval_loss": 3.3666398525238037,
1014
- "eval_runtime": 4.9475,
1015
- "eval_samples_per_second": 4.649,
1016
- "eval_steps_per_second": 1.213,
1017
- "step": 550
1018
- },
1019
- {
1020
- "epoch": 10.04,
1021
- "learning_rate": 0.0002918486226104327,
1022
- "loss": 0.0403,
1023
- "step": 552
1024
- },
1025
- {
1026
- "epoch": 10.11,
1027
- "learning_rate": 0.0002887167850708831,
1028
- "loss": 0.0401,
1029
- "step": 556
1030
- },
1031
- {
1032
- "epoch": 10.18,
1033
- "learning_rate": 0.00028557870956832135,
1034
- "loss": 0.0442,
1035
- "step": 560
1036
- },
1037
- {
1038
- "epoch": 10.25,
1039
- "learning_rate": 0.00028243490170255044,
1040
- "loss": 0.041,
1041
- "step": 564
1042
- },
1043
- {
1044
- "epoch": 10.33,
1045
- "learning_rate": 0.0002792858679969596,
1046
- "loss": 0.036,
1047
- "step": 568
1048
- },
1049
- {
1050
- "epoch": 10.4,
1051
- "learning_rate": 0.0002761321158169134,
1052
- "loss": 0.0533,
1053
- "step": 572
1054
- },
1055
- {
1056
- "epoch": 10.47,
1057
- "learning_rate": 0.0002729741532880069,
1058
- "loss": 0.0506,
1059
- "step": 576
1060
- },
1061
- {
1062
- "epoch": 10.55,
1063
- "learning_rate": 0.0002698124892141971,
1064
- "loss": 0.0532,
1065
- "step": 580
1066
- },
1067
- {
1068
- "epoch": 10.62,
1069
- "learning_rate": 0.000266647632995826,
1070
- "loss": 0.0468,
1071
- "step": 584
1072
- },
1073
- {
1074
- "epoch": 10.69,
1075
- "learning_rate": 0.0002634800945475465,
1076
- "loss": 0.0396,
1077
- "step": 588
1078
- },
1079
- {
1080
- "epoch": 10.76,
1081
- "learning_rate": 0.00026031038421616684,
1082
- "loss": 0.0383,
1083
- "step": 592
1084
- },
1085
- {
1086
- "epoch": 10.84,
1087
- "learning_rate": 0.00025713901269842405,
1088
- "loss": 0.0475,
1089
- "step": 596
1090
- },
1091
- {
1092
- "epoch": 10.91,
1093
- "learning_rate": 0.000253966490958702,
1094
- "loss": 0.0491,
1095
- "step": 600
1096
- },
1097
- {
1098
- "epoch": 10.98,
1099
- "learning_rate": 0.00025079333014670557,
1100
- "loss": 0.038,
1101
- "step": 604
1102
- },
1103
- {
1104
- "epoch": 11.0,
1105
- "gpt4_scores": 0.63,
1106
- "step": 605
1107
- },
1108
- {
1109
- "epoch": 11.0,
1110
- "std": 0.11580155439371269,
1111
- "step": 605
1112
- },
1113
- {
1114
- "epoch": 11.0,
1115
- "eval_loss": 3.3349289894104004,
1116
- "eval_runtime": 4.9384,
1117
- "eval_samples_per_second": 4.657,
1118
- "eval_steps_per_second": 1.215,
1119
- "step": 605
1120
- },
1121
- {
1122
- "epoch": 11.05,
1123
- "learning_rate": 0.00024762004151510585,
1124
- "loss": 0.0482,
1125
- "step": 608
1126
- },
1127
- {
1128
- "epoch": 11.13,
1129
- "learning_rate": 0.00024444713633716764,
1130
- "loss": 0.0356,
1131
- "step": 612
1132
- },
1133
- {
1134
- "epoch": 11.2,
1135
- "learning_rate": 0.00024127512582437484,
1136
- "loss": 0.0444,
1137
- "step": 616
1138
- },
1139
- {
1140
- "epoch": 11.27,
1141
- "learning_rate": 0.00023810452104406444,
1142
- "loss": 0.0411,
1143
- "step": 620
1144
- },
1145
- {
1146
- "epoch": 11.35,
1147
- "learning_rate": 0.00023493583283708543,
1148
- "loss": 0.0441,
1149
- "step": 624
1150
- },
1151
- {
1152
- "epoch": 11.42,
1153
- "learning_rate": 0.00023176957173549233,
1154
- "loss": 0.0393,
1155
- "step": 628
1156
- },
1157
- {
1158
- "epoch": 11.49,
1159
- "learning_rate": 0.00022860624788029015,
1160
- "loss": 0.0363,
1161
- "step": 632
1162
- },
1163
- {
1164
- "epoch": 11.56,
1165
- "learning_rate": 0.00022544637093924072,
1166
- "loss": 0.0393,
1167
- "step": 636
1168
- },
1169
- {
1170
- "epoch": 11.64,
1171
- "learning_rate": 0.00022229045002474727,
1172
- "loss": 0.0452,
1173
- "step": 640
1174
- },
1175
- {
1176
- "epoch": 11.71,
1177
- "learning_rate": 0.00021913899361182632,
1178
- "loss": 0.0417,
1179
- "step": 644
1180
- },
1181
- {
1182
- "epoch": 11.78,
1183
- "learning_rate": 0.000215992509456184,
1184
- "loss": 0.0366,
1185
- "step": 648
1186
- },
1187
- {
1188
- "epoch": 11.85,
1189
- "learning_rate": 0.00021285150451240712,
1190
- "loss": 0.0443,
1191
- "step": 652
1192
- },
1193
- {
1194
- "epoch": 11.93,
1195
- "learning_rate": 0.000209716484852284,
1196
- "loss": 0.0447,
1197
- "step": 656
1198
- },
1199
- {
1200
- "epoch": 12.0,
1201
- "learning_rate": 0.00020658795558326743,
1202
- "loss": 0.0374,
1203
- "step": 660
1204
- },
1205
- {
1206
- "epoch": 12.0,
1207
- "gpt4_scores": 0.6100000000000001,
1208
- "step": 660
1209
- },
1210
- {
1211
- "epoch": 12.0,
1212
- "std": 0.12841339493993606,
1213
- "step": 660
1214
- },
1215
- {
1216
- "epoch": 12.0,
1217
- "eval_loss": 3.4455862045288086,
1218
- "eval_runtime": 4.9572,
1219
- "eval_samples_per_second": 4.64,
1220
- "eval_steps_per_second": 1.21,
1221
- "step": 660
1222
- },
1223
- {
1224
- "epoch": 12.07,
1225
- "learning_rate": 0.0002034664207670925,
1226
- "loss": 0.0442,
1227
- "step": 664
1228
- },
1229
- {
1230
- "epoch": 12.15,
1231
- "learning_rate": 0.00020035238333856371,
1232
- "loss": 0.0391,
1233
- "step": 668
1234
- },
1235
- {
1236
- "epoch": 12.22,
1237
- "learning_rate": 0.0001972463450245226,
1238
- "loss": 0.0392,
1239
- "step": 672
1240
- },
1241
- {
1242
- "epoch": 12.29,
1243
- "learning_rate": 0.00019414880626301146,
1244
- "loss": 0.0426,
1245
- "step": 676
1246
- },
1247
- {
1248
- "epoch": 12.36,
1249
- "learning_rate": 0.00019106026612264316,
1250
- "loss": 0.043,
1251
- "step": 680
1252
- },
1253
- {
1254
- "epoch": 12.44,
1255
- "learning_rate": 0.0001879812222221929,
1256
- "loss": 0.0395,
1257
- "step": 684
1258
- },
1259
- {
1260
- "epoch": 12.51,
1261
- "learning_rate": 0.00018491217065042198,
1262
- "loss": 0.0396,
1263
- "step": 688
1264
- },
1265
- {
1266
- "epoch": 12.58,
1267
- "learning_rate": 0.00018185360588615057,
1268
- "loss": 0.0394,
1269
- "step": 692
1270
- },
1271
- {
1272
- "epoch": 12.65,
1273
- "learning_rate": 0.00017880602071858692,
1274
- "loss": 0.0403,
1275
- "step": 696
1276
- },
1277
- {
1278
- "epoch": 12.73,
1279
- "learning_rate": 0.00017576990616793137,
1280
- "loss": 0.0349,
1281
- "step": 700
1282
- },
1283
- {
1284
- "epoch": 12.8,
1285
- "learning_rate": 0.00017274575140626317,
1286
- "loss": 0.0404,
1287
- "step": 704
1288
- },
1289
- {
1290
- "epoch": 12.87,
1291
- "learning_rate": 0.0001697340436787273,
1292
- "loss": 0.0344,
1293
- "step": 708
1294
- },
1295
- {
1296
- "epoch": 12.95,
1297
- "learning_rate": 0.00016673526822502983,
1298
- "loss": 0.0384,
1299
- "step": 712
1300
- },
1301
- {
1302
- "epoch": 13.0,
1303
- "gpt4_scores": 0.53,
1304
- "step": 715
1305
- },
1306
- {
1307
- "epoch": 13.0,
1308
- "std": 0.13787675656179324,
1309
- "step": 715
1310
- },
1311
- {
1312
- "epoch": 13.0,
1313
- "eval_loss": 3.4822309017181396,
1314
- "eval_runtime": 4.946,
1315
- "eval_samples_per_second": 4.65,
1316
- "eval_steps_per_second": 1.213,
1317
- "step": 715
1318
- },
1319
- {
1320
- "epoch": 13.02,
1321
- "learning_rate": 0.0001637499082012574,
1322
- "loss": 0.0522,
1323
- "step": 716
1324
- },
1325
- {
1326
- "epoch": 13.09,
1327
- "learning_rate": 0.00016077844460203207,
1328
- "loss": 0.0384,
1329
- "step": 720
1330
- },
1331
- {
1332
- "epoch": 13.16,
1333
- "learning_rate": 0.00015782135618301485,
1334
- "loss": 0.0336,
1335
- "step": 724
1336
- },
1337
- {
1338
- "epoch": 13.24,
1339
- "learning_rate": 0.00015487911938376925,
1340
- "loss": 0.0401,
1341
- "step": 728
1342
- },
1343
- {
1344
- "epoch": 13.31,
1345
- "learning_rate": 0.00015195220825099862,
1346
- "loss": 0.0395,
1347
- "step": 732
1348
- },
1349
- {
1350
- "epoch": 13.38,
1351
- "learning_rate": 0.00014904109436216883,
1352
- "loss": 0.0414,
1353
- "step": 736
1354
- },
1355
- {
1356
- "epoch": 13.45,
1357
- "learning_rate": 0.0001461462467495284,
1358
- "loss": 0.0393,
1359
- "step": 740
1360
- },
1361
- {
1362
- "epoch": 13.53,
1363
- "learning_rate": 0.00014326813182453956,
1364
- "loss": 0.0383,
1365
- "step": 744
1366
- },
1367
- {
1368
- "epoch": 13.6,
1369
- "learning_rate": 0.00014040721330273062,
1370
- "loss": 0.038,
1371
- "step": 748
1372
- },
1373
- {
1374
- "epoch": 13.67,
1375
- "learning_rate": 0.0001375639521289836,
1376
- "loss": 0.0383,
1377
- "step": 752
1378
- },
1379
- {
1380
- "epoch": 13.75,
1381
- "learning_rate": 0.00013473880640326724,
1382
- "loss": 0.0405,
1383
- "step": 756
1384
- },
1385
- {
1386
- "epoch": 13.82,
1387
- "learning_rate": 0.00013193223130682935,
1388
- "loss": 0.0422,
1389
- "step": 760
1390
- },
1391
- {
1392
- "epoch": 13.89,
1393
- "learning_rate": 0.000129144679028859,
1394
- "loss": 0.0408,
1395
- "step": 764
1396
- },
1397
- {
1398
- "epoch": 13.96,
1399
- "learning_rate": 0.00012637659869363084,
1400
- "loss": 0.0408,
1401
- "step": 768
1402
- },
1403
- {
1404
- "epoch": 14.0,
1405
- "gpt4_scores": 0.6500000000000001,
1406
- "step": 770
1407
- },
1408
- {
1409
- "epoch": 14.0,
1410
- "std": 0.11423659658795862,
1411
- "step": 770
1412
- },
1413
- {
1414
- "epoch": 14.0,
1415
- "eval_loss": 3.471827983856201,
1416
- "eval_runtime": 4.9416,
1417
- "eval_samples_per_second": 4.654,
1418
- "eval_steps_per_second": 1.214,
1419
- "step": 770
1420
- },
1421
- {
1422
- "epoch": 14.04,
1423
- "learning_rate": 0.00012362843628814266,
1424
- "loss": 0.0343,
1425
- "step": 772
1426
- },
1427
- {
1428
- "epoch": 14.11,
1429
- "learning_rate": 0.00012090063459025954,
1430
- "loss": 0.0386,
1431
- "step": 776
1432
- },
1433
- {
1434
- "epoch": 14.18,
1435
- "learning_rate": 0.00011819363309737438,
1436
- "loss": 0.0412,
1437
- "step": 780
1438
- },
1439
- {
1440
- "epoch": 14.25,
1441
- "learning_rate": 0.0001155078679555969,
1442
- "loss": 0.0402,
1443
- "step": 784
1444
- },
1445
- {
1446
- "epoch": 14.33,
1447
- "learning_rate": 0.00011284377188948258,
1448
- "loss": 0.038,
1449
- "step": 788
1450
- },
1451
- {
1452
- "epoch": 14.4,
1453
- "learning_rate": 0.00011020177413231333,
1454
- "loss": 0.0367,
1455
- "step": 792
1456
- },
1457
- {
1458
- "epoch": 14.47,
1459
- "learning_rate": 0.0001075823003569403,
1460
- "loss": 0.0449,
1461
- "step": 796
1462
- },
1463
- {
1464
- "epoch": 14.55,
1465
- "learning_rate": 0.00010498577260720049,
1466
- "loss": 0.036,
1467
- "step": 800
1468
- },
1469
- {
1470
- "epoch": 14.62,
1471
- "learning_rate": 0.00010241260922991761,
1472
- "loss": 0.0385,
1473
- "step": 804
1474
- },
1475
- {
1476
- "epoch": 14.69,
1477
- "learning_rate": 9.986322480749927e-05,
1478
- "loss": 0.0428,
1479
- "step": 808
1480
- },
1481
- {
1482
- "epoch": 14.76,
1483
- "learning_rate": 9.733803009114044e-05,
1484
- "loss": 0.0375,
1485
- "step": 812
1486
- },
1487
- {
1488
- "epoch": 14.84,
1489
- "learning_rate": 9.483743193464408e-05,
1490
- "loss": 0.0435,
1491
- "step": 816
1492
- },
1493
- {
1494
- "epoch": 14.91,
1495
- "learning_rate": 9.236183322886945e-05,
1496
- "loss": 0.0394,
1497
- "step": 820
1498
- },
1499
- {
1500
- "epoch": 14.98,
1501
- "learning_rate": 8.991163283681945e-05,
1502
- "loss": 0.0347,
1503
- "step": 824
1504
- },
1505
- {
1506
- "epoch": 15.0,
1507
- "gpt4_scores": 0.67,
1508
- "step": 825
1509
- },
1510
- {
1511
- "epoch": 15.0,
1512
- "std": 0.11644741302407709,
1513
- "step": 825
1514
- },
1515
- {
1516
- "epoch": 15.0,
1517
- "eval_loss": 3.502847671508789,
1518
- "eval_runtime": 4.9425,
1519
- "eval_samples_per_second": 4.654,
1520
- "eval_steps_per_second": 1.214,
1521
- "step": 825
1522
- },
1523
- {
1524
- "epoch": 15.05,
1525
- "learning_rate": 8.748722552937688e-05,
1526
- "loss": 0.0395,
1527
- "step": 828
1528
- },
1529
- {
1530
- "epoch": 15.13,
1531
- "learning_rate": 8.508900192169963e-05,
1532
- "loss": 0.0389,
1533
- "step": 832
1534
- },
1535
- {
1536
- "epoch": 15.2,
1537
- "learning_rate": 8.271734841028553e-05,
1538
- "loss": 0.0425,
1539
- "step": 836
1540
- },
1541
- {
1542
- "epoch": 15.27,
1543
- "learning_rate": 8.037264711071699e-05,
1544
- "loss": 0.0403,
1545
- "step": 840
1546
- },
1547
- {
1548
- "epoch": 15.35,
1549
- "learning_rate": 7.805527579609576e-05,
1550
- "loss": 0.0405,
1551
- "step": 844
1552
- },
1553
- {
1554
- "epoch": 15.42,
1555
- "learning_rate": 7.576560783617667e-05,
1556
- "loss": 0.0392,
1557
- "step": 848
1558
- },
1559
- {
1560
- "epoch": 15.49,
1561
- "learning_rate": 7.35040121372109e-05,
1562
- "loss": 0.0383,
1563
- "step": 852
1564
- },
1565
- {
1566
- "epoch": 15.56,
1567
- "learning_rate": 7.127085308250913e-05,
1568
- "loss": 0.0404,
1569
- "step": 856
1570
- },
1571
- {
1572
- "epoch": 15.64,
1573
- "learning_rate": 6.906649047373245e-05,
1574
- "loss": 0.0347,
1575
- "step": 860
1576
- },
1577
- {
1578
- "epoch": 15.71,
1579
- "learning_rate": 6.689127947292231e-05,
1580
- "loss": 0.0363,
1581
- "step": 864
1582
- },
1583
- {
1584
- "epoch": 15.78,
1585
- "learning_rate": 6.474557054527707e-05,
1586
- "loss": 0.0341,
1587
- "step": 868
1588
- },
1589
- {
1590
- "epoch": 15.85,
1591
- "learning_rate": 6.262970940268654e-05,
1592
- "loss": 0.0377,
1593
- "step": 872
1594
- },
1595
- {
1596
- "epoch": 15.93,
1597
- "learning_rate": 6.054403694803079e-05,
1598
- "loss": 0.0442,
1599
- "step": 876
1600
- },
1601
- {
1602
- "epoch": 16.0,
1603
- "learning_rate": 5.848888922025553e-05,
1604
- "loss": 0.0377,
1605
- "step": 880
1606
- },
1607
- {
1608
- "epoch": 16.0,
1609
- "gpt4_scores": 0.5700000000000001,
1610
- "step": 880
1611
- },
1612
- {
1613
- "epoch": 16.0,
1614
- "std": 0.1319469590403659,
1615
- "step": 880
1616
- },
1617
- {
1618
- "epoch": 16.0,
1619
- "eval_loss": 3.5217506885528564,
1620
- "eval_runtime": 4.9355,
1621
- "eval_samples_per_second": 4.66,
1622
- "eval_steps_per_second": 1.216,
1623
- "step": 880
1624
- },
1625
- {
1626
- "epoch": 16.07,
1627
- "learning_rate": 5.646459734022938e-05,
1628
- "loss": 0.041,
1629
- "step": 884
1630
- },
1631
- {
1632
- "epoch": 16.15,
1633
- "learning_rate": 5.4471487457395216e-05,
1634
- "loss": 0.0333,
1635
- "step": 888
1636
- },
1637
- {
1638
- "epoch": 16.22,
1639
- "learning_rate": 5.2509880697220956e-05,
1640
- "loss": 0.0407,
1641
- "step": 892
1642
- },
1643
- {
1644
- "epoch": 16.29,
1645
- "learning_rate": 5.058009310946118e-05,
1646
- "loss": 0.0391,
1647
- "step": 896
1648
- },
1649
- {
1650
- "epoch": 16.36,
1651
- "learning_rate": 4.8682435617235344e-05,
1652
- "loss": 0.0365,
1653
- "step": 900
1654
- },
1655
- {
1656
- "epoch": 16.44,
1657
- "learning_rate": 4.6817213966933034e-05,
1658
- "loss": 0.0425,
1659
- "step": 904
1660
- },
1661
- {
1662
- "epoch": 16.51,
1663
- "learning_rate": 4.498472867895223e-05,
1664
- "loss": 0.0363,
1665
- "step": 908
1666
- },
1667
- {
1668
- "epoch": 16.58,
1669
- "learning_rate": 4.318527499928074e-05,
1670
- "loss": 0.0391,
1671
- "step": 912
1672
- },
1673
- {
1674
- "epoch": 16.65,
1675
- "learning_rate": 4.141914285192619e-05,
1676
- "loss": 0.0375,
1677
- "step": 916
1678
- },
1679
- {
1680
- "epoch": 16.73,
1681
- "learning_rate": 3.968661679220467e-05,
1682
- "loss": 0.0388,
1683
- "step": 920
1684
- },
1685
- {
1686
- "epoch": 16.8,
1687
- "learning_rate": 3.798797596089351e-05,
1688
- "loss": 0.0417,
1689
- "step": 924
1690
- },
1691
- {
1692
- "epoch": 16.87,
1693
- "learning_rate": 3.632349403925664e-05,
1694
- "loss": 0.0378,
1695
- "step": 928
1696
- },
1697
- {
1698
- "epoch": 16.95,
1699
- "learning_rate": 3.4693439204949856e-05,
1700
- "loss": 0.0395,
1701
- "step": 932
1702
- },
1703
- {
1704
- "epoch": 17.0,
1705
- "gpt4_scores": 0.595,
1706
- "step": 935
1707
- },
1708
- {
1709
- "epoch": 17.0,
1710
- "std": 0.1132364782214636,
1711
- "step": 935
1712
- },
1713
- {
1714
- "epoch": 17.0,
1715
- "eval_loss": 3.5319790840148926,
1716
- "eval_runtime": 4.9408,
1717
- "eval_samples_per_second": 4.655,
1718
- "eval_steps_per_second": 1.214,
1719
- "step": 935
1720
- },
1721
- {
1722
- "epoch": 17.02,
1723
- "learning_rate": 3.309807408881269e-05,
1724
- "loss": 0.0416,
1725
- "step": 936
1726
- },
1727
- {
1728
- "epoch": 17.09,
1729
- "learning_rate": 3.1537655732553766e-05,
1730
- "loss": 0.0409,
1731
- "step": 940
1732
- },
1733
- {
1734
- "epoch": 17.16,
1735
- "learning_rate": 3.0012435547336736e-05,
1736
- "loss": 0.0412,
1737
- "step": 944
1738
- },
1739
- {
1740
- "epoch": 17.24,
1741
- "learning_rate": 2.8522659273273603e-05,
1742
- "loss": 0.0411,
1743
- "step": 948
1744
- },
1745
- {
1746
- "epoch": 17.31,
1747
- "learning_rate": 2.7068566939831645e-05,
1748
- "loss": 0.0332,
1749
- "step": 952
1750
- },
1751
- {
1752
- "epoch": 17.38,
1753
- "learning_rate": 2.5650392827160445e-05,
1754
- "loss": 0.0315,
1755
- "step": 956
1756
- },
1757
- {
1758
- "epoch": 17.45,
1759
- "learning_rate": 2.4268365428344735e-05,
1760
- "loss": 0.0448,
1761
- "step": 960
1762
- },
1763
- {
1764
- "epoch": 17.53,
1765
- "learning_rate": 2.29227074125907e-05,
1766
- "loss": 0.0349,
1767
- "step": 964
1768
- },
1769
- {
1770
- "epoch": 17.6,
1771
- "learning_rate": 2.1613635589349755e-05,
1772
- "loss": 0.0367,
1773
- "step": 968
1774
- },
1775
- {
1776
- "epoch": 17.67,
1777
- "learning_rate": 2.0341360873386672e-05,
1778
- "loss": 0.0386,
1779
- "step": 972
1780
- },
1781
- {
1782
- "epoch": 17.75,
1783
- "learning_rate": 1.9106088250797264e-05,
1784
- "loss": 0.0371,
1785
- "step": 976
1786
- },
1787
- {
1788
- "epoch": 17.82,
1789
- "learning_rate": 1.7908016745981858e-05,
1790
- "loss": 0.0369,
1791
- "step": 980
1792
- },
1793
- {
1794
- "epoch": 17.89,
1795
- "learning_rate": 1.674733938957873e-05,
1796
- "loss": 0.0464,
1797
- "step": 984
1798
- },
1799
- {
1800
- "epoch": 17.96,
1801
- "learning_rate": 1.562424318736344e-05,
1802
- "loss": 0.0408,
1803
- "step": 988
1804
- },
1805
- {
1806
- "epoch": 18.0,
1807
- "gpt4_scores": 0.59,
1808
- "step": 990
1809
- },
1810
- {
1811
- "epoch": 18.0,
1812
- "std": 0.13224976370489286,
1813
- "step": 990
1814
- },
1815
- {
1816
- "epoch": 18.0,
1817
- "eval_loss": 3.537071466445923,
1818
- "eval_runtime": 4.9404,
1819
- "eval_samples_per_second": 4.656,
1820
- "eval_steps_per_second": 1.214,
1821
- "step": 990
1822
- },
1823
- {
1824
- "epoch": 18.04,
1825
- "learning_rate": 1.4538909090118846e-05,
1826
- "loss": 0.0424,
1827
- "step": 992
1828
- },
1829
- {
1830
- "epoch": 18.11,
1831
- "learning_rate": 1.3491511964480702e-05,
1832
- "loss": 0.0406,
1833
- "step": 996
1834
- },
1835
- {
1836
- "epoch": 18.18,
1837
- "learning_rate": 1.2482220564763668e-05,
1838
- "loss": 0.0404,
1839
- "step": 1000
1840
- },
1841
- {
1842
- "epoch": 18.25,
1843
- "learning_rate": 1.1511197505771842e-05,
1844
- "loss": 0.039,
1845
- "step": 1004
1846
- },
1847
- {
1848
- "epoch": 18.33,
1849
- "learning_rate": 1.0578599236598707e-05,
1850
- "loss": 0.0398,
1851
- "step": 1008
1852
- },
1853
- {
1854
- "epoch": 18.4,
1855
- "learning_rate": 9.684576015420277e-06,
1856
- "loss": 0.0347,
1857
- "step": 1012
1858
- },
1859
- {
1860
- "epoch": 18.47,
1861
- "learning_rate": 8.829271885286095e-06,
1862
- "loss": 0.0386,
1863
- "step": 1016
1864
- },
1865
- {
1866
- "epoch": 18.55,
1867
- "learning_rate": 8.012824650910938e-06,
1868
- "loss": 0.034,
1869
- "step": 1020
1870
- },
1871
- {
1872
- "epoch": 18.62,
1873
- "learning_rate": 7.235365856472442e-06,
1874
- "loss": 0.0349,
1875
- "step": 1024
1876
- },
1877
- {
1878
- "epoch": 18.69,
1879
- "learning_rate": 6.497020764416634e-06,
1880
- "loss": 0.0376,
1881
- "step": 1028
1882
- },
1883
- {
1884
- "epoch": 18.76,
1885
- "learning_rate": 5.797908335276214e-06,
1886
- "loss": 0.0422,
1887
- "step": 1032
1888
- },
1889
- {
1890
- "epoch": 18.84,
1891
- "learning_rate": 5.1381412085036995e-06,
1892
- "loss": 0.0404,
1893
- "step": 1036
1894
- },
1895
- {
1896
- "epoch": 18.91,
1897
- "learning_rate": 4.517825684323323e-06,
1898
- "loss": 0.0356,
1899
- "step": 1040
1900
- },
1901
- {
1902
- "epoch": 18.98,
1903
- "learning_rate": 3.937061706604072e-06,
1904
- "loss": 0.0468,
1905
- "step": 1044
1906
- },
1907
- {
1908
- "epoch": 19.0,
1909
- "gpt4_scores": 0.575,
1910
- "step": 1045
1911
- },
1912
- {
1913
- "epoch": 19.0,
1914
- "std": 0.12791598805466028,
1915
- "step": 1045
1916
- },
1917
- {
1918
- "epoch": 19.0,
1919
- "eval_loss": 3.5391297340393066,
1920
- "eval_runtime": 4.9485,
1921
- "eval_samples_per_second": 4.648,
1922
- "eval_steps_per_second": 1.212,
1923
- "step": 1045
1924
- },
1925
- {
1926
- "epoch": 19.05,
1927
- "learning_rate": 3.3959428467570664e-06,
1928
- "loss": 0.0362,
1929
- "step": 1048
1930
- },
1931
- {
1932
- "epoch": 19.13,
1933
- "learning_rate": 2.8945562886593944e-06,
1934
- "loss": 0.0408,
1935
- "step": 1052
1936
- },
1937
- {
1938
- "epoch": 19.2,
1939
- "learning_rate": 2.4329828146074094e-06,
1940
- "loss": 0.0391,
1941
- "step": 1056
1942
- },
1943
- {
1944
- "epoch": 19.27,
1945
- "learning_rate": 2.011296792301165e-06,
1946
- "loss": 0.0376,
1947
- "step": 1060
1948
- },
1949
- {
1950
- "epoch": 19.35,
1951
- "learning_rate": 1.6295661628624448e-06,
1952
- "loss": 0.0446,
1953
- "step": 1064
1954
- },
1955
- {
1956
- "epoch": 19.42,
1957
- "learning_rate": 1.2878524298882698e-06,
1958
- "loss": 0.0332,
1959
- "step": 1068
1960
- },
1961
- {
1962
- "epoch": 19.49,
1963
- "learning_rate": 9.862106495415469e-07,
1964
- "loss": 0.0375,
1965
- "step": 1072
1966
- },
1967
- {
1968
- "epoch": 19.56,
1969
- "learning_rate": 7.246894216806354e-07,
1970
- "loss": 0.0451,
1971
- "step": 1076
1972
- },
1973
- {
1974
- "epoch": 19.64,
1975
- "learning_rate": 5.033308820289185e-07,
1976
- "loss": 0.0451,
1977
- "step": 1080
1978
- },
1979
- {
1980
- "epoch": 19.71,
1981
- "learning_rate": 3.221706953860093e-07,
1982
- "loss": 0.0354,
1983
- "step": 1084
1984
- },
1985
- {
1986
- "epoch": 19.78,
1987
- "learning_rate": 1.8123804988159908e-07,
1988
- "loss": 0.0397,
1989
- "step": 1088
1990
- },
1991
- {
1992
- "epoch": 19.85,
1993
- "learning_rate": 8.0555652272718e-08,
1994
- "loss": 0.0394,
1995
- "step": 1092
1996
- },
1997
- {
1998
- "epoch": 19.93,
1999
- "learning_rate": 2.0139724285161975e-08,
2000
- "loss": 0.0321,
2001
- "step": 1096
2002
- },
2003
- {
2004
- "epoch": 20.0,
2005
- "learning_rate": 0.0,
2006
- "loss": 0.0358,
2007
- "step": 1100
2008
- },
2009
- {
2010
- "epoch": 20.0,
2011
- "step": 1100,
2012
- "total_flos": 3.792163606268314e+16,
2013
- "train_loss": 0.0,
2014
- "train_runtime": 11.9759,
2015
- "train_samples_per_second": 181.197,
2016
- "train_steps_per_second": 45.925
2017
  }
2018
  ],
2019
  "logging_steps": 4,
@@ -2021,7 +977,7 @@
2021
  "num_input_tokens_seen": 0,
2022
  "num_train_epochs": 10,
2023
  "save_steps": 55,
2024
- "total_flos": 3.792163606268314e+16,
2025
  "train_batch_size": 4,
2026
  "trial_name": null,
2027
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 10.0,
5
  "eval_steps": 500,
6
+ "global_step": 550,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.02,
13
+ "learning_rate": 9.090909090909091e-06,
14
+ "loss": 2.7431,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 0.07,
19
+ "learning_rate": 3.6363636363636364e-05,
20
+ "loss": 2.8776,
21
  "step": 4
22
  },
23
  {
24
  "epoch": 0.15,
25
+ "learning_rate": 7.272727272727273e-05,
26
+ "loss": 2.634,
27
  "step": 8
28
  },
29
  {
30
  "epoch": 0.22,
31
+ "learning_rate": 0.00010909090909090909,
32
+ "loss": 2.5152,
33
  "step": 12
34
  },
35
  {
36
  "epoch": 0.29,
37
+ "learning_rate": 0.00014545454545454546,
38
+ "loss": 2.3073,
39
  "step": 16
40
  },
41
  {
42
  "epoch": 0.36,
43
+ "learning_rate": 0.00018181818181818183,
44
+ "loss": 2.2473,
45
  "step": 20
46
  },
47
  {
48
  "epoch": 0.44,
49
+ "learning_rate": 0.00021818181818181818,
50
+ "loss": 2.1606,
51
  "step": 24
52
  },
53
  {
54
  "epoch": 0.51,
55
+ "learning_rate": 0.0002545454545454545,
56
+ "loss": 2.1845,
57
  "step": 28
58
  },
59
  {
60
  "epoch": 0.58,
61
+ "learning_rate": 0.0002909090909090909,
62
+ "loss": 2.0583,
63
  "step": 32
64
  },
65
  {
66
  "epoch": 0.65,
67
+ "learning_rate": 0.00032727272727272726,
68
+ "loss": 2.0335,
69
  "step": 36
70
  },
71
  {
72
  "epoch": 0.73,
73
+ "learning_rate": 0.00036363636363636367,
74
+ "loss": 1.9237,
75
  "step": 40
76
  },
77
  {
78
  "epoch": 0.8,
79
+ "learning_rate": 0.0004,
80
+ "loss": 1.9701,
81
  "step": 44
82
  },
83
  {
84
  "epoch": 0.87,
85
+ "learning_rate": 0.00043636363636363637,
86
+ "loss": 1.9571,
87
  "step": 48
88
  },
89
  {
90
  "epoch": 0.95,
91
+ "learning_rate": 0.0004727272727272727,
92
+ "loss": 2.0252,
93
  "step": 52
94
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  {
96
  "epoch": 1.02,
97
+ "learning_rate": 0.0004999949650182266,
98
+ "loss": 2.0972,
99
  "step": 56
100
  },
101
  {
102
  "epoch": 1.09,
103
+ "learning_rate": 0.0004998741355957963,
104
+ "loss": 1.8658,
105
  "step": 60
106
  },
107
  {
108
  "epoch": 1.16,
109
+ "learning_rate": 0.0004995922759815339,
110
+ "loss": 1.8353,
111
  "step": 64
112
  },
113
  {
114
  "epoch": 1.24,
115
+ "learning_rate": 0.0004991495678185201,
116
+ "loss": 1.7303,
117
  "step": 68
118
  },
119
  {
120
  "epoch": 1.31,
121
+ "learning_rate": 0.0004985462964079136,
122
+ "loss": 1.6807,
123
  "step": 72
124
  },
125
  {
126
  "epoch": 1.38,
127
+ "learning_rate": 0.0004977828505250904,
128
+ "loss": 1.56,
129
  "step": 76
130
  },
131
  {
132
  "epoch": 1.45,
133
+ "learning_rate": 0.0004968597221690986,
134
+ "loss": 1.5809,
135
  "step": 80
136
  },
137
  {
138
  "epoch": 1.53,
139
+ "learning_rate": 0.0004957775062455933,
140
+ "loss": 1.7125,
141
  "step": 84
142
  },
143
  {
144
  "epoch": 1.6,
145
+ "learning_rate": 0.0004945369001834514,
146
+ "loss": 1.6883,
147
  "step": 88
148
  },
149
  {
150
  "epoch": 1.67,
151
+ "learning_rate": 0.0004931387034853173,
152
+ "loss": 1.6341,
153
  "step": 92
154
  },
155
  {
156
  "epoch": 1.75,
157
+ "learning_rate": 0.0004915838172123671,
158
+ "loss": 1.7006,
159
  "step": 96
160
  },
161
  {
162
  "epoch": 1.82,
163
+ "learning_rate": 0.0004898732434036243,
164
+ "loss": 1.7037,
165
  "step": 100
166
  },
167
  {
168
  "epoch": 1.89,
169
+ "learning_rate": 0.0004880080844302004,
170
+ "loss": 1.5447,
171
  "step": 104
172
  },
173
  {
174
  "epoch": 1.96,
175
+ "learning_rate": 0.0004859895422848767,
176
+ "loss": 1.5916,
177
  "step": 108
178
  },
179
  {
180
  "epoch": 2.0,
181
+ "pls_score": 62.4,
182
+ "std": 3.9855739862659676,
 
 
 
 
183
  "step": 110
184
  },
185
  {
186
  "epoch": 2.0,
187
+ "eval_loss": 2.0381879806518555,
188
+ "eval_runtime": 4.958,
189
+ "eval_samples_per_second": 4.639,
190
+ "eval_steps_per_second": 1.21,
191
  "step": 110
192
  },
193
  {
194
  "epoch": 2.04,
195
+ "learning_rate": 0.00048381891780748665,
196
+ "loss": 1.5033,
197
  "step": 112
198
  },
199
  {
200
  "epoch": 2.11,
201
+ "learning_rate": 0.0004814976098465951,
202
+ "loss": 0.9438,
203
  "step": 116
204
  },
205
  {
206
  "epoch": 2.18,
207
+ "learning_rate": 0.0004790271143580174,
208
+ "loss": 0.9552,
209
  "step": 120
210
  },
211
  {
212
  "epoch": 2.25,
213
+ "learning_rate": 0.0004764090234407577,
214
+ "loss": 0.8901,
215
  "step": 124
216
  },
217
  {
218
  "epoch": 2.33,
219
+ "learning_rate": 0.0004736450243109884,
220
+ "loss": 0.8692,
221
  "step": 128
222
  },
223
  {
224
  "epoch": 2.4,
225
+ "learning_rate": 0.00047073689821473173,
226
+ "loss": 0.8675,
227
  "step": 132
228
  },
229
  {
230
  "epoch": 2.47,
231
+ "learning_rate": 0.00046768651927994433,
232
+ "loss": 0.8246,
233
  "step": 136
234
  },
235
  {
236
  "epoch": 2.55,
237
+ "learning_rate": 0.0004644958533087443,
238
+ "loss": 0.813,
239
  "step": 140
240
  },
241
  {
242
  "epoch": 2.62,
243
+ "learning_rate": 0.0004611669565105596,
244
+ "loss": 0.9542,
245
  "step": 144
246
  },
247
  {
248
  "epoch": 2.69,
249
+ "learning_rate": 0.00045770197417701366,
250
+ "loss": 0.8225,
251
  "step": 148
252
  },
253
  {
254
  "epoch": 2.76,
255
+ "learning_rate": 0.00045410313929940244,
256
+ "loss": 0.8872,
257
  "step": 152
258
  },
259
  {
260
  "epoch": 2.84,
261
+ "learning_rate": 0.00045037277112965383,
262
+ "loss": 0.9624,
263
  "step": 156
264
  },
265
  {
266
  "epoch": 2.91,
267
+ "learning_rate": 0.0004465132736856969,
268
+ "loss": 1.0054,
269
  "step": 160
270
  },
271
  {
272
  "epoch": 2.98,
273
+ "learning_rate": 0.00044252713420220394,
274
+ "loss": 0.9956,
275
  "step": 164
276
  },
277
  {
278
  "epoch": 3.0,
279
+ "pls_score": 70.0,
280
+ "std": 3.54400902933387,
 
 
 
 
281
  "step": 165
282
  },
283
  {
284
  "epoch": 3.0,
285
+ "eval_loss": 2.4358930587768555,
286
+ "eval_runtime": 4.9575,
287
+ "eval_samples_per_second": 4.639,
288
+ "eval_steps_per_second": 1.21,
289
  "step": 165
290
  },
291
  {
292
  "epoch": 3.05,
293
+ "learning_rate": 0.00043841692152770415,
294
+ "loss": 0.5089,
295
  "step": 168
296
  },
297
  {
298
  "epoch": 3.13,
299
+ "learning_rate": 0.00043418528446910123,
300
+ "loss": 0.4589,
301
  "step": 172
302
  },
303
  {
304
  "epoch": 3.2,
305
+ "learning_rate": 0.0004298349500846628,
306
+ "loss": 0.4433,
307
  "step": 176
308
  },
309
  {
310
  "epoch": 3.27,
311
+ "learning_rate": 0.00042536872192658034,
312
+ "loss": 0.4208,
313
  "step": 180
314
  },
315
  {
316
  "epoch": 3.35,
317
+ "learning_rate": 0.00042078947823423365,
318
+ "loss": 0.4314,
319
  "step": 184
320
  },
321
  {
322
  "epoch": 3.42,
323
+ "learning_rate": 0.0004161001700793231,
324
+ "loss": 0.4506,
325
  "step": 188
326
  },
327
  {
328
  "epoch": 3.49,
329
+ "learning_rate": 0.00041130381946406574,
330
+ "loss": 0.4087,
331
  "step": 192
332
  },
333
  {
334
  "epoch": 3.56,
335
+ "learning_rate": 0.0004064035173736804,
336
+ "loss": 0.4357,
337
  "step": 196
338
  },
339
  {
340
  "epoch": 3.64,
341
+ "learning_rate": 0.00040140242178441667,
342
+ "loss": 0.4892,
343
  "step": 200
344
  },
345
  {
346
  "epoch": 3.71,
347
+ "learning_rate": 0.0003963037556284129,
348
+ "loss": 0.5003,
349
  "step": 204
350
  },
351
  {
352
  "epoch": 3.78,
353
+ "learning_rate": 0.0003911108047166924,
354
+ "loss": 0.4449,
355
  "step": 208
356
  },
357
  {
358
  "epoch": 3.85,
359
+ "learning_rate": 0.00038582691562163827,
360
+ "loss": 0.4754,
361
  "step": 212
362
  },
363
  {
364
  "epoch": 3.93,
365
+ "learning_rate": 0.0003804554935203115,
366
+ "loss": 0.4281,
367
  "step": 216
368
  },
369
  {
370
  "epoch": 4.0,
371
+ "learning_rate": 0.000375,
372
+ "loss": 0.5198,
 
 
 
 
 
373
  "step": 220
374
  },
375
  {
376
  "epoch": 4.0,
377
+ "pls_score": 64.2,
378
+ "std": 3.9404568263083397,
379
  "step": 220
380
  },
381
  {
382
  "epoch": 4.0,
383
+ "eval_loss": 2.9536352157592773,
384
+ "eval_runtime": 4.9448,
385
+ "eval_samples_per_second": 4.651,
386
+ "eval_steps_per_second": 1.213,
387
  "step": 220
388
  },
389
  {
390
  "epoch": 4.07,
391
+ "learning_rate": 0.0003694639508274158,
392
+ "loss": 0.2395,
393
  "step": 224
394
  },
395
  {
396
  "epoch": 4.15,
397
+ "learning_rate": 0.0003638509136829758,
398
+ "loss": 0.2221,
399
  "step": 228
400
  },
401
  {
402
  "epoch": 4.22,
403
+ "learning_rate": 0.00035816450586162706,
404
+ "loss": 0.2417,
405
  "step": 232
406
  },
407
  {
408
  "epoch": 4.29,
409
+ "learning_rate": 0.00035240839194169884,
410
+ "loss": 0.2447,
411
  "step": 236
412
  },
413
  {
414
  "epoch": 4.36,
415
+ "learning_rate": 0.00034658628142328216,
416
+ "loss": 0.2491,
417
  "step": 240
418
  },
419
  {
420
  "epoch": 4.44,
421
+ "learning_rate": 0.00034070192633766023,
422
+ "loss": 0.2284,
423
  "step": 244
424
  },
425
  {
426
  "epoch": 4.51,
427
+ "learning_rate": 0.0003347591188293301,
428
+ "loss": 0.229,
429
  "step": 248
430
  },
431
  {
432
  "epoch": 4.58,
433
+ "learning_rate": 0.00032876168871217323,
434
+ "loss": 0.2267,
435
  "step": 252
436
  },
437
  {
438
  "epoch": 4.65,
439
+ "learning_rate": 0.00032271350100134975,
440
+ "loss": 0.2265,
441
  "step": 256
442
  },
443
  {
444
  "epoch": 4.73,
445
+ "learning_rate": 0.0003166184534225087,
446
+ "loss": 0.2466,
447
  "step": 260
448
  },
449
  {
450
  "epoch": 4.8,
451
+ "learning_rate": 0.0003104804738999169,
452
+ "loss": 0.2335,
453
  "step": 264
454
  },
455
  {
456
  "epoch": 4.87,
457
+ "learning_rate": 0.00030430351802512693,
458
+ "loss": 0.3354,
459
  "step": 268
460
  },
461
  {
462
  "epoch": 4.95,
463
+ "learning_rate": 0.00029809156650781527,
464
+ "loss": 0.2296,
465
  "step": 272
466
  },
467
  {
468
  "epoch": 5.0,
469
+ "pls_score": 68.4,
470
+ "std": 4.0827441751841365,
 
 
 
 
471
  "step": 275
472
  },
473
  {
474
  "epoch": 5.0,
475
+ "eval_loss": 3.0199222564697266,
476
+ "eval_runtime": 4.9279,
477
+ "eval_samples_per_second": 4.667,
478
+ "eval_steps_per_second": 1.218,
479
  "step": 275
480
  },
481
  {
482
  "epoch": 5.02,
483
+ "learning_rate": 0.0002918486226104327,
484
+ "loss": 0.1705,
485
  "step": 276
486
  },
487
  {
488
  "epoch": 5.09,
489
+ "learning_rate": 0.00028557870956832135,
490
+ "loss": 0.1558,
491
  "step": 280
492
  },
493
  {
494
  "epoch": 5.16,
495
+ "learning_rate": 0.0002792858679969596,
496
+ "loss": 0.1551,
497
  "step": 284
498
  },
499
  {
500
  "epoch": 5.24,
501
+ "learning_rate": 0.0002729741532880069,
502
+ "loss": 0.2522,
503
  "step": 288
504
  },
505
  {
506
  "epoch": 5.31,
507
+ "learning_rate": 0.000266647632995826,
508
+ "loss": 0.1467,
509
  "step": 292
510
  },
511
  {
512
  "epoch": 5.38,
513
+ "learning_rate": 0.00026031038421616684,
514
+ "loss": 0.1359,
515
  "step": 296
516
  },
517
  {
518
  "epoch": 5.45,
519
+ "learning_rate": 0.000253966490958702,
520
+ "loss": 0.1236,
521
  "step": 300
522
  },
523
  {
524
  "epoch": 5.53,
525
+ "learning_rate": 0.00024762004151510585,
526
+ "loss": 0.1714,
527
  "step": 304
528
  },
529
  {
530
  "epoch": 5.6,
531
+ "learning_rate": 0.00024127512582437484,
532
+ "loss": 0.1462,
533
  "step": 308
534
  },
535
  {
536
  "epoch": 5.67,
537
+ "learning_rate": 0.00023493583283708543,
538
+ "loss": 0.1472,
539
  "step": 312
540
  },
541
  {
542
  "epoch": 5.75,
543
+ "learning_rate": 0.00022860624788029015,
544
+ "loss": 0.1314,
545
  "step": 316
546
  },
547
  {
548
  "epoch": 5.82,
549
+ "learning_rate": 0.00022229045002474727,
550
+ "loss": 0.1533,
551
  "step": 320
552
  },
553
  {
554
  "epoch": 5.89,
555
+ "learning_rate": 0.000215992509456184,
556
+ "loss": 0.1323,
557
  "step": 324
558
  },
559
  {
560
  "epoch": 5.96,
561
+ "learning_rate": 0.000209716484852284,
562
+ "loss": 0.1444,
563
  "step": 328
564
  },
565
  {
566
  "epoch": 6.0,
567
+ "pls_score": 66.8,
568
+ "std": 3.514996443810435,
 
 
 
 
569
  "step": 330
570
  },
571
  {
572
  "epoch": 6.0,
573
+ "eval_loss": 3.218996047973633,
574
+ "eval_runtime": 4.9525,
575
+ "eval_samples_per_second": 4.644,
576
+ "eval_steps_per_second": 1.212,
577
  "step": 330
578
  },
579
  {
580
  "epoch": 6.04,
581
+ "learning_rate": 0.0002034664207670925,
582
+ "loss": 0.1093,
583
  "step": 332
584
  },
585
  {
586
  "epoch": 6.11,
587
+ "learning_rate": 0.0001972463450245226,
588
+ "loss": 0.1101,
589
  "step": 336
590
  },
591
  {
592
  "epoch": 6.18,
593
+ "learning_rate": 0.00019106026612264316,
594
+ "loss": 0.1124,
595
  "step": 340
596
  },
597
  {
598
  "epoch": 6.25,
599
+ "learning_rate": 0.00018491217065042198,
600
+ "loss": 0.1056,
601
  "step": 344
602
  },
603
  {
604
  "epoch": 6.33,
605
+ "learning_rate": 0.00017880602071858692,
606
+ "loss": 0.1114,
607
  "step": 348
608
  },
609
  {
610
  "epoch": 6.4,
611
+ "learning_rate": 0.00017274575140626317,
612
+ "loss": 0.1157,
613
  "step": 352
614
  },
615
  {
616
  "epoch": 6.47,
617
+ "learning_rate": 0.00016673526822502983,
618
+ "loss": 0.116,
619
  "step": 356
620
  },
621
  {
622
  "epoch": 6.55,
623
+ "learning_rate": 0.00016077844460203207,
624
+ "loss": 0.1073,
625
  "step": 360
626
  },
627
  {
628
  "epoch": 6.62,
629
+ "learning_rate": 0.00015487911938376925,
630
+ "loss": 0.1925,
631
  "step": 364
632
  },
633
  {
634
  "epoch": 6.69,
635
+ "learning_rate": 0.00014904109436216883,
636
+ "loss": 0.097,
637
  "step": 368
638
  },
639
  {
640
  "epoch": 6.76,
641
+ "learning_rate": 0.00014326813182453956,
642
+ "loss": 0.1229,
643
  "step": 372
644
  },
645
  {
646
  "epoch": 6.84,
647
+ "learning_rate": 0.0001375639521289836,
648
+ "loss": 0.1071,
649
  "step": 376
650
  },
651
  {
652
  "epoch": 6.91,
653
+ "learning_rate": 0.00013193223130682935,
654
+ "loss": 0.1207,
655
  "step": 380
656
  },
657
  {
658
  "epoch": 6.98,
659
+ "learning_rate": 0.00012637659869363084,
660
+ "loss": 0.1129,
661
  "step": 384
662
  },
663
  {
664
  "epoch": 7.0,
665
+ "pls_score": 62.0,
666
+ "std": 4.280186911806539,
 
 
 
 
667
  "step": 385
668
  },
669
  {
670
  "epoch": 7.0,
671
+ "eval_loss": 3.3570775985717773,
672
+ "eval_runtime": 4.9469,
673
+ "eval_samples_per_second": 4.649,
674
+ "eval_steps_per_second": 1.213,
675
  "step": 385
676
  },
677
  {
678
  "epoch": 7.05,
679
+ "learning_rate": 0.00012090063459025954,
680
+ "loss": 0.103,
681
  "step": 388
682
  },
683
  {
684
  "epoch": 7.13,
685
+ "learning_rate": 0.0001155078679555969,
686
+ "loss": 0.0853,
687
  "step": 392
688
  },
689
  {
690
  "epoch": 7.2,
691
+ "learning_rate": 0.00011020177413231333,
692
+ "loss": 0.0998,
693
  "step": 396
694
  },
695
  {
696
  "epoch": 7.27,
697
+ "learning_rate": 0.00010498577260720049,
698
+ "loss": 0.096,
699
  "step": 400
700
  },
701
  {
702
  "epoch": 7.35,
703
+ "learning_rate": 9.986322480749927e-05,
704
+ "loss": 0.1175,
705
  "step": 404
706
  },
707
  {
708
  "epoch": 7.42,
709
+ "learning_rate": 9.483743193464408e-05,
710
+ "loss": 0.1062,
711
  "step": 408
712
  },
713
  {
714
  "epoch": 7.49,
715
+ "learning_rate": 8.991163283681945e-05,
716
+ "loss": 0.0943,
717
  "step": 412
718
  },
719
  {
720
  "epoch": 7.56,
721
+ "learning_rate": 8.508900192169963e-05,
722
+ "loss": 0.1006,
723
  "step": 416
724
  },
725
  {
726
  "epoch": 7.64,
727
+ "learning_rate": 8.037264711071699e-05,
728
+ "loss": 0.1041,
729
  "step": 420
730
  },
731
  {
732
  "epoch": 7.71,
733
+ "learning_rate": 7.576560783617667e-05,
734
+ "loss": 0.2169,
735
  "step": 424
736
  },
737
  {
738
  "epoch": 7.78,
739
+ "learning_rate": 7.127085308250913e-05,
740
+ "loss": 0.1022,
741
  "step": 428
742
  },
743
  {
744
  "epoch": 7.85,
745
+ "learning_rate": 6.689127947292231e-05,
746
+ "loss": 0.0953,
747
  "step": 432
748
  },
749
  {
750
  "epoch": 7.93,
751
+ "learning_rate": 6.262970940268654e-05,
752
+ "loss": 0.0976,
753
  "step": 436
754
  },
755
  {
756
  "epoch": 8.0,
757
+ "learning_rate": 5.848888922025553e-05,
758
+ "loss": 0.1048,
 
 
 
 
 
759
  "step": 440
760
  },
761
  {
762
  "epoch": 8.0,
763
+ "pls_score": 53.6,
764
+ "std": 4.017561449436709,
765
  "step": 440
766
  },
767
  {
768
  "epoch": 8.0,
769
+ "eval_loss": 3.45528507232666,
770
+ "eval_runtime": 4.9545,
771
+ "eval_samples_per_second": 4.642,
772
+ "eval_steps_per_second": 1.211,
773
  "step": 440
774
  },
775
  {
776
  "epoch": 8.07,
777
+ "learning_rate": 5.4471487457395216e-05,
778
+ "loss": 0.1036,
779
  "step": 444
780
  },
781
  {
782
  "epoch": 8.15,
783
+ "learning_rate": 5.058009310946118e-05,
784
+ "loss": 0.0995,
785
  "step": 448
786
  },
787
  {
788
  "epoch": 8.22,
789
+ "learning_rate": 4.6817213966933034e-05,
790
+ "loss": 0.0894,
791
  "step": 452
792
  },
793
  {
794
  "epoch": 8.29,
795
+ "learning_rate": 4.318527499928074e-05,
796
+ "loss": 0.0955,
797
  "step": 456
798
  },
799
  {
800
  "epoch": 8.36,
801
+ "learning_rate": 3.968661679220467e-05,
802
+ "loss": 0.0997,
803
  "step": 460
804
  },
805
  {
806
  "epoch": 8.44,
807
+ "learning_rate": 3.632349403925664e-05,
808
+ "loss": 0.0976,
809
  "step": 464
810
  },
811
  {
812
  "epoch": 8.51,
813
+ "learning_rate": 3.309807408881269e-05,
814
+ "loss": 0.0895,
815
  "step": 468
816
  },
817
  {
818
  "epoch": 8.58,
819
+ "learning_rate": 3.0012435547336736e-05,
820
+ "loss": 0.0904,
821
  "step": 472
822
  },
823
  {
824
  "epoch": 8.65,
825
+ "learning_rate": 2.7068566939831645e-05,
826
+ "loss": 0.0892,
827
  "step": 476
828
  },
829
  {
830
  "epoch": 8.73,
831
+ "learning_rate": 2.4268365428344735e-05,
832
+ "loss": 0.106,
833
  "step": 480
834
  },
835
  {
836
  "epoch": 8.8,
837
+ "learning_rate": 2.1613635589349755e-05,
838
+ "loss": 0.1842,
839
  "step": 484
840
  },
841
  {
842
  "epoch": 8.87,
843
+ "learning_rate": 1.9106088250797264e-05,
844
+ "loss": 0.0916,
845
  "step": 488
846
  },
847
  {
848
  "epoch": 8.95,
849
+ "learning_rate": 1.674733938957873e-05,
850
+ "loss": 0.1008,
851
  "step": 492
852
  },
853
  {
854
  "epoch": 9.0,
855
+ "pls_score": 65.4,
856
+ "std": 4.071461654000931,
 
 
 
 
857
  "step": 495
858
  },
859
  {
860
  "epoch": 9.0,
861
+ "eval_loss": 3.483499050140381,
862
+ "eval_runtime": 4.9616,
863
+ "eval_samples_per_second": 4.636,
864
+ "eval_steps_per_second": 1.209,
865
  "step": 495
866
  },
867
  {
868
  "epoch": 9.02,
869
+ "learning_rate": 1.4538909090118846e-05,
870
+ "loss": 0.0936,
871
  "step": 496
872
  },
873
  {
874
  "epoch": 9.09,
875
+ "learning_rate": 1.2482220564763668e-05,
876
+ "loss": 0.0864,
877
  "step": 500
878
  },
879
  {
880
  "epoch": 9.16,
881
+ "learning_rate": 1.0578599236598707e-05,
882
+ "loss": 0.0891,
883
  "step": 504
884
  },
885
  {
886
  "epoch": 9.24,
887
+ "learning_rate": 8.829271885286095e-06,
888
+ "loss": 0.0893,
889
  "step": 508
890
  },
891
  {
892
  "epoch": 9.31,
893
+ "learning_rate": 7.235365856472442e-06,
894
+ "loss": 0.1026,
895
  "step": 512
896
  },
897
  {
898
  "epoch": 9.38,
899
+ "learning_rate": 5.797908335276214e-06,
900
+ "loss": 0.1098,
901
  "step": 516
902
  },
903
  {
904
  "epoch": 9.45,
905
+ "learning_rate": 4.517825684323323e-06,
906
+ "loss": 0.1009,
907
  "step": 520
908
  },
909
  {
910
  "epoch": 9.53,
911
+ "learning_rate": 3.3959428467570664e-06,
912
+ "loss": 0.1049,
913
  "step": 524
914
  },
915
  {
916
  "epoch": 9.6,
917
+ "learning_rate": 2.4329828146074094e-06,
918
+ "loss": 0.0826,
919
  "step": 528
920
  },
921
  {
922
  "epoch": 9.67,
923
+ "learning_rate": 1.6295661628624448e-06,
924
+ "loss": 0.0856,
925
  "step": 532
926
  },
927
  {
928
  "epoch": 9.75,
929
+ "learning_rate": 9.862106495415469e-07,
930
+ "loss": 0.1868,
931
  "step": 536
932
  },
933
  {
934
  "epoch": 9.82,
935
+ "learning_rate": 5.033308820289185e-07,
936
+ "loss": 0.1001,
937
  "step": 540
938
  },
939
  {
940
  "epoch": 9.89,
941
+ "learning_rate": 1.8123804988159908e-07,
942
+ "loss": 0.1168,
943
  "step": 544
944
  },
945
  {
946
  "epoch": 9.96,
947
+ "learning_rate": 2.0139724285161975e-08,
948
+ "loss": 0.0938,
949
  "step": 548
950
  },
951
  {
952
  "epoch": 10.0,
953
+ "pls_score": 61.6,
954
+ "std": 4.375934185976749,
955
  "step": 550
956
  },
957
  {
958
  "epoch": 10.0,
959
+ "eval_loss": 3.4885787963867188,
960
+ "eval_runtime": 4.9332,
961
+ "eval_samples_per_second": 4.662,
962
+ "eval_steps_per_second": 1.216,
963
  "step": 550
964
  },
965
  {
966
  "epoch": 10.0,
967
+ "step": 550,
968
+ "total_flos": 1.876652342808576e+16,
969
+ "train_loss": 0.3880799013376236,
970
+ "train_runtime": 22779.6065,
971
+ "train_samples_per_second": 0.095,
972
+ "train_steps_per_second": 0.024
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
973
  }
974
  ],
975
  "logging_steps": 4,
 
977
  "num_input_tokens_seen": 0,
978
  "num_train_epochs": 10,
979
  "save_steps": 55,
980
+ "total_flos": 1.876652342808576e+16,
981
  "train_batch_size": 4,
982
  "trial_name": null,
983
  "trial_params": null