yyx123 committed on
Commit
2543586
1 Parent(s): 06d0ef6

Model save

Browse files
Files changed (5) hide show
  1. README.md +10 -15
  2. all_results.json +8 -8
  3. eval_results.json +4 -4
  4. train_results.json +4 -4
  5. trainer_state.json +183 -197
README.md CHANGED
@@ -2,13 +2,9 @@
2
  license: other
3
  library_name: peft
4
  tags:
5
- - alignment-handbook
6
- - generated_from_trainer
7
  - trl
8
  - sft
9
  - generated_from_trainer
10
- datasets:
11
- - ruozhiba
12
  base_model: 01-ai/Yi-6B
13
  model-index:
14
  - name: Yi-6B-ruozhiba-5e-4-50
@@ -20,9 +16,9 @@ should probably proofread and complete it, then remove this comment. -->
20
 
21
  # Yi-6B-ruozhiba-5e-4-50
22
 
23
- This model is a fine-tuned version of [01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B) on the ruozhiba dataset.
24
  It achieves the following results on the evaluation set:
25
- - Loss: 3.4886
26
 
27
  ## Model description
28
 
@@ -54,15 +50,14 @@ The following hyperparameters were used during training:
54
 
55
  | Training Loss | Epoch | Step | Validation Loss |
56
  |:-------------:|:-----:|:----:|:---------------:|
57
- | 1.5916 | 2.0 | 110 | 2.0382 |
58
- | 0.9956 | 3.0 | 165 | 2.4359 |
59
- | 0.5198 | 4.0 | 220 | 2.9536 |
60
- | 0.2296 | 5.0 | 275 | 3.0199 |
61
- | 0.1444 | 6.0 | 330 | 3.2190 |
62
- | 0.1129 | 7.0 | 385 | 3.3571 |
63
- | 0.1048 | 8.0 | 440 | 3.4553 |
64
- | 0.1008 | 9.0 | 495 | 3.4835 |
65
- | 0.0938 | 10.0 | 550 | 3.4886 |
66
 
67
 
68
  ### Framework versions
 
2
  license: other
3
  library_name: peft
4
  tags:
 
 
5
  - trl
6
  - sft
7
  - generated_from_trainer
 
 
8
  base_model: 01-ai/Yi-6B
9
  model-index:
10
  - name: Yi-6B-ruozhiba-5e-4-50
 
16
 
17
  # Yi-6B-ruozhiba-5e-4-50
18
 
19
+ This model is a fine-tuned version of [01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B) on the ruozhiba dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 3.4887
22
 
23
  ## Model description
24
 
 
50
 
51
  | Training Loss | Epoch | Step | Validation Loss |
52
  |:-------------:|:-----:|:----:|:---------------:|
53
+ | 1.5872 | 2.0 | 110 | 2.0369 |
54
+ | 0.9686 | 3.0 | 165 | 2.4604 |
55
+ | 0.534 | 4.0 | 220 | 2.9133 |
56
+ | 0.2198 | 5.0 | 275 | 3.0297 |
57
+ | 0.1399 | 6.0 | 330 | 3.1912 |
58
+ | 0.1165 | 7.0 | 385 | 3.3320 |
59
+ | 0.0972 | 9.0 | 495 | 3.4838 |
60
+ | 0.1639 | 10.0 | 550 | 3.4887 |
 
61
 
62
 
63
  ### Framework versions
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 10.0,
3
- "eval_loss": 3.4885787963867188,
4
- "eval_runtime": 4.9367,
5
  "eval_samples": 23,
6
- "eval_samples_per_second": 4.659,
7
- "eval_steps_per_second": 1.215,
8
- "train_loss": 0.3880799013376236,
9
- "train_runtime": 22779.6065,
10
  "train_samples": 217,
11
- "train_samples_per_second": 0.095,
12
- "train_steps_per_second": 0.024
13
  }
 
1
  {
2
  "epoch": 10.0,
3
+ "eval_loss": 3.4886670112609863,
4
+ "eval_runtime": 4.9035,
5
  "eval_samples": 23,
6
+ "eval_samples_per_second": 4.691,
7
+ "eval_steps_per_second": 1.224,
8
+ "train_loss": 0.020198198123411698,
9
+ "train_runtime": 4992.1656,
10
  "train_samples": 217,
11
+ "train_samples_per_second": 0.435,
12
+ "train_steps_per_second": 0.11
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 10.0,
3
- "eval_loss": 3.4885787963867188,
4
- "eval_runtime": 4.9367,
5
  "eval_samples": 23,
6
- "eval_samples_per_second": 4.659,
7
- "eval_steps_per_second": 1.215
8
  }
 
1
  {
2
  "epoch": 10.0,
3
+ "eval_loss": 3.4886670112609863,
4
+ "eval_runtime": 4.9035,
5
  "eval_samples": 23,
6
+ "eval_samples_per_second": 4.691,
7
+ "eval_steps_per_second": 1.224
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 10.0,
3
- "train_loss": 0.3880799013376236,
4
- "train_runtime": 22779.6065,
5
  "train_samples": 217,
6
- "train_samples_per_second": 0.095,
7
- "train_steps_per_second": 0.024
8
  }
 
1
  {
2
  "epoch": 10.0,
3
+ "train_loss": 0.020198198123411698,
4
+ "train_runtime": 4992.1656,
5
  "train_samples": 217,
6
+ "train_samples_per_second": 0.435,
7
+ "train_steps_per_second": 0.11
8
  }
trainer_state.json CHANGED
@@ -47,7 +47,7 @@
47
  {
48
  "epoch": 0.44,
49
  "learning_rate": 0.00021818181818181818,
50
- "loss": 2.1606,
51
  "step": 24
52
  },
53
  {
@@ -59,7 +59,7 @@
59
  {
60
  "epoch": 0.58,
61
  "learning_rate": 0.0002909090909090909,
62
- "loss": 2.0583,
63
  "step": 32
64
  },
65
  {
@@ -71,607 +71,607 @@
71
  {
72
  "epoch": 0.73,
73
  "learning_rate": 0.00036363636363636367,
74
- "loss": 1.9237,
75
  "step": 40
76
  },
77
  {
78
  "epoch": 0.8,
79
  "learning_rate": 0.0004,
80
- "loss": 1.9701,
81
  "step": 44
82
  },
83
  {
84
  "epoch": 0.87,
85
  "learning_rate": 0.00043636363636363637,
86
- "loss": 1.9571,
87
  "step": 48
88
  },
89
  {
90
  "epoch": 0.95,
91
  "learning_rate": 0.0004727272727272727,
92
- "loss": 2.0252,
93
  "step": 52
94
  },
95
  {
96
  "epoch": 1.02,
97
  "learning_rate": 0.0004999949650182266,
98
- "loss": 2.0972,
99
  "step": 56
100
  },
101
  {
102
  "epoch": 1.09,
103
  "learning_rate": 0.0004998741355957963,
104
- "loss": 1.8658,
105
  "step": 60
106
  },
107
  {
108
  "epoch": 1.16,
109
  "learning_rate": 0.0004995922759815339,
110
- "loss": 1.8353,
111
  "step": 64
112
  },
113
  {
114
  "epoch": 1.24,
115
  "learning_rate": 0.0004991495678185201,
116
- "loss": 1.7303,
117
  "step": 68
118
  },
119
  {
120
  "epoch": 1.31,
121
  "learning_rate": 0.0004985462964079136,
122
- "loss": 1.6807,
123
  "step": 72
124
  },
125
  {
126
  "epoch": 1.38,
127
  "learning_rate": 0.0004977828505250904,
128
- "loss": 1.56,
129
  "step": 76
130
  },
131
  {
132
  "epoch": 1.45,
133
  "learning_rate": 0.0004968597221690986,
134
- "loss": 1.5809,
135
  "step": 80
136
  },
137
  {
138
  "epoch": 1.53,
139
  "learning_rate": 0.0004957775062455933,
140
- "loss": 1.7125,
141
  "step": 84
142
  },
143
  {
144
  "epoch": 1.6,
145
  "learning_rate": 0.0004945369001834514,
146
- "loss": 1.6883,
147
  "step": 88
148
  },
149
  {
150
  "epoch": 1.67,
151
  "learning_rate": 0.0004931387034853173,
152
- "loss": 1.6341,
153
  "step": 92
154
  },
155
  {
156
  "epoch": 1.75,
157
  "learning_rate": 0.0004915838172123671,
158
- "loss": 1.7006,
159
  "step": 96
160
  },
161
  {
162
  "epoch": 1.82,
163
  "learning_rate": 0.0004898732434036243,
164
- "loss": 1.7037,
165
  "step": 100
166
  },
167
  {
168
  "epoch": 1.89,
169
  "learning_rate": 0.0004880080844302004,
170
- "loss": 1.5447,
171
  "step": 104
172
  },
173
  {
174
  "epoch": 1.96,
175
  "learning_rate": 0.0004859895422848767,
176
- "loss": 1.5916,
177
  "step": 108
178
  },
179
  {
180
  "epoch": 2.0,
181
- "pls_score": 62.4,
182
- "std": 3.9855739862659676,
183
  "step": 110
184
  },
185
  {
186
  "epoch": 2.0,
187
- "eval_loss": 2.0381879806518555,
188
- "eval_runtime": 4.958,
189
- "eval_samples_per_second": 4.639,
190
- "eval_steps_per_second": 1.21,
191
  "step": 110
192
  },
193
  {
194
  "epoch": 2.04,
195
  "learning_rate": 0.00048381891780748665,
196
- "loss": 1.5033,
197
  "step": 112
198
  },
199
  {
200
  "epoch": 2.11,
201
  "learning_rate": 0.0004814976098465951,
202
- "loss": 0.9438,
203
  "step": 116
204
  },
205
  {
206
  "epoch": 2.18,
207
  "learning_rate": 0.0004790271143580174,
208
- "loss": 0.9552,
209
  "step": 120
210
  },
211
  {
212
  "epoch": 2.25,
213
  "learning_rate": 0.0004764090234407577,
214
- "loss": 0.8901,
215
  "step": 124
216
  },
217
  {
218
  "epoch": 2.33,
219
  "learning_rate": 0.0004736450243109884,
220
- "loss": 0.8692,
221
  "step": 128
222
  },
223
  {
224
  "epoch": 2.4,
225
  "learning_rate": 0.00047073689821473173,
226
- "loss": 0.8675,
227
  "step": 132
228
  },
229
  {
230
  "epoch": 2.47,
231
  "learning_rate": 0.00046768651927994433,
232
- "loss": 0.8246,
233
  "step": 136
234
  },
235
  {
236
  "epoch": 2.55,
237
  "learning_rate": 0.0004644958533087443,
238
- "loss": 0.813,
239
  "step": 140
240
  },
241
  {
242
  "epoch": 2.62,
243
  "learning_rate": 0.0004611669565105596,
244
- "loss": 0.9542,
245
  "step": 144
246
  },
247
  {
248
  "epoch": 2.69,
249
  "learning_rate": 0.00045770197417701366,
250
- "loss": 0.8225,
251
  "step": 148
252
  },
253
  {
254
  "epoch": 2.76,
255
  "learning_rate": 0.00045410313929940244,
256
- "loss": 0.8872,
257
  "step": 152
258
  },
259
  {
260
  "epoch": 2.84,
261
  "learning_rate": 0.00045037277112965383,
262
- "loss": 0.9624,
263
  "step": 156
264
  },
265
  {
266
  "epoch": 2.91,
267
  "learning_rate": 0.0004465132736856969,
268
- "loss": 1.0054,
269
  "step": 160
270
  },
271
  {
272
  "epoch": 2.98,
273
  "learning_rate": 0.00044252713420220394,
274
- "loss": 0.9956,
275
  "step": 164
276
  },
277
  {
278
  "epoch": 3.0,
279
- "pls_score": 70.0,
280
- "std": 3.54400902933387,
281
  "step": 165
282
  },
283
  {
284
  "epoch": 3.0,
285
- "eval_loss": 2.4358930587768555,
286
- "eval_runtime": 4.9575,
287
- "eval_samples_per_second": 4.639,
288
- "eval_steps_per_second": 1.21,
289
  "step": 165
290
  },
291
  {
292
  "epoch": 3.05,
293
  "learning_rate": 0.00043841692152770415,
294
- "loss": 0.5089,
295
  "step": 168
296
  },
297
  {
298
  "epoch": 3.13,
299
  "learning_rate": 0.00043418528446910123,
300
- "loss": 0.4589,
301
  "step": 172
302
  },
303
  {
304
  "epoch": 3.2,
305
  "learning_rate": 0.0004298349500846628,
306
- "loss": 0.4433,
307
  "step": 176
308
  },
309
  {
310
  "epoch": 3.27,
311
  "learning_rate": 0.00042536872192658034,
312
- "loss": 0.4208,
313
  "step": 180
314
  },
315
  {
316
  "epoch": 3.35,
317
  "learning_rate": 0.00042078947823423365,
318
- "loss": 0.4314,
319
  "step": 184
320
  },
321
  {
322
  "epoch": 3.42,
323
  "learning_rate": 0.0004161001700793231,
324
- "loss": 0.4506,
325
  "step": 188
326
  },
327
  {
328
  "epoch": 3.49,
329
  "learning_rate": 0.00041130381946406574,
330
- "loss": 0.4087,
331
  "step": 192
332
  },
333
  {
334
  "epoch": 3.56,
335
  "learning_rate": 0.0004064035173736804,
336
- "loss": 0.4357,
337
  "step": 196
338
  },
339
  {
340
  "epoch": 3.64,
341
  "learning_rate": 0.00040140242178441667,
342
- "loss": 0.4892,
343
  "step": 200
344
  },
345
  {
346
  "epoch": 3.71,
347
  "learning_rate": 0.0003963037556284129,
348
- "loss": 0.5003,
349
  "step": 204
350
  },
351
  {
352
  "epoch": 3.78,
353
  "learning_rate": 0.0003911108047166924,
354
- "loss": 0.4449,
355
  "step": 208
356
  },
357
  {
358
  "epoch": 3.85,
359
  "learning_rate": 0.00038582691562163827,
360
- "loss": 0.4754,
361
  "step": 212
362
  },
363
  {
364
  "epoch": 3.93,
365
  "learning_rate": 0.0003804554935203115,
366
- "loss": 0.4281,
367
  "step": 216
368
  },
369
  {
370
  "epoch": 4.0,
371
  "learning_rate": 0.000375,
372
- "loss": 0.5198,
373
  "step": 220
374
  },
375
  {
376
  "epoch": 4.0,
377
- "pls_score": 64.2,
378
- "std": 3.9404568263083397,
379
  "step": 220
380
  },
381
  {
382
  "epoch": 4.0,
383
- "eval_loss": 2.9536352157592773,
384
- "eval_runtime": 4.9448,
385
- "eval_samples_per_second": 4.651,
386
- "eval_steps_per_second": 1.213,
387
  "step": 220
388
  },
389
  {
390
  "epoch": 4.07,
391
  "learning_rate": 0.0003694639508274158,
392
- "loss": 0.2395,
393
  "step": 224
394
  },
395
  {
396
  "epoch": 4.15,
397
  "learning_rate": 0.0003638509136829758,
398
- "loss": 0.2221,
399
  "step": 228
400
  },
401
  {
402
  "epoch": 4.22,
403
  "learning_rate": 0.00035816450586162706,
404
- "loss": 0.2417,
405
  "step": 232
406
  },
407
  {
408
  "epoch": 4.29,
409
  "learning_rate": 0.00035240839194169884,
410
- "loss": 0.2447,
411
  "step": 236
412
  },
413
  {
414
  "epoch": 4.36,
415
  "learning_rate": 0.00034658628142328216,
416
- "loss": 0.2491,
417
  "step": 240
418
  },
419
  {
420
  "epoch": 4.44,
421
  "learning_rate": 0.00034070192633766023,
422
- "loss": 0.2284,
423
  "step": 244
424
  },
425
  {
426
  "epoch": 4.51,
427
  "learning_rate": 0.0003347591188293301,
428
- "loss": 0.229,
429
  "step": 248
430
  },
431
  {
432
  "epoch": 4.58,
433
  "learning_rate": 0.00032876168871217323,
434
- "loss": 0.2267,
435
  "step": 252
436
  },
437
  {
438
  "epoch": 4.65,
439
  "learning_rate": 0.00032271350100134975,
440
- "loss": 0.2265,
441
  "step": 256
442
  },
443
  {
444
  "epoch": 4.73,
445
  "learning_rate": 0.0003166184534225087,
446
- "loss": 0.2466,
447
  "step": 260
448
  },
449
  {
450
  "epoch": 4.8,
451
  "learning_rate": 0.0003104804738999169,
452
- "loss": 0.2335,
453
  "step": 264
454
  },
455
  {
456
  "epoch": 4.87,
457
  "learning_rate": 0.00030430351802512693,
458
- "loss": 0.3354,
459
  "step": 268
460
  },
461
  {
462
  "epoch": 4.95,
463
  "learning_rate": 0.00029809156650781527,
464
- "loss": 0.2296,
465
  "step": 272
466
  },
467
  {
468
  "epoch": 5.0,
469
- "pls_score": 68.4,
470
- "std": 4.0827441751841365,
471
  "step": 275
472
  },
473
  {
474
  "epoch": 5.0,
475
- "eval_loss": 3.0199222564697266,
476
- "eval_runtime": 4.9279,
477
- "eval_samples_per_second": 4.667,
478
- "eval_steps_per_second": 1.218,
479
  "step": 275
480
  },
481
  {
482
  "epoch": 5.02,
483
  "learning_rate": 0.0002918486226104327,
484
- "loss": 0.1705,
485
  "step": 276
486
  },
487
  {
488
  "epoch": 5.09,
489
  "learning_rate": 0.00028557870956832135,
490
- "loss": 0.1558,
491
  "step": 280
492
  },
493
  {
494
  "epoch": 5.16,
495
  "learning_rate": 0.0002792858679969596,
496
- "loss": 0.1551,
497
  "step": 284
498
  },
499
  {
500
  "epoch": 5.24,
501
  "learning_rate": 0.0002729741532880069,
502
- "loss": 0.2522,
503
  "step": 288
504
  },
505
  {
506
  "epoch": 5.31,
507
  "learning_rate": 0.000266647632995826,
508
- "loss": 0.1467,
509
  "step": 292
510
  },
511
  {
512
  "epoch": 5.38,
513
  "learning_rate": 0.00026031038421616684,
514
- "loss": 0.1359,
515
  "step": 296
516
  },
517
  {
518
  "epoch": 5.45,
519
  "learning_rate": 0.000253966490958702,
520
- "loss": 0.1236,
521
  "step": 300
522
  },
523
  {
524
  "epoch": 5.53,
525
  "learning_rate": 0.00024762004151510585,
526
- "loss": 0.1714,
527
  "step": 304
528
  },
529
  {
530
  "epoch": 5.6,
531
  "learning_rate": 0.00024127512582437484,
532
- "loss": 0.1462,
533
  "step": 308
534
  },
535
  {
536
  "epoch": 5.67,
537
  "learning_rate": 0.00023493583283708543,
538
- "loss": 0.1472,
539
  "step": 312
540
  },
541
  {
542
  "epoch": 5.75,
543
  "learning_rate": 0.00022860624788029015,
544
- "loss": 0.1314,
545
  "step": 316
546
  },
547
  {
548
  "epoch": 5.82,
549
  "learning_rate": 0.00022229045002474727,
550
- "loss": 0.1533,
551
  "step": 320
552
  },
553
  {
554
  "epoch": 5.89,
555
  "learning_rate": 0.000215992509456184,
556
- "loss": 0.1323,
557
  "step": 324
558
  },
559
  {
560
  "epoch": 5.96,
561
  "learning_rate": 0.000209716484852284,
562
- "loss": 0.1444,
563
  "step": 328
564
  },
565
  {
566
  "epoch": 6.0,
567
- "pls_score": 66.8,
568
- "std": 3.514996443810435,
569
  "step": 330
570
  },
571
  {
572
  "epoch": 6.0,
573
- "eval_loss": 3.218996047973633,
574
- "eval_runtime": 4.9525,
575
- "eval_samples_per_second": 4.644,
576
- "eval_steps_per_second": 1.212,
577
  "step": 330
578
  },
579
  {
580
  "epoch": 6.04,
581
  "learning_rate": 0.0002034664207670925,
582
- "loss": 0.1093,
583
  "step": 332
584
  },
585
  {
586
  "epoch": 6.11,
587
  "learning_rate": 0.0001972463450245226,
588
- "loss": 0.1101,
589
  "step": 336
590
  },
591
  {
592
  "epoch": 6.18,
593
  "learning_rate": 0.00019106026612264316,
594
- "loss": 0.1124,
595
  "step": 340
596
  },
597
  {
598
  "epoch": 6.25,
599
  "learning_rate": 0.00018491217065042198,
600
- "loss": 0.1056,
601
  "step": 344
602
  },
603
  {
604
  "epoch": 6.33,
605
  "learning_rate": 0.00017880602071858692,
606
- "loss": 0.1114,
607
  "step": 348
608
  },
609
  {
610
  "epoch": 6.4,
611
  "learning_rate": 0.00017274575140626317,
612
- "loss": 0.1157,
613
  "step": 352
614
  },
615
  {
616
  "epoch": 6.47,
617
  "learning_rate": 0.00016673526822502983,
618
- "loss": 0.116,
619
  "step": 356
620
  },
621
  {
622
  "epoch": 6.55,
623
  "learning_rate": 0.00016077844460203207,
624
- "loss": 0.1073,
625
  "step": 360
626
  },
627
  {
628
  "epoch": 6.62,
629
  "learning_rate": 0.00015487911938376925,
630
- "loss": 0.1925,
631
  "step": 364
632
  },
633
  {
634
  "epoch": 6.69,
635
  "learning_rate": 0.00014904109436216883,
636
- "loss": 0.097,
637
  "step": 368
638
  },
639
  {
640
  "epoch": 6.76,
641
  "learning_rate": 0.00014326813182453956,
642
- "loss": 0.1229,
643
  "step": 372
644
  },
645
  {
646
  "epoch": 6.84,
647
  "learning_rate": 0.0001375639521289836,
648
- "loss": 0.1071,
649
  "step": 376
650
  },
651
  {
652
  "epoch": 6.91,
653
  "learning_rate": 0.00013193223130682935,
654
- "loss": 0.1207,
655
  "step": 380
656
  },
657
  {
658
  "epoch": 6.98,
659
  "learning_rate": 0.00012637659869363084,
660
- "loss": 0.1129,
661
  "step": 384
662
  },
663
  {
664
  "epoch": 7.0,
665
- "pls_score": 62.0,
666
- "std": 4.280186911806539,
667
  "step": 385
668
  },
669
  {
670
  "epoch": 7.0,
671
- "eval_loss": 3.3570775985717773,
672
- "eval_runtime": 4.9469,
673
- "eval_samples_per_second": 4.649,
674
- "eval_steps_per_second": 1.213,
675
  "step": 385
676
  },
677
  {
@@ -683,293 +683,279 @@
683
  {
684
  "epoch": 7.13,
685
  "learning_rate": 0.0001155078679555969,
686
- "loss": 0.0853,
687
  "step": 392
688
  },
689
  {
690
  "epoch": 7.2,
691
  "learning_rate": 0.00011020177413231333,
692
- "loss": 0.0998,
693
  "step": 396
694
  },
695
  {
696
  "epoch": 7.27,
697
  "learning_rate": 0.00010498577260720049,
698
- "loss": 0.096,
699
  "step": 400
700
  },
701
  {
702
  "epoch": 7.35,
703
  "learning_rate": 9.986322480749927e-05,
704
- "loss": 0.1175,
705
  "step": 404
706
  },
707
  {
708
  "epoch": 7.42,
709
  "learning_rate": 9.483743193464408e-05,
710
- "loss": 0.1062,
711
  "step": 408
712
  },
713
  {
714
  "epoch": 7.49,
715
  "learning_rate": 8.991163283681945e-05,
716
- "loss": 0.0943,
717
  "step": 412
718
  },
719
  {
720
  "epoch": 7.56,
721
  "learning_rate": 8.508900192169963e-05,
722
- "loss": 0.1006,
723
  "step": 416
724
  },
725
  {
726
  "epoch": 7.64,
727
  "learning_rate": 8.037264711071699e-05,
728
- "loss": 0.1041,
729
  "step": 420
730
  },
731
  {
732
  "epoch": 7.71,
733
  "learning_rate": 7.576560783617667e-05,
734
- "loss": 0.2169,
735
  "step": 424
736
  },
737
  {
738
  "epoch": 7.78,
739
  "learning_rate": 7.127085308250913e-05,
740
- "loss": 0.1022,
741
  "step": 428
742
  },
743
  {
744
  "epoch": 7.85,
745
  "learning_rate": 6.689127947292231e-05,
746
- "loss": 0.0953,
747
  "step": 432
748
  },
749
  {
750
  "epoch": 7.93,
751
  "learning_rate": 6.262970940268654e-05,
752
- "loss": 0.0976,
753
  "step": 436
754
  },
755
  {
756
  "epoch": 8.0,
757
  "learning_rate": 5.848888922025553e-05,
758
- "loss": 0.1048,
759
- "step": 440
760
- },
761
- {
762
- "epoch": 8.0,
763
- "pls_score": 53.6,
764
- "std": 4.017561449436709,
765
- "step": 440
766
- },
767
- {
768
- "epoch": 8.0,
769
- "eval_loss": 3.45528507232666,
770
- "eval_runtime": 4.9545,
771
- "eval_samples_per_second": 4.642,
772
- "eval_steps_per_second": 1.211,
773
  "step": 440
774
  },
775
  {
776
  "epoch": 8.07,
777
  "learning_rate": 5.4471487457395216e-05,
778
- "loss": 0.1036,
779
  "step": 444
780
  },
781
  {
782
  "epoch": 8.15,
783
  "learning_rate": 5.058009310946118e-05,
784
- "loss": 0.0995,
785
  "step": 448
786
  },
787
  {
788
  "epoch": 8.22,
789
  "learning_rate": 4.6817213966933034e-05,
790
- "loss": 0.0894,
791
  "step": 452
792
  },
793
  {
794
  "epoch": 8.29,
795
  "learning_rate": 4.318527499928074e-05,
796
- "loss": 0.0955,
797
  "step": 456
798
  },
799
  {
800
  "epoch": 8.36,
801
  "learning_rate": 3.968661679220467e-05,
802
- "loss": 0.0997,
803
  "step": 460
804
  },
805
  {
806
  "epoch": 8.44,
807
  "learning_rate": 3.632349403925664e-05,
808
- "loss": 0.0976,
809
  "step": 464
810
  },
811
  {
812
  "epoch": 8.51,
813
  "learning_rate": 3.309807408881269e-05,
814
- "loss": 0.0895,
815
  "step": 468
816
  },
817
  {
818
  "epoch": 8.58,
819
  "learning_rate": 3.0012435547336736e-05,
820
- "loss": 0.0904,
821
  "step": 472
822
  },
823
  {
824
  "epoch": 8.65,
825
  "learning_rate": 2.7068566939831645e-05,
826
- "loss": 0.0892,
827
  "step": 476
828
  },
829
  {
830
  "epoch": 8.73,
831
  "learning_rate": 2.4268365428344735e-05,
832
- "loss": 0.106,
833
  "step": 480
834
  },
835
  {
836
  "epoch": 8.8,
837
  "learning_rate": 2.1613635589349755e-05,
838
- "loss": 0.1842,
839
  "step": 484
840
  },
841
  {
842
  "epoch": 8.87,
843
  "learning_rate": 1.9106088250797264e-05,
844
- "loss": 0.0916,
845
  "step": 488
846
  },
847
  {
848
  "epoch": 8.95,
849
  "learning_rate": 1.674733938957873e-05,
850
- "loss": 0.1008,
851
  "step": 492
852
  },
853
  {
854
  "epoch": 9.0,
855
- "pls_score": 65.4,
856
- "std": 4.071461654000931,
857
  "step": 495
858
  },
859
  {
860
  "epoch": 9.0,
861
- "eval_loss": 3.483499050140381,
862
- "eval_runtime": 4.9616,
863
- "eval_samples_per_second": 4.636,
864
- "eval_steps_per_second": 1.209,
865
  "step": 495
866
  },
867
  {
868
  "epoch": 9.02,
869
  "learning_rate": 1.4538909090118846e-05,
870
- "loss": 0.0936,
871
  "step": 496
872
  },
873
  {
874
  "epoch": 9.09,
875
  "learning_rate": 1.2482220564763668e-05,
876
- "loss": 0.0864,
877
  "step": 500
878
  },
879
  {
880
  "epoch": 9.16,
881
  "learning_rate": 1.0578599236598707e-05,
882
- "loss": 0.0891,
883
  "step": 504
884
  },
885
  {
886
  "epoch": 9.24,
887
  "learning_rate": 8.829271885286095e-06,
888
- "loss": 0.0893,
889
  "step": 508
890
  },
891
  {
892
  "epoch": 9.31,
893
  "learning_rate": 7.235365856472442e-06,
894
- "loss": 0.1026,
895
  "step": 512
896
  },
897
  {
898
  "epoch": 9.38,
899
  "learning_rate": 5.797908335276214e-06,
900
- "loss": 0.1098,
901
  "step": 516
902
  },
903
  {
904
  "epoch": 9.45,
905
  "learning_rate": 4.517825684323323e-06,
906
- "loss": 0.1009,
907
  "step": 520
908
  },
909
  {
910
  "epoch": 9.53,
911
  "learning_rate": 3.3959428467570664e-06,
912
- "loss": 0.1049,
913
  "step": 524
914
  },
915
  {
916
  "epoch": 9.6,
917
  "learning_rate": 2.4329828146074094e-06,
918
- "loss": 0.0826,
919
  "step": 528
920
  },
921
  {
922
  "epoch": 9.67,
923
  "learning_rate": 1.6295661628624448e-06,
924
- "loss": 0.0856,
925
  "step": 532
926
  },
927
  {
928
  "epoch": 9.75,
929
  "learning_rate": 9.862106495415469e-07,
930
- "loss": 0.1868,
931
  "step": 536
932
  },
933
  {
934
  "epoch": 9.82,
935
  "learning_rate": 5.033308820289185e-07,
936
- "loss": 0.1001,
937
  "step": 540
938
  },
939
  {
940
  "epoch": 9.89,
941
  "learning_rate": 1.8123804988159908e-07,
942
- "loss": 0.1168,
943
  "step": 544
944
  },
945
  {
946
  "epoch": 9.96,
947
  "learning_rate": 2.0139724285161975e-08,
948
- "loss": 0.0938,
949
  "step": 548
950
  },
951
  {
952
  "epoch": 10.0,
953
- "pls_score": 61.6,
954
- "std": 4.375934185976749,
955
  "step": 550
956
  },
957
  {
958
  "epoch": 10.0,
959
- "eval_loss": 3.4885787963867188,
960
- "eval_runtime": 4.9332,
961
- "eval_samples_per_second": 4.662,
962
- "eval_steps_per_second": 1.216,
963
  "step": 550
964
  },
965
  {
966
  "epoch": 10.0,
967
  "step": 550,
968
- "total_flos": 1.876652342808576e+16,
969
- "train_loss": 0.3880799013376236,
970
- "train_runtime": 22779.6065,
971
- "train_samples_per_second": 0.095,
972
- "train_steps_per_second": 0.024
973
  }
974
  ],
975
  "logging_steps": 4,
@@ -977,7 +963,7 @@
977
  "num_input_tokens_seen": 0,
978
  "num_train_epochs": 10,
979
  "save_steps": 55,
980
- "total_flos": 1.876652342808576e+16,
981
  "train_batch_size": 4,
982
  "trial_name": null,
983
  "trial_params": null
 
47
  {
48
  "epoch": 0.44,
49
  "learning_rate": 0.00021818181818181818,
50
+ "loss": 2.1607,
51
  "step": 24
52
  },
53
  {
 
59
  {
60
  "epoch": 0.58,
61
  "learning_rate": 0.0002909090909090909,
62
+ "loss": 2.0582,
63
  "step": 32
64
  },
65
  {
 
71
  {
72
  "epoch": 0.73,
73
  "learning_rate": 0.00036363636363636367,
74
+ "loss": 1.9232,
75
  "step": 40
76
  },
77
  {
78
  "epoch": 0.8,
79
  "learning_rate": 0.0004,
80
+ "loss": 1.97,
81
  "step": 44
82
  },
83
  {
84
  "epoch": 0.87,
85
  "learning_rate": 0.00043636363636363637,
86
+ "loss": 1.9574,
87
  "step": 48
88
  },
89
  {
90
  "epoch": 0.95,
91
  "learning_rate": 0.0004727272727272727,
92
+ "loss": 2.0267,
93
  "step": 52
94
  },
95
  {
96
  "epoch": 1.02,
97
  "learning_rate": 0.0004999949650182266,
98
+ "loss": 1.9734,
99
  "step": 56
100
  },
101
  {
102
  "epoch": 1.09,
103
  "learning_rate": 0.0004998741355957963,
104
+ "loss": 1.7896,
105
  "step": 60
106
  },
107
  {
108
  "epoch": 1.16,
109
  "learning_rate": 0.0004995922759815339,
110
+ "loss": 1.7852,
111
  "step": 64
112
  },
113
  {
114
  "epoch": 1.24,
115
  "learning_rate": 0.0004991495678185201,
116
+ "loss": 1.7058,
117
  "step": 68
118
  },
119
  {
120
  "epoch": 1.31,
121
  "learning_rate": 0.0004985462964079136,
122
+ "loss": 1.676,
123
  "step": 72
124
  },
125
  {
126
  "epoch": 1.38,
127
  "learning_rate": 0.0004977828505250904,
128
+ "loss": 1.5585,
129
  "step": 76
130
  },
131
  {
132
  "epoch": 1.45,
133
  "learning_rate": 0.0004968597221690986,
134
+ "loss": 1.5856,
135
  "step": 80
136
  },
137
  {
138
  "epoch": 1.53,
139
  "learning_rate": 0.0004957775062455933,
140
+ "loss": 1.7152,
141
  "step": 84
142
  },
143
  {
144
  "epoch": 1.6,
145
  "learning_rate": 0.0004945369001834514,
146
+ "loss": 1.6895,
147
  "step": 88
148
  },
149
  {
150
  "epoch": 1.67,
151
  "learning_rate": 0.0004931387034853173,
152
+ "loss": 1.6401,
153
  "step": 92
154
  },
155
  {
156
  "epoch": 1.75,
157
  "learning_rate": 0.0004915838172123671,
158
+ "loss": 1.7005,
159
  "step": 96
160
  },
161
  {
162
  "epoch": 1.82,
163
  "learning_rate": 0.0004898732434036243,
164
+ "loss": 1.7131,
165
  "step": 100
166
  },
167
  {
168
  "epoch": 1.89,
169
  "learning_rate": 0.0004880080844302004,
170
+ "loss": 1.5485,
171
  "step": 104
172
  },
173
  {
174
  "epoch": 1.96,
175
  "learning_rate": 0.0004859895422848767,
176
+ "loss": 1.5872,
177
  "step": 108
178
  },
179
  {
180
  "epoch": 2.0,
181
+ "pls_score": 64.4,
182
+ "std": 3.971498457761252,
183
  "step": 110
184
  },
185
  {
186
  "epoch": 2.0,
187
+ "eval_loss": 2.036914110183716,
188
+ "eval_runtime": 4.9069,
189
+ "eval_samples_per_second": 4.687,
190
+ "eval_steps_per_second": 1.223,
191
  "step": 110
192
  },
193
  {
194
  "epoch": 2.04,
195
  "learning_rate": 0.00048381891780748665,
196
+ "loss": 1.4997,
197
  "step": 112
198
  },
199
  {
200
  "epoch": 2.11,
201
  "learning_rate": 0.0004814976098465951,
202
+ "loss": 0.9384,
203
  "step": 116
204
  },
205
  {
206
  "epoch": 2.18,
207
  "learning_rate": 0.0004790271143580174,
208
+ "loss": 0.9659,
209
  "step": 120
210
  },
211
  {
212
  "epoch": 2.25,
213
  "learning_rate": 0.0004764090234407577,
214
+ "loss": 0.8644,
215
  "step": 124
216
  },
217
  {
218
  "epoch": 2.33,
219
  "learning_rate": 0.0004736450243109884,
220
+ "loss": 0.8526,
221
  "step": 128
222
  },
223
  {
224
  "epoch": 2.4,
225
  "learning_rate": 0.00047073689821473173,
226
+ "loss": 0.8541,
227
  "step": 132
228
  },
229
  {
230
  "epoch": 2.47,
231
  "learning_rate": 0.00046768651927994433,
232
+ "loss": 0.7894,
233
  "step": 136
234
  },
235
  {
236
  "epoch": 2.55,
237
  "learning_rate": 0.0004644958533087443,
238
+ "loss": 0.7917,
239
  "step": 140
240
  },
241
  {
242
  "epoch": 2.62,
243
  "learning_rate": 0.0004611669565105596,
244
+ "loss": 0.9484,
245
  "step": 144
246
  },
247
  {
248
  "epoch": 2.69,
249
  "learning_rate": 0.00045770197417701366,
250
+ "loss": 0.8227,
251
  "step": 148
252
  },
253
  {
254
  "epoch": 2.76,
255
  "learning_rate": 0.00045410313929940244,
256
+ "loss": 0.8766,
257
  "step": 152
258
  },
259
  {
260
  "epoch": 2.84,
261
  "learning_rate": 0.00045037277112965383,
262
+ "loss": 0.961,
263
  "step": 156
264
  },
265
  {
266
  "epoch": 2.91,
267
  "learning_rate": 0.0004465132736856969,
268
+ "loss": 1.0136,
269
  "step": 160
270
  },
271
  {
272
  "epoch": 2.98,
273
  "learning_rate": 0.00044252713420220394,
274
+ "loss": 0.9686,
275
  "step": 164
276
  },
277
  {
278
  "epoch": 3.0,
279
+ "pls_score": 60.0,
280
+ "std": 4.0,
281
  "step": 165
282
  },
283
  {
284
  "epoch": 3.0,
285
+ "eval_loss": 2.4603536128997803,
286
+ "eval_runtime": 4.9107,
287
+ "eval_samples_per_second": 4.684,
288
+ "eval_steps_per_second": 1.222,
289
  "step": 165
290
  },
291
  {
292
  "epoch": 3.05,
293
  "learning_rate": 0.00043841692152770415,
294
+ "loss": 0.5066,
295
  "step": 168
296
  },
297
  {
298
  "epoch": 3.13,
299
  "learning_rate": 0.00043418528446910123,
300
+ "loss": 0.458,
301
  "step": 172
302
  },
303
  {
304
  "epoch": 3.2,
305
  "learning_rate": 0.0004298349500846628,
306
+ "loss": 0.4341,
307
  "step": 176
308
  },
309
  {
310
  "epoch": 3.27,
311
  "learning_rate": 0.00042536872192658034,
312
+ "loss": 0.4109,
313
  "step": 180
314
  },
315
  {
316
  "epoch": 3.35,
317
  "learning_rate": 0.00042078947823423365,
318
+ "loss": 0.4367,
319
  "step": 184
320
  },
321
  {
322
  "epoch": 3.42,
323
  "learning_rate": 0.0004161001700793231,
324
+ "loss": 0.4422,
325
  "step": 188
326
  },
327
  {
328
  "epoch": 3.49,
329
  "learning_rate": 0.00041130381946406574,
330
+ "loss": 0.4017,
331
  "step": 192
332
  },
333
  {
334
  "epoch": 3.56,
335
  "learning_rate": 0.0004064035173736804,
336
+ "loss": 0.4362,
337
  "step": 196
338
  },
339
  {
340
  "epoch": 3.64,
341
  "learning_rate": 0.00040140242178441667,
342
+ "loss": 0.4893,
343
  "step": 200
344
  },
345
  {
346
  "epoch": 3.71,
347
  "learning_rate": 0.0003963037556284129,
348
+ "loss": 0.4939,
349
  "step": 204
350
  },
351
  {
352
  "epoch": 3.78,
353
  "learning_rate": 0.0003911108047166924,
354
+ "loss": 0.4467,
355
  "step": 208
356
  },
357
  {
358
  "epoch": 3.85,
359
  "learning_rate": 0.00038582691562163827,
360
+ "loss": 0.4636,
361
  "step": 212
362
  },
363
  {
364
  "epoch": 3.93,
365
  "learning_rate": 0.0003804554935203115,
366
+ "loss": 0.4356,
367
  "step": 216
368
  },
369
  {
370
  "epoch": 4.0,
371
  "learning_rate": 0.000375,
372
+ "loss": 0.534,
373
  "step": 220
374
  },
375
  {
376
  "epoch": 4.0,
377
+ "pls_score": 62.96,
378
+ "std": 4.2869998833683205,
379
  "step": 220
380
  },
381
  {
382
  "epoch": 4.0,
383
+ "eval_loss": 2.9132673740386963,
384
+ "eval_runtime": 4.9065,
385
+ "eval_samples_per_second": 4.688,
386
+ "eval_steps_per_second": 1.223,
387
  "step": 220
388
  },
389
  {
390
  "epoch": 4.07,
391
  "learning_rate": 0.0003694639508274158,
392
+ "loss": 0.2425,
393
  "step": 224
394
  },
395
  {
396
  "epoch": 4.15,
397
  "learning_rate": 0.0003638509136829758,
398
+ "loss": 0.2283,
399
  "step": 228
400
  },
401
  {
402
  "epoch": 4.22,
403
  "learning_rate": 0.00035816450586162706,
404
+ "loss": 0.2518,
405
  "step": 232
406
  },
407
  {
408
  "epoch": 4.29,
409
  "learning_rate": 0.00035240839194169884,
410
+ "loss": 0.2514,
411
  "step": 236
412
  },
413
  {
414
  "epoch": 4.36,
415
  "learning_rate": 0.00034658628142328216,
416
+ "loss": 0.2357,
417
  "step": 240
418
  },
419
  {
420
  "epoch": 4.44,
421
  "learning_rate": 0.00034070192633766023,
422
+ "loss": 0.2235,
423
  "step": 244
424
  },
425
  {
426
  "epoch": 4.51,
427
  "learning_rate": 0.0003347591188293301,
428
+ "loss": 0.2363,
429
  "step": 248
430
  },
431
  {
432
  "epoch": 4.58,
433
  "learning_rate": 0.00032876168871217323,
434
+ "loss": 0.2212,
435
  "step": 252
436
  },
437
  {
438
  "epoch": 4.65,
439
  "learning_rate": 0.00032271350100134975,
440
+ "loss": 0.2362,
441
  "step": 256
442
  },
443
  {
444
  "epoch": 4.73,
445
  "learning_rate": 0.0003166184534225087,
446
+ "loss": 0.2576,
447
  "step": 260
448
  },
449
  {
450
  "epoch": 4.8,
451
  "learning_rate": 0.0003104804738999169,
452
+ "loss": 0.2257,
453
  "step": 264
454
  },
455
  {
456
  "epoch": 4.87,
457
  "learning_rate": 0.00030430351802512693,
458
+ "loss": 0.336,
459
  "step": 268
460
  },
461
  {
462
  "epoch": 4.95,
463
  "learning_rate": 0.00029809156650781527,
464
+ "loss": 0.2198,
465
  "step": 272
466
  },
467
  {
468
  "epoch": 5.0,
469
+ "pls_score": 54.4,
470
+ "std": 4.346584866305961,
471
  "step": 275
472
  },
473
  {
474
  "epoch": 5.0,
475
+ "eval_loss": 3.0296988487243652,
476
+ "eval_runtime": 4.907,
477
+ "eval_samples_per_second": 4.687,
478
+ "eval_steps_per_second": 1.223,
479
  "step": 275
480
  },
481
  {
482
  "epoch": 5.02,
483
  "learning_rate": 0.0002918486226104327,
484
+ "loss": 0.1702,
485
  "step": 276
486
  },
487
  {
488
  "epoch": 5.09,
489
  "learning_rate": 0.00028557870956832135,
490
+ "loss": 0.1767,
491
  "step": 280
492
  },
493
  {
494
  "epoch": 5.16,
495
  "learning_rate": 0.0002792858679969596,
496
+ "loss": 0.1589,
497
  "step": 284
498
  },
499
  {
500
  "epoch": 5.24,
501
  "learning_rate": 0.0002729741532880069,
502
+ "loss": 0.2729,
503
  "step": 288
504
  },
505
  {
506
  "epoch": 5.31,
507
  "learning_rate": 0.000266647632995826,
508
+ "loss": 0.1429,
509
  "step": 292
510
  },
511
  {
512
  "epoch": 5.38,
513
  "learning_rate": 0.00026031038421616684,
514
+ "loss": 0.1383,
515
  "step": 296
516
  },
517
  {
518
  "epoch": 5.45,
519
  "learning_rate": 0.000253966490958702,
520
+ "loss": 0.13,
521
  "step": 300
522
  },
523
  {
524
  "epoch": 5.53,
525
  "learning_rate": 0.00024762004151510585,
526
+ "loss": 0.1658,
527
  "step": 304
528
  },
529
  {
530
  "epoch": 5.6,
531
  "learning_rate": 0.00024127512582437484,
532
+ "loss": 0.1392,
533
  "step": 308
534
  },
535
  {
536
  "epoch": 5.67,
537
  "learning_rate": 0.00023493583283708543,
538
+ "loss": 0.1428,
539
  "step": 312
540
  },
541
  {
542
  "epoch": 5.75,
543
  "learning_rate": 0.00022860624788029015,
544
+ "loss": 0.1297,
545
  "step": 316
546
  },
547
  {
548
  "epoch": 5.82,
549
  "learning_rate": 0.00022229045002474727,
550
+ "loss": 0.1518,
551
  "step": 320
552
  },
553
  {
554
  "epoch": 5.89,
555
  "learning_rate": 0.000215992509456184,
556
+ "loss": 0.1403,
557
  "step": 324
558
  },
559
  {
560
  "epoch": 5.96,
561
  "learning_rate": 0.000209716484852284,
562
+ "loss": 0.1399,
563
  "step": 328
564
  },
565
  {
566
  "epoch": 6.0,
567
+ "pls_score": 65.2,
568
+ "std": 4.301069634404913,
569
  "step": 330
570
  },
571
  {
572
  "epoch": 6.0,
573
+ "eval_loss": 3.191229820251465,
574
+ "eval_runtime": 4.9103,
575
+ "eval_samples_per_second": 4.684,
576
+ "eval_steps_per_second": 1.222,
577
  "step": 330
578
  },
579
  {
580
  "epoch": 6.04,
581
  "learning_rate": 0.0002034664207670925,
582
+ "loss": 0.1071,
583
  "step": 332
584
  },
585
  {
586
  "epoch": 6.11,
587
  "learning_rate": 0.0001972463450245226,
588
+ "loss": 0.1114,
589
  "step": 336
590
  },
591
  {
592
  "epoch": 6.18,
593
  "learning_rate": 0.00019106026612264316,
594
+ "loss": 0.1183,
595
  "step": 340
596
  },
597
  {
598
  "epoch": 6.25,
599
  "learning_rate": 0.00018491217065042198,
600
+ "loss": 0.1043,
601
  "step": 344
602
  },
603
  {
604
  "epoch": 6.33,
605
  "learning_rate": 0.00017880602071858692,
606
+ "loss": 0.1192,
607
  "step": 348
608
  },
609
  {
610
  "epoch": 6.4,
611
  "learning_rate": 0.00017274575140626317,
612
+ "loss": 0.113,
613
  "step": 352
614
  },
615
  {
616
  "epoch": 6.47,
617
  "learning_rate": 0.00016673526822502983,
618
+ "loss": 0.1097,
619
  "step": 356
620
  },
621
  {
622
  "epoch": 6.55,
623
  "learning_rate": 0.00016077844460203207,
624
+ "loss": 0.1077,
625
  "step": 360
626
  },
627
  {
628
  "epoch": 6.62,
629
  "learning_rate": 0.00015487911938376925,
630
+ "loss": 0.1938,
631
  "step": 364
632
  },
633
  {
634
  "epoch": 6.69,
635
  "learning_rate": 0.00014904109436216883,
636
+ "loss": 0.0989,
637
  "step": 368
638
  },
639
  {
640
  "epoch": 6.76,
641
  "learning_rate": 0.00014326813182453956,
642
+ "loss": 0.1273,
643
  "step": 372
644
  },
645
  {
646
  "epoch": 6.84,
647
  "learning_rate": 0.0001375639521289836,
648
+ "loss": 0.1127,
649
  "step": 376
650
  },
651
  {
652
  "epoch": 6.91,
653
  "learning_rate": 0.00013193223130682935,
654
+ "loss": 0.105,
655
  "step": 380
656
  },
657
  {
658
  "epoch": 6.98,
659
  "learning_rate": 0.00012637659869363084,
660
+ "loss": 0.1165,
661
  "step": 384
662
  },
663
  {
664
  "epoch": 7.0,
665
+ "pls_score": 65.2,
666
+ "std": 3.657758876689387,
667
  "step": 385
668
  },
669
  {
670
  "epoch": 7.0,
671
+ "eval_loss": 3.3320090770721436,
672
+ "eval_runtime": 4.9071,
673
+ "eval_samples_per_second": 4.687,
674
+ "eval_steps_per_second": 1.223,
675
  "step": 385
676
  },
677
  {
 
683
  {
684
  "epoch": 7.13,
685
  "learning_rate": 0.0001155078679555969,
686
+ "loss": 0.0854,
687
  "step": 392
688
  },
689
  {
690
  "epoch": 7.2,
691
  "learning_rate": 0.00011020177413231333,
692
+ "loss": 0.1,
693
  "step": 396
694
  },
695
  {
696
  "epoch": 7.27,
697
  "learning_rate": 0.00010498577260720049,
698
+ "loss": 0.0961,
699
  "step": 400
700
  },
701
  {
702
  "epoch": 7.35,
703
  "learning_rate": 9.986322480749927e-05,
704
+ "loss": 0.1158,
705
  "step": 404
706
  },
707
  {
708
  "epoch": 7.42,
709
  "learning_rate": 9.483743193464408e-05,
710
+ "loss": 0.1063,
711
  "step": 408
712
  },
713
  {
714
  "epoch": 7.49,
715
  "learning_rate": 8.991163283681945e-05,
716
+ "loss": 0.0967,
717
  "step": 412
718
  },
719
  {
720
  "epoch": 7.56,
721
  "learning_rate": 8.508900192169963e-05,
722
+ "loss": 0.1014,
723
  "step": 416
724
  },
725
  {
726
  "epoch": 7.64,
727
  "learning_rate": 8.037264711071699e-05,
728
+ "loss": 0.1042,
729
  "step": 420
730
  },
731
  {
732
  "epoch": 7.71,
733
  "learning_rate": 7.576560783617667e-05,
734
+ "loss": 0.2074,
735
  "step": 424
736
  },
737
  {
738
  "epoch": 7.78,
739
  "learning_rate": 7.127085308250913e-05,
740
+ "loss": 0.1024,
741
  "step": 428
742
  },
743
  {
744
  "epoch": 7.85,
745
  "learning_rate": 6.689127947292231e-05,
746
+ "loss": 0.0967,
747
  "step": 432
748
  },
749
  {
750
  "epoch": 7.93,
751
  "learning_rate": 6.262970940268654e-05,
752
+ "loss": 0.0975,
753
  "step": 436
754
  },
755
  {
756
  "epoch": 8.0,
757
  "learning_rate": 5.848888922025553e-05,
758
+ "loss": 0.1057,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
759
  "step": 440
760
  },
761
  {
762
  "epoch": 8.07,
763
  "learning_rate": 5.4471487457395216e-05,
764
+ "loss": 0.0918,
765
  "step": 444
766
  },
767
  {
768
  "epoch": 8.15,
769
  "learning_rate": 5.058009310946118e-05,
770
+ "loss": 0.1813,
771
  "step": 448
772
  },
773
  {
774
  "epoch": 8.22,
775
  "learning_rate": 4.6817213966933034e-05,
776
+ "loss": 0.0943,
777
  "step": 452
778
  },
779
  {
780
  "epoch": 8.29,
781
  "learning_rate": 4.318527499928074e-05,
782
+ "loss": 0.09,
783
  "step": 456
784
  },
785
  {
786
  "epoch": 8.36,
787
  "learning_rate": 3.968661679220467e-05,
788
+ "loss": 0.0994,
789
  "step": 460
790
  },
791
  {
792
  "epoch": 8.44,
793
  "learning_rate": 3.632349403925664e-05,
794
+ "loss": 0.0961,
795
  "step": 464
796
  },
797
  {
798
  "epoch": 8.51,
799
  "learning_rate": 3.309807408881269e-05,
800
+ "loss": 0.0952,
801
  "step": 468
802
  },
803
  {
804
  "epoch": 8.58,
805
  "learning_rate": 3.0012435547336736e-05,
806
+ "loss": 0.0961,
807
  "step": 472
808
  },
809
  {
810
  "epoch": 8.65,
811
  "learning_rate": 2.7068566939831645e-05,
812
+ "loss": 0.092,
813
  "step": 476
814
  },
815
  {
816
  "epoch": 8.73,
817
  "learning_rate": 2.4268365428344735e-05,
818
+ "loss": 0.0947,
819
  "step": 480
820
  },
821
  {
822
  "epoch": 8.8,
823
  "learning_rate": 2.1613635589349755e-05,
824
+ "loss": 0.123,
825
  "step": 484
826
  },
827
  {
828
  "epoch": 8.87,
829
  "learning_rate": 1.9106088250797264e-05,
830
+ "loss": 0.0902,
831
  "step": 488
832
  },
833
  {
834
  "epoch": 8.95,
835
  "learning_rate": 1.674733938957873e-05,
836
+ "loss": 0.0972,
837
  "step": 492
838
  },
839
  {
840
  "epoch": 9.0,
841
+ "pls_score": 64.89795918367346,
842
+ "std": 4.062428792618158,
843
  "step": 495
844
  },
845
  {
846
  "epoch": 9.0,
847
+ "eval_loss": 3.483757972717285,
848
+ "eval_runtime": 4.9102,
849
+ "eval_samples_per_second": 4.684,
850
+ "eval_steps_per_second": 1.222,
851
  "step": 495
852
  },
853
  {
854
  "epoch": 9.02,
855
  "learning_rate": 1.4538909090118846e-05,
856
+ "loss": 0.0866,
857
  "step": 496
858
  },
859
  {
860
  "epoch": 9.09,
861
  "learning_rate": 1.2482220564763668e-05,
862
+ "loss": 0.1054,
863
  "step": 500
864
  },
865
  {
866
  "epoch": 9.16,
867
  "learning_rate": 1.0578599236598707e-05,
868
+ "loss": 0.1056,
869
  "step": 504
870
  },
871
  {
872
  "epoch": 9.24,
873
  "learning_rate": 8.829271885286095e-06,
874
+ "loss": 0.0989,
875
  "step": 508
876
  },
877
  {
878
  "epoch": 9.31,
879
  "learning_rate": 7.235365856472442e-06,
880
+ "loss": 0.0905,
881
  "step": 512
882
  },
883
  {
884
  "epoch": 9.38,
885
  "learning_rate": 5.797908335276214e-06,
886
+ "loss": 0.0913,
887
  "step": 516
888
  },
889
  {
890
  "epoch": 9.45,
891
  "learning_rate": 4.517825684323323e-06,
892
+ "loss": 0.0861,
893
  "step": 520
894
  },
895
  {
896
  "epoch": 9.53,
897
  "learning_rate": 3.3959428467570664e-06,
898
+ "loss": 0.1116,
899
  "step": 524
900
  },
901
  {
902
  "epoch": 9.6,
903
  "learning_rate": 2.4329828146074094e-06,
904
+ "loss": 0.0846,
905
  "step": 528
906
  },
907
  {
908
  "epoch": 9.67,
909
  "learning_rate": 1.6295661628624448e-06,
910
+ "loss": 0.0971,
911
  "step": 532
912
  },
913
  {
914
  "epoch": 9.75,
915
  "learning_rate": 9.862106495415469e-07,
916
+ "loss": 0.0972,
917
  "step": 536
918
  },
919
  {
920
  "epoch": 9.82,
921
  "learning_rate": 5.033308820289185e-07,
922
+ "loss": 0.0899,
923
  "step": 540
924
  },
925
  {
926
  "epoch": 9.89,
927
  "learning_rate": 1.8123804988159908e-07,
928
+ "loss": 0.0871,
929
  "step": 544
930
  },
931
  {
932
  "epoch": 9.96,
933
  "learning_rate": 2.0139724285161975e-08,
934
+ "loss": 0.1639,
935
  "step": 548
936
  },
937
  {
938
  "epoch": 10.0,
939
+ "pls_score": 67.6,
940
+ "std": 3.873602973976553,
941
  "step": 550
942
  },
943
  {
944
  "epoch": 10.0,
945
+ "eval_loss": 3.4886670112609863,
946
+ "eval_runtime": 4.9132,
947
+ "eval_samples_per_second": 4.681,
948
+ "eval_steps_per_second": 1.221,
949
  "step": 550
950
  },
951
  {
952
  "epoch": 10.0,
953
  "step": 550,
954
+ "total_flos": 1.876960441287475e+16,
955
+ "train_loss": 0.020198198123411698,
956
+ "train_runtime": 4992.1656,
957
+ "train_samples_per_second": 0.435,
958
+ "train_steps_per_second": 0.11
959
  }
960
  ],
961
  "logging_steps": 4,
 
963
  "num_input_tokens_seen": 0,
964
  "num_train_epochs": 10,
965
  "save_steps": 55,
966
+ "total_flos": 1.876960441287475e+16,
967
  "train_batch_size": 4,
968
  "trial_name": null,
969
  "trial_params": null