yyx123 committed on
Commit
015cf60
1 Parent(s): 0398b19

Model save

Browse files
Files changed (5) hide show
  1. README.md +7 -27
  2. all_results.json +9 -9
  3. eval_results.json +5 -5
  4. train_results.json +5 -5
  5. trainer_state.json +215 -423
README.md CHANGED
@@ -2,13 +2,9 @@
2
  license: other
3
  library_name: peft
4
  tags:
5
- - alignment-handbook
6
- - generated_from_trainer
7
  - trl
8
  - sft
9
  - generated_from_trainer
10
- datasets:
11
- - ruozhiba
12
  base_model: 01-ai/Yi-6B
13
  model-index:
14
  - name: Yi-6B-ruozhiba
@@ -20,9 +16,9 @@ should probably proofread and complete it, then remove this comment. -->
20
 
21
  # Yi-6B-ruozhiba
22
 
23
- This model is a fine-tuned version of [01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B) on the ruozhiba dataset.
24
  It achieves the following results on the evaluation set:
25
- - Loss: 4.1664
26
 
27
  ## Model description
28
 
@@ -48,32 +44,16 @@ The following hyperparameters were used during training:
48
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
49
  - lr_scheduler_type: cosine
50
  - lr_scheduler_warmup_ratio: 0.1
51
- - num_epochs: 20
52
 
53
  ### Training results
54
 
55
  | Training Loss | Epoch | Step | Validation Loss |
56
  |:-------------:|:-----:|:----:|:---------------:|
57
- | 2.333 | 1.0 | 55 | 2.0215 |
58
- | 1.8575 | 2.0 | 110 | 1.8371 |
59
- | 1.6435 | 3.0 | 165 | 1.8182 |
60
- | 1.4138 | 4.0 | 220 | 1.9196 |
61
- | 1.0749 | 5.0 | 275 | 2.2265 |
62
- | 0.6715 | 6.0 | 330 | 2.5187 |
63
- | 0.4252 | 7.0 | 385 | 2.8304 |
64
- | 0.2572 | 8.0 | 440 | 3.1702 |
65
- | 0.1543 | 9.0 | 495 | 3.3739 |
66
- | 0.1095 | 10.0 | 550 | 3.4964 |
67
- | 0.0875 | 11.0 | 605 | 3.6468 |
68
- | 0.0692 | 12.0 | 660 | 3.8002 |
69
- | 0.0573 | 13.0 | 715 | 3.9182 |
70
- | 0.0527 | 14.0 | 770 | 4.0117 |
71
- | 0.0494 | 15.0 | 825 | 4.0920 |
72
- | 0.0459 | 16.0 | 880 | 4.1272 |
73
- | 0.0456 | 17.0 | 935 | 4.1514 |
74
- | 0.0435 | 18.0 | 990 | 4.1618 |
75
- | 0.0446 | 19.0 | 1045 | 4.1660 |
76
- | 0.0428 | 20.0 | 1100 | 4.1664 |
77
 
78
 
79
  ### Framework versions
 
2
  license: other
3
  library_name: peft
4
  tags:
 
 
5
  - trl
6
  - sft
7
  - generated_from_trainer
 
 
8
  base_model: 01-ai/Yi-6B
9
  model-index:
10
  - name: Yi-6B-ruozhiba
 
16
 
17
  # Yi-6B-ruozhiba
18
 
19
+ This model is a fine-tuned version of [01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B) on the None dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 1.8389
22
 
23
  ## Model description
24
 
 
44
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
45
  - lr_scheduler_type: cosine
46
  - lr_scheduler_warmup_ratio: 0.1
47
+ - num_epochs: 4
48
 
49
  ### Training results
50
 
51
  | Training Loss | Epoch | Step | Validation Loss |
52
  |:-------------:|:-----:|:----:|:---------------:|
53
+ | 1.9256 | 1.0 | 55 | 1.8785 |
54
+ | 1.6738 | 2.0 | 110 | 1.8229 |
55
+ | 1.6181 | 3.0 | 165 | 1.8283 |
56
+ | 1.5007 | 4.0 | 220 | 1.8389 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
 
59
  ### Framework versions
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 20.0,
3
- "eval_loss": 4.166376113891602,
4
- "eval_runtime": 4.9235,
5
  "eval_samples": 23,
6
- "eval_samples_per_second": 4.672,
7
- "eval_steps_per_second": 1.219,
8
- "train_loss": 0.5259156136621128,
9
- "train_runtime": 7800.8143,
10
  "train_samples": 217,
11
- "train_samples_per_second": 0.556,
12
- "train_steps_per_second": 0.141
13
  }
 
1
  {
2
+ "epoch": 4.0,
3
+ "eval_loss": 1.8388779163360596,
4
+ "eval_runtime": 4.9283,
5
  "eval_samples": 23,
6
+ "eval_samples_per_second": 4.667,
7
+ "eval_steps_per_second": 1.217,
8
+ "train_loss": 1.7501471573656255,
9
+ "train_runtime": 1644.9971,
10
  "train_samples": 217,
11
+ "train_samples_per_second": 0.528,
12
+ "train_steps_per_second": 0.134
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 20.0,
3
- "eval_loss": 4.166376113891602,
4
- "eval_runtime": 4.9235,
5
  "eval_samples": 23,
6
- "eval_samples_per_second": 4.672,
7
- "eval_steps_per_second": 1.219
8
  }
 
1
  {
2
+ "epoch": 4.0,
3
+ "eval_loss": 1.8388779163360596,
4
+ "eval_runtime": 4.9283,
5
  "eval_samples": 23,
6
+ "eval_samples_per_second": 4.667,
7
+ "eval_steps_per_second": 1.217
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 20.0,
3
- "train_loss": 0.5259156136621128,
4
- "train_runtime": 7800.8143,
5
  "train_samples": 217,
6
- "train_samples_per_second": 0.556,
7
- "train_steps_per_second": 0.141
8
  }
 
1
  {
2
+ "epoch": 4.0,
3
+ "train_loss": 1.7501471573656255,
4
+ "train_runtime": 1644.9971,
5
  "train_samples": 217,
6
+ "train_samples_per_second": 0.528,
7
+ "train_steps_per_second": 0.134
8
  }
trainer_state.json CHANGED
@@ -1,625 +1,417 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 20.0,
5
  "eval_steps": 500,
6
- "global_step": 1100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.02,
13
- "learning_rate": 4.545454545454545e-07,
14
  "loss": 2.3833,
15
  "step": 1
16
  },
17
  {
18
- "epoch": 0.36,
19
  "learning_rate": 9.090909090909091e-06,
20
- "loss": 2.345,
21
- "step": 20
22
  },
23
  {
24
- "epoch": 0.73,
25
  "learning_rate": 1.8181818181818182e-05,
26
- "loss": 2.333,
27
- "step": 40
28
- },
29
- {
30
- "epoch": 1.0,
31
- "gpt4_scores": 0.21666666666666667,
32
- "step": 55
33
  },
34
  {
35
- "epoch": 1.0,
36
- "eval_loss": 2.0215015411376953,
37
- "eval_runtime": 4.9229,
38
- "eval_samples_per_second": 4.672,
39
- "eval_steps_per_second": 1.219,
40
- "step": 55
41
- },
42
- {
43
- "epoch": 1.09,
44
  "learning_rate": 2.7272727272727273e-05,
45
- "loss": 2.1124,
46
- "step": 60
47
  },
48
  {
49
- "epoch": 1.45,
50
  "learning_rate": 3.6363636363636364e-05,
51
- "loss": 1.9167,
52
- "step": 80
53
  },
54
  {
55
- "epoch": 1.82,
56
  "learning_rate": 4.545454545454546e-05,
57
- "loss": 1.8575,
58
- "step": 100
59
- },
60
- {
61
- "epoch": 2.0,
62
- "gpt4_scores": 0.7166666666666667,
63
- "step": 110
64
- },
65
- {
66
- "epoch": 2.0,
67
- "eval_loss": 1.8371087312698364,
68
- "eval_runtime": 4.9624,
69
- "eval_samples_per_second": 4.635,
70
- "eval_steps_per_second": 1.209,
71
- "step": 110
72
  },
73
  {
74
- "epoch": 2.18,
75
  "learning_rate": 4.9987413559579636e-05,
76
- "loss": 1.7124,
77
- "step": 120
78
  },
79
  {
80
- "epoch": 2.55,
81
  "learning_rate": 4.988679806432712e-05,
82
- "loss": 1.7283,
83
- "step": 140
84
  },
85
  {
86
- "epoch": 2.91,
87
  "learning_rate": 4.968597221690986e-05,
88
- "loss": 1.6435,
89
- "step": 160
90
- },
91
- {
92
- "epoch": 3.0,
93
- "gpt4_scores": 0.7166666666666667,
94
- "step": 165
95
  },
96
  {
97
- "epoch": 3.0,
98
- "eval_loss": 1.8182122707366943,
99
- "eval_runtime": 4.9633,
100
- "eval_samples_per_second": 4.634,
101
- "eval_steps_per_second": 1.209,
102
- "step": 165
103
- },
104
- {
105
- "epoch": 3.27,
106
  "learning_rate": 4.938574467213518e-05,
107
- "loss": 1.4997,
108
- "step": 180
109
  },
110
  {
111
- "epoch": 3.64,
112
  "learning_rate": 4.898732434036244e-05,
113
- "loss": 1.4507,
114
- "step": 200
115
  },
116
  {
117
- "epoch": 4.0,
118
  "learning_rate": 4.849231551964771e-05,
119
- "loss": 1.4138,
120
- "step": 220
121
- },
122
- {
123
- "epoch": 4.0,
124
- "gpt4_scores": 0.7999999999999999,
125
- "step": 220
126
- },
127
- {
128
- "epoch": 4.0,
129
- "eval_loss": 1.9195586442947388,
130
- "eval_runtime": 4.9623,
131
- "eval_samples_per_second": 4.635,
132
- "eval_steps_per_second": 1.209,
133
- "step": 220
134
  },
135
  {
136
- "epoch": 4.36,
137
  "learning_rate": 4.790271143580174e-05,
138
- "loss": 1.0929,
139
- "step": 240
140
  },
141
  {
142
- "epoch": 4.73,
143
  "learning_rate": 4.722088621637309e-05,
144
- "loss": 1.0749,
145
- "step": 260
146
  },
147
  {
148
- "epoch": 5.0,
149
- "gpt4_scores": 0.6333333333333333,
150
- "step": 275
151
  },
152
  {
153
- "epoch": 5.0,
154
- "eval_loss": 2.2265050411224365,
155
- "eval_runtime": 4.9358,
156
- "eval_samples_per_second": 4.66,
157
- "eval_steps_per_second": 1.216,
158
- "step": 275
159
  },
160
  {
161
- "epoch": 5.09,
162
  "learning_rate": 4.644958533087443e-05,
163
- "loss": 0.9318,
164
- "step": 280
165
  },
166
  {
167
- "epoch": 5.45,
168
  "learning_rate": 4.559191453574582e-05,
169
- "loss": 0.6893,
170
- "step": 300
171
  },
172
  {
173
- "epoch": 5.82,
174
  "learning_rate": 4.465132736856969e-05,
175
- "loss": 0.6715,
176
- "step": 320
177
- },
178
- {
179
- "epoch": 6.0,
180
- "gpt4_scores": 0.7666666666666666,
181
- "step": 330
182
- },
183
- {
184
- "epoch": 6.0,
185
- "eval_loss": 2.5187041759490967,
186
- "eval_runtime": 4.9618,
187
- "eval_samples_per_second": 4.635,
188
- "eval_steps_per_second": 1.209,
189
- "step": 330
190
  },
191
  {
192
- "epoch": 6.18,
193
  "learning_rate": 4.3631611241893874e-05,
194
- "loss": 0.5555,
195
- "step": 340
196
  },
197
  {
198
- "epoch": 6.55,
199
  "learning_rate": 4.2536872192658036e-05,
200
- "loss": 0.4251,
201
- "step": 360
202
  },
203
  {
204
- "epoch": 6.91,
205
  "learning_rate": 4.137151834863213e-05,
206
- "loss": 0.4252,
207
- "step": 380
208
- },
209
- {
210
- "epoch": 7.0,
211
- "gpt4_scores": 0.7833333333333333,
212
- "step": 385
213
- },
214
- {
215
- "epoch": 7.0,
216
- "eval_loss": 2.830350875854492,
217
- "eval_runtime": 4.9687,
218
- "eval_samples_per_second": 4.629,
219
- "eval_steps_per_second": 1.208,
220
- "step": 385
221
  },
222
  {
223
- "epoch": 7.27,
224
  "learning_rate": 4.014024217844167e-05,
225
- "loss": 0.295,
226
- "step": 400
227
  },
228
  {
229
- "epoch": 7.64,
230
  "learning_rate": 3.884800159665276e-05,
231
- "loss": 0.2544,
232
- "step": 420
233
  },
234
  {
235
- "epoch": 8.0,
236
  "learning_rate": 3.7500000000000003e-05,
237
- "loss": 0.2572,
238
- "step": 440
239
- },
240
- {
241
- "epoch": 8.0,
242
- "gpt4_scores": 0.5833333333333334,
243
- "step": 440
244
- },
245
- {
246
- "epoch": 8.0,
247
- "eval_loss": 3.170214891433716,
248
- "eval_runtime": 4.9207,
249
- "eval_samples_per_second": 4.674,
250
- "eval_steps_per_second": 1.219,
251
- "step": 440
252
  },
253
  {
254
- "epoch": 8.36,
255
  "learning_rate": 3.610166531514436e-05,
256
- "loss": 0.1568,
257
- "step": 460
258
  },
259
  {
260
- "epoch": 8.73,
261
  "learning_rate": 3.465862814232822e-05,
262
- "loss": 0.1543,
263
- "step": 480
264
- },
265
- {
266
- "epoch": 9.0,
267
- "gpt4_scores": 0.5833333333333334,
268
- "step": 495
269
- },
270
- {
271
- "epoch": 9.0,
272
- "eval_loss": 3.3739395141601562,
273
- "eval_runtime": 4.953,
274
- "eval_samples_per_second": 4.644,
275
- "eval_steps_per_second": 1.211,
276
- "step": 495
277
  },
278
  {
279
- "epoch": 9.09,
280
  "learning_rate": 3.3176699082935545e-05,
281
- "loss": 0.1501,
282
- "step": 500
283
  },
284
  {
285
- "epoch": 9.45,
286
  "learning_rate": 3.166184534225087e-05,
287
- "loss": 0.1172,
288
- "step": 520
289
  },
290
  {
291
- "epoch": 9.82,
292
  "learning_rate": 3.012016670162977e-05,
293
- "loss": 0.1095,
294
- "step": 540
295
  },
296
  {
297
- "epoch": 10.0,
298
- "gpt4_scores": 0.75,
299
- "step": 550
300
  },
301
  {
302
- "epoch": 10.0,
303
- "eval_loss": 3.496412515640259,
304
- "eval_runtime": 4.961,
305
- "eval_samples_per_second": 4.636,
306
- "eval_steps_per_second": 1.209,
307
- "step": 550
308
  },
309
  {
310
- "epoch": 10.18,
311
  "learning_rate": 2.8557870956832132e-05,
312
- "loss": 0.1012,
313
- "step": 560
314
  },
315
  {
316
- "epoch": 10.55,
317
  "learning_rate": 2.698124892141971e-05,
318
- "loss": 0.0825,
319
- "step": 580
320
  },
321
  {
322
- "epoch": 10.91,
323
  "learning_rate": 2.5396649095870202e-05,
324
- "loss": 0.0875,
325
- "step": 600
326
- },
327
- {
328
- "epoch": 11.0,
329
- "gpt4_scores": 0.48333333333333334,
330
- "step": 605
331
- },
332
- {
333
- "epoch": 11.0,
334
- "eval_loss": 3.64684796333313,
335
- "eval_runtime": 4.9227,
336
- "eval_samples_per_second": 4.672,
337
- "eval_steps_per_second": 1.219,
338
- "step": 605
339
  },
340
  {
341
- "epoch": 11.27,
342
  "learning_rate": 2.3810452104406444e-05,
343
- "loss": 0.0739,
344
- "step": 620
345
  },
346
  {
347
- "epoch": 11.64,
348
  "learning_rate": 2.222904500247473e-05,
349
- "loss": 0.0683,
350
- "step": 640
351
  },
352
  {
353
- "epoch": 12.0,
354
  "learning_rate": 2.0658795558326743e-05,
355
- "loss": 0.0692,
356
- "step": 660
357
- },
358
- {
359
- "epoch": 12.0,
360
- "gpt4_scores": 0.7166666666666667,
361
- "step": 660
362
  },
363
  {
364
- "epoch": 12.0,
365
- "eval_loss": 3.800189256668091,
366
- "eval_runtime": 4.9225,
367
- "eval_samples_per_second": 4.672,
368
- "eval_steps_per_second": 1.219,
369
- "step": 660
370
- },
371
- {
372
- "epoch": 12.36,
373
  "learning_rate": 1.9106026612264316e-05,
374
- "loss": 0.0617,
375
- "step": 680
376
  },
377
  {
378
- "epoch": 12.73,
379
  "learning_rate": 1.7576990616793137e-05,
380
- "loss": 0.0573,
381
- "step": 700
382
- },
383
- {
384
- "epoch": 13.0,
385
- "gpt4_scores": 0.5,
386
- "step": 715
387
- },
388
- {
389
- "epoch": 13.0,
390
- "eval_loss": 3.918210506439209,
391
- "eval_runtime": 4.9555,
392
- "eval_samples_per_second": 4.641,
393
- "eval_steps_per_second": 1.211,
394
- "step": 715
395
  },
396
  {
397
- "epoch": 13.09,
398
  "learning_rate": 1.6077844460203206e-05,
399
- "loss": 0.058,
400
- "step": 720
401
  },
402
  {
403
- "epoch": 13.45,
404
  "learning_rate": 1.4614624674952842e-05,
405
- "loss": 0.0512,
406
- "step": 740
407
  },
408
  {
409
- "epoch": 13.82,
410
  "learning_rate": 1.3193223130682936e-05,
411
- "loss": 0.0527,
412
- "step": 760
413
- },
414
- {
415
- "epoch": 14.0,
416
- "gpt4_scores": 0.5166666666666667,
417
- "step": 770
418
  },
419
  {
420
- "epoch": 14.0,
421
- "eval_loss": 4.011707782745361,
422
- "eval_runtime": 4.9674,
423
- "eval_samples_per_second": 4.63,
424
- "eval_steps_per_second": 1.208,
425
- "step": 770
426
- },
427
- {
428
- "epoch": 14.18,
429
  "learning_rate": 1.181936330973744e-05,
430
- "loss": 0.0498,
431
- "step": 780
432
  },
433
  {
434
- "epoch": 14.55,
435
  "learning_rate": 1.049857726072005e-05,
436
- "loss": 0.0489,
437
- "step": 800
438
  },
439
  {
440
- "epoch": 14.91,
441
  "learning_rate": 9.236183322886945e-06,
442
- "loss": 0.0494,
443
- "step": 820
444
  },
445
  {
446
- "epoch": 15.0,
447
- "gpt4_scores": 0.65,
448
- "step": 825
449
  },
450
  {
451
- "epoch": 15.0,
452
- "eval_loss": 4.091977596282959,
453
- "eval_runtime": 4.9428,
454
- "eval_samples_per_second": 4.653,
455
- "eval_steps_per_second": 1.214,
456
- "step": 825
457
  },
458
  {
459
- "epoch": 15.27,
460
  "learning_rate": 8.0372647110717e-06,
461
- "loss": 0.0471,
462
- "step": 840
463
  },
464
  {
465
- "epoch": 15.64,
466
  "learning_rate": 6.906649047373246e-06,
467
- "loss": 0.0461,
468
- "step": 860
469
  },
470
  {
471
- "epoch": 16.0,
472
  "learning_rate": 5.848888922025553e-06,
473
- "loss": 0.0459,
474
- "step": 880
475
- },
476
- {
477
- "epoch": 16.0,
478
- "gpt4_scores": 0.5833333333333334,
479
- "step": 880
480
  },
481
  {
482
- "epoch": 16.0,
483
- "eval_loss": 4.127155303955078,
484
- "eval_runtime": 4.9427,
485
- "eval_samples_per_second": 4.653,
486
- "eval_steps_per_second": 1.214,
487
- "step": 880
488
- },
489
- {
490
- "epoch": 16.36,
491
  "learning_rate": 4.868243561723535e-06,
492
- "loss": 0.0453,
493
- "step": 900
494
  },
495
  {
496
- "epoch": 16.73,
497
  "learning_rate": 3.968661679220468e-06,
498
- "loss": 0.0456,
499
- "step": 920
500
- },
501
- {
502
- "epoch": 17.0,
503
- "gpt4_scores": 0.43333333333333335,
504
- "step": 935
505
- },
506
- {
507
- "epoch": 17.0,
508
- "eval_loss": 4.1514201164245605,
509
- "eval_runtime": 4.9682,
510
- "eval_samples_per_second": 4.629,
511
- "eval_steps_per_second": 1.208,
512
- "step": 935
513
  },
514
  {
515
- "epoch": 17.09,
516
  "learning_rate": 3.1537655732553768e-06,
517
- "loss": 0.0469,
518
- "step": 940
519
  },
520
  {
521
- "epoch": 17.45,
522
  "learning_rate": 2.4268365428344736e-06,
523
- "loss": 0.0447,
524
- "step": 960
525
  },
526
  {
527
- "epoch": 17.82,
528
  "learning_rate": 1.790801674598186e-06,
529
- "loss": 0.0435,
530
- "step": 980
531
  },
532
  {
533
- "epoch": 18.0,
534
- "gpt4_scores": 0.6333333333333333,
535
- "step": 990
536
- },
537
- {
538
- "epoch": 18.0,
539
- "eval_loss": 4.1617751121521,
540
- "eval_runtime": 4.9681,
541
- "eval_samples_per_second": 4.63,
542
- "eval_steps_per_second": 1.208,
543
- "step": 990
544
- },
545
- {
546
- "epoch": 18.18,
547
  "learning_rate": 1.248222056476367e-06,
548
- "loss": 0.0484,
549
- "step": 1000
550
  },
551
  {
552
- "epoch": 18.55,
553
  "learning_rate": 8.012824650910938e-07,
554
- "loss": 0.0436,
555
- "step": 1020
556
  },
557
  {
558
- "epoch": 18.91,
559
  "learning_rate": 4.517825684323324e-07,
560
- "loss": 0.0446,
561
- "step": 1040
562
- },
563
- {
564
- "epoch": 19.0,
565
- "gpt4_scores": 0.6166666666666667,
566
- "step": 1045
567
- },
568
- {
569
- "epoch": 19.0,
570
- "eval_loss": 4.1660075187683105,
571
- "eval_runtime": 4.9125,
572
- "eval_samples_per_second": 4.682,
573
- "eval_steps_per_second": 1.221,
574
- "step": 1045
575
  },
576
  {
577
- "epoch": 19.27,
578
  "learning_rate": 2.011296792301165e-07,
579
- "loss": 0.0463,
580
- "step": 1060
581
  },
582
  {
583
- "epoch": 19.64,
584
  "learning_rate": 5.033308820289184e-08,
585
- "loss": 0.0473,
586
- "step": 1080
587
  },
588
  {
589
- "epoch": 20.0,
590
  "learning_rate": 0.0,
591
- "loss": 0.0428,
592
- "step": 1100
593
  },
594
  {
595
- "epoch": 20.0,
596
- "gpt4_scores": 0.6833333333333332,
597
- "step": 1100
598
  },
599
  {
600
- "epoch": 20.0,
601
- "eval_loss": 4.166376113891602,
602
- "eval_runtime": 4.9632,
603
- "eval_samples_per_second": 4.634,
604
- "eval_steps_per_second": 1.209,
605
- "step": 1100
606
  },
607
  {
608
- "epoch": 20.0,
609
- "step": 1100,
610
- "total_flos": 3.792163606268314e+16,
611
- "train_loss": 0.5259156136621128,
612
- "train_runtime": 7800.8143,
613
- "train_samples_per_second": 0.556,
614
- "train_steps_per_second": 0.141
615
  }
616
  ],
617
- "logging_steps": 20,
618
- "max_steps": 1100,
619
  "num_input_tokens_seen": 0,
620
- "num_train_epochs": 20,
621
  "save_steps": 20,
622
- "total_flos": 3.792163606268314e+16,
623
  "train_batch_size": 4,
624
  "trial_name": null,
625
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 4.0,
5
  "eval_steps": 500,
6
+ "global_step": 220,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.02,
13
+ "learning_rate": 2.2727272727272728e-06,
14
  "loss": 2.3833,
15
  "step": 1
16
  },
17
  {
18
+ "epoch": 0.07,
19
  "learning_rate": 9.090909090909091e-06,
20
+ "loss": 2.4762,
21
+ "step": 4
22
  },
23
  {
24
+ "epoch": 0.15,
25
  "learning_rate": 1.8181818181818182e-05,
26
+ "loss": 2.2934,
27
+ "step": 8
 
 
 
 
 
28
  },
29
  {
30
+ "epoch": 0.22,
 
 
 
 
 
 
 
 
31
  "learning_rate": 2.7272727272727273e-05,
32
+ "loss": 2.2509,
33
+ "step": 12
34
  },
35
  {
36
+ "epoch": 0.29,
37
  "learning_rate": 3.6363636363636364e-05,
38
+ "loss": 2.1881,
39
+ "step": 16
40
  },
41
  {
42
+ "epoch": 0.36,
43
  "learning_rate": 4.545454545454546e-05,
44
+ "loss": 2.2269,
45
+ "step": 20
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  },
47
  {
48
+ "epoch": 0.44,
49
  "learning_rate": 4.9987413559579636e-05,
50
+ "loss": 2.1204,
51
+ "step": 24
52
  },
53
  {
54
+ "epoch": 0.51,
55
  "learning_rate": 4.988679806432712e-05,
56
+ "loss": 2.1803,
57
+ "step": 28
58
  },
59
  {
60
+ "epoch": 0.58,
61
  "learning_rate": 4.968597221690986e-05,
62
+ "loss": 2.0278,
63
+ "step": 32
 
 
 
 
 
64
  },
65
  {
66
+ "epoch": 0.65,
 
 
 
 
 
 
 
 
67
  "learning_rate": 4.938574467213518e-05,
68
+ "loss": 1.9672,
69
+ "step": 36
70
  },
71
  {
72
+ "epoch": 0.73,
73
  "learning_rate": 4.898732434036244e-05,
74
+ "loss": 1.8425,
75
+ "step": 40
76
  },
77
  {
78
+ "epoch": 0.8,
79
  "learning_rate": 4.849231551964771e-05,
80
+ "loss": 1.9323,
81
+ "step": 44
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  },
83
  {
84
+ "epoch": 0.87,
85
  "learning_rate": 4.790271143580174e-05,
86
+ "loss": 2.0277,
87
+ "step": 48
88
  },
89
  {
90
+ "epoch": 0.95,
91
  "learning_rate": 4.722088621637309e-05,
92
+ "loss": 1.9256,
93
+ "step": 52
94
  },
95
  {
96
+ "epoch": 1.0,
97
+ "gpt4_scores": 0.6833333333333332,
98
+ "step": 55
99
  },
100
  {
101
+ "epoch": 1.0,
102
+ "eval_loss": 1.878501534461975,
103
+ "eval_runtime": 4.9427,
104
+ "eval_samples_per_second": 4.653,
105
+ "eval_steps_per_second": 1.214,
106
+ "step": 55
107
  },
108
  {
109
+ "epoch": 1.02,
110
  "learning_rate": 4.644958533087443e-05,
111
+ "loss": 1.7957,
112
+ "step": 56
113
  },
114
  {
115
+ "epoch": 1.09,
116
  "learning_rate": 4.559191453574582e-05,
117
+ "loss": 1.8135,
118
+ "step": 60
119
  },
120
  {
121
+ "epoch": 1.16,
122
  "learning_rate": 4.465132736856969e-05,
123
+ "loss": 1.7996,
124
+ "step": 64
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  },
126
  {
127
+ "epoch": 1.24,
128
  "learning_rate": 4.3631611241893874e-05,
129
+ "loss": 1.7751,
130
+ "step": 68
131
  },
132
  {
133
+ "epoch": 1.31,
134
  "learning_rate": 4.2536872192658036e-05,
135
+ "loss": 1.8436,
136
+ "step": 72
137
  },
138
  {
139
+ "epoch": 1.38,
140
  "learning_rate": 4.137151834863213e-05,
141
+ "loss": 1.7179,
142
+ "step": 76
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  },
144
  {
145
+ "epoch": 1.45,
146
  "learning_rate": 4.014024217844167e-05,
147
+ "loss": 1.721,
148
+ "step": 80
149
  },
150
  {
151
+ "epoch": 1.53,
152
  "learning_rate": 3.884800159665276e-05,
153
+ "loss": 1.8576,
154
+ "step": 84
155
  },
156
  {
157
+ "epoch": 1.6,
158
  "learning_rate": 3.7500000000000003e-05,
159
+ "loss": 1.8062,
160
+ "step": 88
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  },
162
  {
163
+ "epoch": 1.67,
164
  "learning_rate": 3.610166531514436e-05,
165
+ "loss": 1.6772,
166
+ "step": 92
167
  },
168
  {
169
+ "epoch": 1.75,
170
  "learning_rate": 3.465862814232822e-05,
171
+ "loss": 1.7685,
172
+ "step": 96
 
 
 
 
 
 
 
 
 
 
 
 
 
173
  },
174
  {
175
+ "epoch": 1.82,
176
  "learning_rate": 3.3176699082935545e-05,
177
+ "loss": 1.721,
178
+ "step": 100
179
  },
180
  {
181
+ "epoch": 1.89,
182
  "learning_rate": 3.166184534225087e-05,
183
+ "loss": 1.6726,
184
+ "step": 104
185
  },
186
  {
187
+ "epoch": 1.96,
188
  "learning_rate": 3.012016670162977e-05,
189
+ "loss": 1.6738,
190
+ "step": 108
191
  },
192
  {
193
+ "epoch": 2.0,
194
+ "gpt4_scores": 0.6666666666666666,
195
+ "step": 110
196
  },
197
  {
198
+ "epoch": 2.0,
199
+ "eval_loss": 1.8228907585144043,
200
+ "eval_runtime": 4.9969,
201
+ "eval_samples_per_second": 4.603,
202
+ "eval_steps_per_second": 1.201,
203
+ "step": 110
204
  },
205
  {
206
+ "epoch": 2.04,
207
  "learning_rate": 2.8557870956832132e-05,
208
+ "loss": 1.6321,
209
+ "step": 112
210
  },
211
  {
212
+ "epoch": 2.11,
213
  "learning_rate": 2.698124892141971e-05,
214
+ "loss": 1.6618,
215
+ "step": 116
216
  },
217
  {
218
+ "epoch": 2.18,
219
  "learning_rate": 2.5396649095870202e-05,
220
+ "loss": 1.5755,
221
+ "step": 120
 
 
 
 
 
 
 
 
 
 
 
 
 
222
  },
223
  {
224
+ "epoch": 2.25,
225
  "learning_rate": 2.3810452104406444e-05,
226
+ "loss": 1.6999,
227
+ "step": 124
228
  },
229
  {
230
+ "epoch": 2.33,
231
  "learning_rate": 2.222904500247473e-05,
232
+ "loss": 1.6391,
233
+ "step": 128
234
  },
235
  {
236
+ "epoch": 2.4,
237
  "learning_rate": 2.0658795558326743e-05,
238
+ "loss": 1.583,
239
+ "step": 132
 
 
 
 
 
240
  },
241
  {
242
+ "epoch": 2.47,
 
 
 
 
 
 
 
 
243
  "learning_rate": 1.9106026612264316e-05,
244
+ "loss": 1.6833,
245
+ "step": 136
246
  },
247
  {
248
+ "epoch": 2.55,
249
  "learning_rate": 1.7576990616793137e-05,
250
+ "loss": 1.6736,
251
+ "step": 140
 
 
 
 
 
 
 
 
 
 
 
 
 
252
  },
253
  {
254
+ "epoch": 2.62,
255
  "learning_rate": 1.6077844460203206e-05,
256
+ "loss": 1.5683,
257
+ "step": 144
258
  },
259
  {
260
+ "epoch": 2.69,
261
  "learning_rate": 1.4614624674952842e-05,
262
+ "loss": 1.5429,
263
+ "step": 148
264
  },
265
  {
266
+ "epoch": 2.76,
267
  "learning_rate": 1.3193223130682936e-05,
268
+ "loss": 1.6129,
269
+ "step": 152
 
 
 
 
 
270
  },
271
  {
272
+ "epoch": 2.84,
 
 
 
 
 
 
 
 
273
  "learning_rate": 1.181936330973744e-05,
274
+ "loss": 1.6105,
275
+ "step": 156
276
  },
277
  {
278
+ "epoch": 2.91,
279
  "learning_rate": 1.049857726072005e-05,
280
+ "loss": 1.5663,
281
+ "step": 160
282
  },
283
  {
284
+ "epoch": 2.98,
285
  "learning_rate": 9.236183322886945e-06,
286
+ "loss": 1.6181,
287
+ "step": 164
288
  },
289
  {
290
+ "epoch": 3.0,
291
+ "gpt4_scores": 0.7000000000000001,
292
+ "step": 165
293
  },
294
  {
295
+ "epoch": 3.0,
296
+ "eval_loss": 1.8283250331878662,
297
+ "eval_runtime": 4.9725,
298
+ "eval_samples_per_second": 4.625,
299
+ "eval_steps_per_second": 1.207,
300
+ "step": 165
301
  },
302
  {
303
+ "epoch": 3.05,
304
  "learning_rate": 8.0372647110717e-06,
305
+ "loss": 1.5877,
306
+ "step": 168
307
  },
308
  {
309
+ "epoch": 3.13,
310
  "learning_rate": 6.906649047373246e-06,
311
+ "loss": 1.3838,
312
+ "step": 172
313
  },
314
  {
315
+ "epoch": 3.2,
316
  "learning_rate": 5.848888922025553e-06,
317
+ "loss": 1.5406,
318
+ "step": 176
 
 
 
 
 
319
  },
320
  {
321
+ "epoch": 3.27,
 
 
 
 
 
 
 
 
322
  "learning_rate": 4.868243561723535e-06,
323
+ "loss": 1.497,
324
+ "step": 180
325
  },
326
  {
327
+ "epoch": 3.35,
328
  "learning_rate": 3.968661679220468e-06,
329
+ "loss": 1.5702,
330
+ "step": 184
 
 
 
 
 
 
 
 
 
 
 
 
 
331
  },
332
  {
333
+ "epoch": 3.42,
334
  "learning_rate": 3.1537655732553768e-06,
335
+ "loss": 1.6095,
336
+ "step": 188
337
  },
338
  {
339
+ "epoch": 3.49,
340
  "learning_rate": 2.4268365428344736e-06,
341
+ "loss": 1.4712,
342
+ "step": 192
343
  },
344
  {
345
+ "epoch": 3.56,
346
  "learning_rate": 1.790801674598186e-06,
347
+ "loss": 1.5552,
348
+ "step": 196
349
  },
350
  {
351
+ "epoch": 3.64,
 
 
 
 
 
 
 
 
 
 
 
 
 
352
  "learning_rate": 1.248222056476367e-06,
353
+ "loss": 1.5805,
354
+ "step": 200
355
  },
356
  {
357
+ "epoch": 3.71,
358
  "learning_rate": 8.012824650910938e-07,
359
+ "loss": 1.5588,
360
+ "step": 204
361
  },
362
  {
363
+ "epoch": 3.78,
364
  "learning_rate": 4.517825684323324e-07,
365
+ "loss": 1.64,
366
+ "step": 208
 
 
 
 
 
 
 
 
 
 
 
 
 
367
  },
368
  {
369
+ "epoch": 3.85,
370
  "learning_rate": 2.011296792301165e-07,
371
+ "loss": 1.5666,
372
+ "step": 212
373
  },
374
  {
375
+ "epoch": 3.93,
376
  "learning_rate": 5.033308820289184e-08,
377
+ "loss": 1.4496,
378
+ "step": 216
379
  },
380
  {
381
+ "epoch": 4.0,
382
  "learning_rate": 0.0,
383
+ "loss": 1.5007,
384
+ "step": 220
385
  },
386
  {
387
+ "epoch": 4.0,
388
+ "gpt4_scores": 0.6666666666666666,
389
+ "step": 220
390
  },
391
  {
392
+ "epoch": 4.0,
393
+ "eval_loss": 1.8388779163360596,
394
+ "eval_runtime": 4.9378,
395
+ "eval_samples_per_second": 4.658,
396
+ "eval_steps_per_second": 1.215,
397
+ "step": 220
398
  },
399
  {
400
+ "epoch": 4.0,
401
+ "step": 220,
402
+ "total_flos": 7586259830267904.0,
403
+ "train_loss": 1.7501471573656255,
404
+ "train_runtime": 1644.9971,
405
+ "train_samples_per_second": 0.528,
406
+ "train_steps_per_second": 0.134
407
  }
408
  ],
409
+ "logging_steps": 4,
410
+ "max_steps": 220,
411
  "num_input_tokens_seen": 0,
412
+ "num_train_epochs": 4,
413
  "save_steps": 20,
414
+ "total_flos": 7586259830267904.0,
415
  "train_batch_size": 4,
416
  "trial_name": null,
417
  "trial_params": null