yyx123 committed
Commit adda7a6
1 Parent(s): 764acdc

Model save

README.md CHANGED
@@ -2,13 +2,9 @@
  license: other
  library_name: peft
  tags:
- - alignment-handbook
- - generated_from_trainer
  - trl
  - sft
  - generated_from_trainer
- datasets:
- - ruozhiba
  base_model: 01-ai/Yi-6B
  model-index:
  - name: Yi-6B-ruozhiba
@@ -20,9 +16,9 @@ should probably proofread and complete it, then remove this comment. -->

  # Yi-6B-ruozhiba

- This model is a fine-tuned version of [01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B) on the ruozhiba dataset.
+ This model is a fine-tuned version of [01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B) on the None dataset.
  It achieves the following results on the evaluation set:
- - Loss: 2.1235
+ - Loss: 2.2082

  ## Model description

@@ -54,12 +50,11 @@ The following hyperparameters were used during training:

  | Training Loss | Epoch | Step | Validation Loss |
  |:-------------:|:-----:|:----:|:---------------:|
- | 2.4558 | 1.0 | 19 | 2.1018 |
- | 2.4558 | 2.0 | 38 | 1.9877 |
- | 2.0718 | 3.0 | 57 | 1.9598 |
- | 2.0718 | 4.0 | 76 | 1.9522 |
- | 1.9638 | 3.0 | 1206 | 2.4228 |
- | 1.8176 | 4.0 | 1608 | 2.4900 |
+ | 1.948 | 1.0 | 217 | 1.9488 |
+ | 1.7781 | 2.0 | 434 | 1.9393 |
+ | 1.4563 | 3.0 | 651 | 2.0187 |
+ | 1.3206 | 4.0 | 868 | 2.1767 |
+ | 1.1018 | 5.0 | 1085 | 2.2082 |


  ### Framework versions
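For context on how an adapter published by a commit like this is typically consumed, the sketch below loads the 01-ai/Yi-6B base model and applies the PEFT adapter with the `peft` library. The adapter repo id `yyx123/Yi-6B-ruozhiba`, the dtype, and the device settings are assumptions for illustration, not something this commit specifies.

```python
# Minimal sketch: load the Yi-6B base model and apply this PEFT (LoRA) adapter.
# "yyx123/Yi-6B-ruozhiba" is an assumed repo id derived from the commit author
# and model name; adjust to wherever the adapter actually lives.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "01-ai/Yi-6B"
adapter_id = "yyx123/Yi-6B-ruozhiba"  # assumption

tokenizer = AutoTokenizer.from_pretrained(base_id)
base = AutoModelForCausalLM.from_pretrained(
    base_id, torch_dtype=torch.bfloat16, device_map="auto"
)
model = PeftModel.from_pretrained(base, adapter_id)  # wraps base with adapter weights

prompt = "Hello, please introduce yourself."
inputs = tokenizer(prompt, return_tensors="pt").to(base.device)
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```

If the adapter weights are merged into the base model first (`model.merge_and_unload()`), the result behaves like a plain `AutoModelForCausalLM` checkpoint at inference time.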
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e4e14f6400e394f4ec3e521399218a2b28c2755cfafaab8c44d2e8daa90f6f7d
+ oid sha256:ff29234f0bb30543464da6cc219166f299f4a42af925bef04b8a8aa2aa26d45c
  size 72673912
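Because adapter_model.safetensors is stored in Git LFS, the diff records only the pointer (spec version, sha256 oid, and byte size), not the weights themselves. A downloaded copy can be checked against the new pointer; the local path below is a placeholder, not something the commit provides.

```python
# Sketch: confirm a locally downloaded adapter_model.safetensors matches the
# LFS pointer recorded in this commit (sha256 oid + size).
import hashlib
import os

path = "adapter_model.safetensors"  # placeholder path
expected_oid = "ff29234f0bb30543464da6cc219166f299f4a42af925bef04b8a8aa2aa26d45c"
expected_size = 72673912

digest = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        digest.update(chunk)

assert os.path.getsize(path) == expected_size, "size mismatch"
assert digest.hexdigest() == expected_oid, "sha256 mismatch"
print("adapter file matches the LFS pointer")
```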
all_results.json CHANGED
@@ -1,13 +1,13 @@
  {
- "epoch": 0.56,
- "eval_loss": 2.123528480529785,
- "eval_runtime": 4.2701,
+ "epoch": 5.0,
+ "eval_loss": 2.2082066535949707,
+ "eval_runtime": 2.7423,
  "eval_samples": 23,
- "eval_samples_per_second": 5.386,
- "eval_steps_per_second": 5.386,
- "train_loss": 0.0,
- "train_runtime": 9.8498,
+ "eval_samples_per_second": 8.387,
+ "eval_steps_per_second": 8.387,
+ "train_loss": 1.4582410645375055,
+ "train_runtime": 606.5736,
  "train_samples": 217,
- "train_samples_per_second": 110.155,
- "train_steps_per_second": 110.155
+ "train_samples_per_second": 1.789,
+ "train_steps_per_second": 1.789
  }
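The new all_results.json figures are internally consistent with the rest of the commit: 217 training samples over 5 epochs at a per-device batch size of 1 (see trainer_state.json below) gives 1085 optimizer steps, and dividing the processed samples by the runtimes reproduces the reported throughput. A small check, assuming the fields carry the Hugging Face Trainer's usual meanings:

```python
# Sketch: re-derive the throughput figures in all_results.json from values
# that appear elsewhere in this commit.
train_samples = 217
num_epochs = 5.0
train_batch_size = 1        # from trainer_state.json; implies no gradient accumulation
train_runtime = 606.5736    # seconds
eval_samples = 23
eval_runtime = 2.7423       # seconds

steps = int(train_samples * num_epochs / train_batch_size)
print(steps)                                                  # 1085, matches global_step
print(round(train_samples * num_epochs / train_runtime, 3))   # 1.789 samples/s
print(round(eval_samples / eval_runtime, 3))                  # 8.387 samples/s
```

Note that `train_steps_per_second` equals `train_samples_per_second` here only because the batch size is 1.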
eval_results.json CHANGED
@@ -1,8 +1,8 @@
  {
- "epoch": 0.56,
- "eval_loss": 2.123528480529785,
- "eval_runtime": 4.2701,
+ "epoch": 5.0,
+ "eval_loss": 2.2082066535949707,
+ "eval_runtime": 2.7423,
  "eval_samples": 23,
- "eval_samples_per_second": 5.386,
- "eval_steps_per_second": 5.386
+ "eval_samples_per_second": 8.387,
+ "eval_steps_per_second": 8.387
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
  {
- "epoch": 0.56,
- "train_loss": 0.0,
- "train_runtime": 9.8498,
+ "epoch": 5.0,
+ "train_loss": 1.4582410645375055,
+ "train_runtime": 606.5736,
  "train_samples": 217,
- "train_samples_per_second": 110.155,
- "train_steps_per_second": 110.155
+ "train_samples_per_second": 1.789,
+ "train_steps_per_second": 1.789
  }
trainer_state.json CHANGED
@@ -1,411 +1,229 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.5610724925521351,
+ "epoch": 5.0,
  "eval_steps": 500,
- "global_step": 2260,
+ "global_step": 1085,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
  {
- "epoch": 0.05,
- "learning_rate": 5e-06,
- "loss": 2.4558,
+ "epoch": 0.0,
+ "learning_rate": 4.587155963302753e-07,
+ "loss": 2.9047,
  "step": 1
  },
  {
- "epoch": 1.0,
- "eval_loss": 2.1018261909484863,
- "eval_runtime": 0.6129,
- "eval_samples_per_second": 3.263,
- "eval_steps_per_second": 3.263,
- "step": 19
- },
- {
- "epoch": 2.0,
- "eval_loss": 1.9877368211746216,
- "eval_runtime": 0.6133,
- "eval_samples_per_second": 3.261,
- "eval_steps_per_second": 3.261,
- "step": 38
- },
- {
- "epoch": 2.11,
- "learning_rate": 3.6143458894413465e-05,
- "loss": 2.0718,
+ "epoch": 0.18,
+ "learning_rate": 1.834862385321101e-05,
+ "loss": 2.6449,
  "step": 40
  },
  {
- "epoch": 3.0,
- "eval_loss": 1.9598064422607422,
- "eval_runtime": 0.6117,
- "eval_samples_per_second": 3.269,
- "eval_steps_per_second": 3.269,
- "step": 57
- },
- {
- "epoch": 4.0,
- "eval_loss": 1.9522112607955933,
- "eval_runtime": 0.6123,
- "eval_samples_per_second": 3.267,
- "eval_steps_per_second": 3.267,
- "step": 76
- },
- {
- "epoch": 4.21,
- "learning_rate": 3.7445716067596503e-06,
- "loss": 1.826,
+ "epoch": 0.37,
+ "learning_rate": 3.669724770642202e-05,
+ "loss": 2.2297,
  "step": 80
  },
  {
- "epoch": 0.12,
- "learning_rate": 1.1904761904761905e-05,
- "loss": 3.0723,
+ "epoch": 0.55,
+ "learning_rate": 4.998433068104634e-05,
+ "loss": 2.0343,
  "step": 120
  },
  {
- "epoch": 0.16,
- "learning_rate": 1.5873015873015872e-05,
- "loss": 2.9971,
+ "epoch": 0.74,
+ "learning_rate": 4.9663895022434335e-05,
+ "loss": 1.9288,
  "step": 160
  },
  {
- "epoch": 0.2,
- "learning_rate": 1.984126984126984e-05,
- "loss": 2.8439,
+ "epoch": 0.92,
+ "learning_rate": 4.893515717147499e-05,
+ "loss": 1.948,
  "step": 200
  },
  {
- "epoch": 0.24,
- "learning_rate": 2.380952380952381e-05,
- "loss": 2.8909,
+ "epoch": 1.0,
+ "eval_loss": 1.9487614631652832,
+ "eval_runtime": 2.7263,
+ "eval_samples_per_second": 8.436,
+ "eval_steps_per_second": 8.436,
+ "step": 217
+ },
+ {
+ "epoch": 1.11,
+ "learning_rate": 4.7810181129491795e-05,
+ "loss": 1.9046,
  "step": 240
  },
  {
- "epoch": 0.28,
- "learning_rate": 2.777777777777778e-05,
- "loss": 2.7911,
+ "epoch": 1.29,
+ "learning_rate": 4.630759048306189e-05,
+ "loss": 1.8033,
  "step": 280
  },
  {
- "epoch": 0.32,
- "learning_rate": 3.1746031746031745e-05,
- "loss": 2.9226,
+ "epoch": 1.47,
+ "learning_rate": 4.4452260097026376e-05,
+ "loss": 1.7401,
  "step": 320
  },
  {
- "epoch": 0.36,
- "learning_rate": 3.571428571428572e-05,
- "loss": 2.686,
+ "epoch": 1.66,
+ "learning_rate": 4.227490431976606e-05,
+ "loss": 1.8471,
  "step": 360
  },
  {
- "epoch": 0.4,
- "learning_rate": 3.968253968253968e-05,
- "loss": 2.7288,
+ "epoch": 1.84,
+ "learning_rate": 3.981156851786102e-05,
+ "loss": 1.7781,
  "step": 400
  },
  {
- "epoch": 0.44,
- "learning_rate": 4.3650793650793655e-05,
- "loss": 2.8761,
+ "epoch": 2.0,
+ "eval_loss": 1.9392595291137695,
+ "eval_runtime": 2.7235,
+ "eval_samples_per_second": 8.445,
+ "eval_steps_per_second": 8.445,
+ "step": 434
+ },
+ {
+ "epoch": 2.03,
+ "learning_rate": 3.710303235760038e-05,
+ "loss": 1.6503,
  "step": 440
  },
  {
- "epoch": 0.48,
- "learning_rate": 4.761904761904762e-05,
- "loss": 2.7233,
+ "epoch": 2.21,
+ "learning_rate": 3.41941347118094e-05,
+ "loss": 1.5325,
  "step": 480
  },
  {
- "epoch": 0.52,
- "learning_rate": 4.999846164295305e-05,
- "loss": 2.7677,
+ "epoch": 2.4,
+ "learning_rate": 3.113303136792597e-05,
+ "loss": 1.4801,
  "step": 520
  },
  {
- "epoch": 0.56,
- "learning_rate": 4.998115730036208e-05,
- "loss": 2.8222,
+ "epoch": 2.58,
+ "learning_rate": 2.7970397825710876e-05,
+ "loss": 1.5216,
  "step": 560
  },
  {
- "epoch": 0.6,
- "learning_rate": 4.994463902265648e-05,
- "loss": 2.741,
+ "epoch": 2.76,
+ "learning_rate": 2.4758590381998137e-05,
+ "loss": 1.426,
  "step": 600
  },
  {
- "epoch": 0.64,
- "learning_rate": 4.9888934897398796e-05,
- "loss": 2.7184,
+ "epoch": 2.95,
+ "learning_rate": 2.1550779390435147e-05,
+ "loss": 1.4563,
  "step": 640
  },
  {
- "epoch": 0.68,
- "learning_rate": 4.981408776869891e-05,
- "loss": 2.7622,
+ "epoch": 3.0,
+ "eval_loss": 2.018707036972046,
+ "eval_runtime": 2.7309,
+ "eval_samples_per_second": 8.422,
+ "eval_steps_per_second": 8.422,
+ "step": 651
+ },
+ {
+ "epoch": 3.13,
+ "learning_rate": 1.840006904479584e-05,
+ "loss": 1.2741,
  "step": 680
  },
  {
- "epoch": 0.71,
- "learning_rate": 4.972015520426105e-05,
- "loss": 2.7802,
+ "epoch": 3.32,
+ "learning_rate": 1.5358618257547464e-05,
+ "loss": 1.1991,
  "step": 720
  },
  {
- "epoch": 0.75,
- "learning_rate": 4.960720945110629e-05,
- "loss": 2.8312,
+ "epoch": 3.5,
+ "learning_rate": 1.2476777187220119e-05,
+ "loss": 1.2377,
  "step": 760
  },
  {
- "epoch": 0.79,
- "learning_rate": 4.9475337380004715e-05,
- "loss": 2.6631,
+ "epoch": 3.69,
+ "learning_rate": 9.802253709067949e-06,
+ "loss": 1.2592,
  "step": 800
  },
  {
- "epoch": 0.83,
- "learning_rate": 4.932464041865992e-05,
- "loss": 2.7406,
+ "epoch": 3.87,
+ "learning_rate": 7.3793236278095755e-06,
+ "loss": 1.3206,
  "step": 840
  },
  {
- "epoch": 2.19,
- "learning_rate": 3.454201062050763e-05,
- "loss": 2.0682,
+ "epoch": 4.0,
+ "eval_loss": 2.17669677734375,
+ "eval_runtime": 2.7315,
+ "eval_samples_per_second": 8.42,
+ "eval_steps_per_second": 8.42,
+ "step": 868
+ },
+ {
+ "epoch": 4.06,
+ "learning_rate": 5.248097707101035e-06,
+ "loss": 1.1614,
  "step": 880
  },
  {
- "epoch": 2.29,
- "learning_rate": 3.291511591195636e-05,
- "loss": 2.0316,
+ "epoch": 4.24,
+ "learning_rate": 3.443857649812915e-06,
+ "loss": 1.1929,
  "step": 920
  },
  {
- "epoch": 2.39,
- "learning_rate": 3.125004214391146e-05,
- "loss": 2.0728,
+ "epoch": 4.42,
+ "learning_rate": 1.9964720217269558e-06,
+ "loss": 1.0971,
  "step": 960
  },
  {
- "epoch": 2.49,
- "learning_rate": 2.955482090443287e-05,
- "loss": 2.025,
+ "epoch": 4.61,
+ "learning_rate": 9.299017878319383e-07,
+ "loss": 1.091,
  "step": 1000
  },
  {
- "epoch": 2.59,
- "learning_rate": 2.7837629199805354e-05,
- "loss": 2.0502,
+ "epoch": 4.79,
+ "learning_rate": 2.6180364689323554e-07,
+ "loss": 1.1938,
  "step": 1040
  },
- {
- "epoch": 2.69,
- "learning_rate": 2.610675001223441e-05,
- "loss": 2.0532,
- "step": 1080
- },
- {
- "epoch": 2.79,
- "learning_rate": 2.4370532346360474e-05,
- "loss": 2.1131,
- "step": 1120
- },
- {
- "epoch": 2.89,
- "learning_rate": 2.2637350957309882e-05,
- "loss": 2.0662,
- "step": 1160
- },
- {
- "epoch": 2.99,
- "learning_rate": 2.0915565954536744e-05,
- "loss": 1.9638,
- "step": 1200
- },
- {
- "epoch": 3.0,
- "eval_loss": 2.4228010177612305,
- "eval_runtime": 5.1555,
- "eval_samples_per_second": 9.116,
- "eval_steps_per_second": 9.116,
- "step": 1206
- },
- {
- "epoch": 3.08,
- "learning_rate": 1.9213482476309065e-05,
- "loss": 1.8999,
- "step": 1240
- },
- {
- "epoch": 3.18,
- "learning_rate": 1.7539310629351915e-05,
- "loss": 1.8828,
- "step": 1280
- },
- {
- "epoch": 3.28,
- "learning_rate": 1.5901125886881146e-05,
- "loss": 1.7305,
- "step": 1320
- },
- {
- "epoch": 3.38,
- "learning_rate": 1.430683013605043e-05,
- "loss": 1.8129,
- "step": 1360
- },
- {
- "epoch": 3.48,
- "learning_rate": 1.276411356270143e-05,
- "loss": 1.9584,
- "step": 1400
- },
- {
- "epoch": 3.58,
- "learning_rate": 1.1280417557268735e-05,
- "loss": 1.8074,
- "step": 1440
- },
- {
- "epoch": 3.68,
- "learning_rate": 9.862898820764927e-06,
- "loss": 1.7996,
- "step": 1480
- },
- {
- "epoch": 3.78,
- "learning_rate": 8.518394843983093e-06,
- "loss": 1.8291,
- "step": 1520
- },
- {
- "epoch": 3.88,
- "learning_rate": 7.253390926429918e-06,
- "loss": 1.8193,
- "step": 1560
- },
- {
- "epoch": 3.98,
- "learning_rate": 6.073988894075491e-06,
- "loss": 1.8176,
- "step": 1600
- },
- {
- "epoch": 4.0,
- "eval_loss": 2.489954948425293,
- "eval_runtime": 5.1439,
- "eval_samples_per_second": 9.137,
- "eval_steps_per_second": 9.137,
- "step": 1608
- },
- {
- "epoch": 4.08,
- "learning_rate": 4.985877666811953e-06,
- "loss": 1.7447,
- "step": 1640
- },
- {
- "epoch": 4.18,
- "learning_rate": 3.994305817590549e-06,
- "loss": 1.8169,
- "step": 1680
- },
- {
- "epoch": 4.28,
- "learning_rate": 3.1040562555998216e-06,
- "loss": 1.7304,
- "step": 1720
- },
- {
- "epoch": 4.38,
- "learning_rate": 2.3194231556022544e-06,
- "loss": 1.6802,
- "step": 1760
- },
- {
- "epoch": 4.48,
- "learning_rate": 1.644191244712251e-06,
- "loss": 1.762,
- "step": 1800
- },
- {
- "epoch": 4.58,
- "learning_rate": 1.0816175465267586e-06,
- "loss": 1.6769,
- "step": 1840
- },
- {
- "epoch": 4.68,
- "learning_rate": 6.344156706670989e-07,
- "loss": 1.6643,
- "step": 1880
- },
- {
- "epoch": 4.78,
- "learning_rate": 3.047427235122663e-07,
- "loss": 1.7539,
- "step": 1920
- },
- {
- "epoch": 4.88,
- "learning_rate": 9.418890326059748e-08,
- "loss": 1.7693,
- "step": 1960
- },
  {
  "epoch": 4.98,
- "learning_rate": 3.769829508754041e-09,
- "loss": 1.8315,
- "step": 2000
- },
- {
- "epoch": 0.51,
- "learning_rate": 4.999974616484595e-05,
- "loss": 3.4729,
- "step": 2040
- },
- {
- "epoch": 0.52,
- "learning_rate": 4.99983643554066e-05,
- "loss": 3.8916,
- "step": 2080
- },
- {
- "epoch": 0.53,
- "learning_rate": 4.999578104083307e-05,
- "loss": 3.7239,
- "step": 2120
- },
- {
- "epoch": 0.54,
- "learning_rate": 4.9991996345288116e-05,
- "loss": 3.8084,
- "step": 2160
- },
- {
- "epoch": 0.55,
- "learning_rate": 4.9987010450676885e-05,
- "loss": 3.2992,
- "step": 2200
+ "learning_rate": 3.237730954069873e-09,
+ "loss": 1.1018,
+ "step": 1080
  },
  {
- "epoch": 0.56,
- "learning_rate": 4.998082359663817e-05,
- "loss": 3.809,
- "step": 2240
+ "epoch": 5.0,
+ "eval_loss": 2.2082066535949707,
+ "eval_runtime": 2.7308,
+ "eval_samples_per_second": 8.422,
+ "eval_steps_per_second": 8.422,
+ "step": 1085
  },
  {
- "epoch": 0.56,
- "step": 2260,
- "total_flos": 1.095290092486656e+16,
- "train_loss": 0.0,
- "train_runtime": 9.8498,
- "train_samples_per_second": 110.155,
- "train_steps_per_second": 110.155
+ "epoch": 5.0,
+ "step": 1085,
+ "total_flos": 6971428308910080.0,
+ "train_loss": 1.4582410645375055,
+ "train_runtime": 606.5736,
+ "train_samples_per_second": 1.789,
+ "train_steps_per_second": 1.789
  }
  ],
  "logging_steps": 40,
@@ -413,7 +231,7 @@
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 20,
- "total_flos": 1.095290092486656e+16,
+ "total_flos": 6971428308910080.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null