emendes3 commited on
Commit
65743fc
1 Parent(s): d9443ad

Model save

Browse files
README.md CHANGED
@@ -1,27 +1,19 @@
1
  ---
2
  library_name: peft
3
  tags:
4
- - liuhaotian/llava-v1.5-13b_10.0
5
  - generated_from_trainer
6
  base_model: liuhaotian/llava-v1.5-13b
7
  model-index:
8
- - name: liuhaotian/llava-v1.5-13b_10.0
9
  results: []
10
  ---
11
 
12
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
  should probably proofread and complete it, then remove this comment. -->
14
 
15
- # liuhaotian/llava-v1.5-13b_10.0
16
 
17
- This model is a fine-tuned version of [liuhaotian/llava-v1.5-13b_10.0](https://huggingface.co/liuhaotian/llava-v1.5-13b_10.0) on an unknown dataset.
18
- It achieves the following results on the evaluation set:
19
- - eval_loss: 0.0004
20
- - eval_runtime: 27.6221
21
- - eval_samples_per_second: 15.386
22
- - eval_steps_per_second: 0.507
23
- - epoch: 9.0
24
- - step: 126
25
 
26
  ## Model description
27
 
@@ -45,13 +37,13 @@ The following hyperparameters were used during training:
45
  - eval_batch_size: 4
46
  - seed: 42
47
  - distributed_type: multi-GPU
48
- - num_devices: 8
49
- - total_train_batch_size: 32
50
- - total_eval_batch_size: 32
51
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
52
  - lr_scheduler_type: cosine
53
  - lr_scheduler_warmup_ratio: 0.03
54
- - num_epochs: 10.0
55
 
56
  ### Framework versions
57
 
 
1
  ---
2
  library_name: peft
3
  tags:
 
4
  - generated_from_trainer
5
  base_model: liuhaotian/llava-v1.5-13b
6
  model-index:
7
+ - name: llava_13b_exact_location_name_synthetic
8
  results: []
9
  ---
10
 
11
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
12
  should probably proofread and complete it, then remove this comment. -->
13
 
14
+ # llava_13b_exact_location_name_synthetic
15
 
16
+ This model is a fine-tuned version of [liuhaotian/llava-v1.5-13b](https://huggingface.co/liuhaotian/llava-v1.5-13b) on an unknown dataset.
 
 
 
 
 
 
 
17
 
18
  ## Model description
19
 
 
37
  - eval_batch_size: 4
38
  - seed: 42
39
  - distributed_type: multi-GPU
40
+ - num_devices: 4
41
+ - total_train_batch_size: 16
42
+ - total_eval_batch_size: 16
43
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
44
  - lr_scheduler_type: cosine
45
  - lr_scheduler_warmup_ratio: 0.03
46
+ - num_epochs: 20.0
47
 
48
  ### Framework versions
49
 
adapter_config.json CHANGED
@@ -20,13 +20,13 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
23
  "k_proj",
 
24
  "v_proj",
25
- "q_proj",
26
  "down_proj",
27
- "up_proj",
28
  "o_proj",
29
- "gate_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "gate_proj",
24
  "k_proj",
25
+ "up_proj",
26
  "v_proj",
 
27
  "down_proj",
 
28
  "o_proj",
29
+ "q_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c7ab07a1e82d1268bbb41d53be6600adccc5fed1fc714f9f2835865a092e298d
3
  size 1001466944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad3e154e3e8fb9c55fc0612cde2e1743fc1b2ee777c5ec1c8d7b454f88057371
3
  size 1001466944
num_examples=100/llava-v1.5-13b_1.0/adapter_config.json CHANGED
@@ -20,13 +20,13 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
23
  "k_proj",
 
24
  "v_proj",
25
- "q_proj",
26
  "down_proj",
27
- "up_proj",
28
  "o_proj",
29
- "gate_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "gate_proj",
24
  "k_proj",
25
+ "up_proj",
26
  "v_proj",
 
27
  "down_proj",
 
28
  "o_proj",
29
+ "q_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
num_examples=100/llava-v1.5-13b_1.0/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8bb3a80b637c433e0ea348fa437556e993ed780f00affdec993142f675c2af7d
3
  size 1001466944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad3e154e3e8fb9c55fc0612cde2e1743fc1b2ee777c5ec1c8d7b454f88057371
3
  size 1001466944
num_examples=100/llava-v1.5-13b_1.0/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ab60061c6db4983ad6a9334f47864cfc12b2212c4796c83a0d247a55439133a6
3
  size 6840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa90b394534132b7b403551766766cdfe27216265cc6d0e380d159dcb58549e3
3
  size 6840
trainer_state.json CHANGED
@@ -3,1787 +3,947 @@
3
  "best_model_checkpoint": null,
4
  "epoch": 10.0,
5
  "eval_steps": 500,
6
- "global_step": 280,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.04,
13
  "learning_rate": 0.0,
14
- "loss": 1.4894,
15
  "step": 1
16
  },
17
  {
18
- "epoch": 0.07,
19
- "learning_rate": 6.309297535714573e-05,
20
- "loss": 1.2017,
21
  "step": 2
22
  },
23
  {
24
- "epoch": 0.11,
25
- "learning_rate": 0.0001,
26
- "loss": 1.1717,
27
  "step": 3
28
  },
29
  {
30
- "epoch": 0.14,
31
- "learning_rate": 0.00012618595071429146,
32
- "loss": 1.1308,
33
  "step": 4
34
  },
35
  {
36
- "epoch": 0.18,
37
- "learning_rate": 0.0001464973520717927,
38
- "loss": 1.0773,
39
  "step": 5
40
  },
41
  {
42
- "epoch": 0.21,
43
- "learning_rate": 0.00016309297535714573,
44
- "loss": 1.0267,
45
  "step": 6
46
  },
47
  {
48
- "epoch": 0.25,
49
- "learning_rate": 0.00017712437491614223,
50
- "loss": 0.9945,
51
  "step": 7
52
  },
53
  {
54
- "epoch": 0.29,
55
- "learning_rate": 0.0001892789260714372,
56
- "loss": 0.9553,
57
  "step": 8
58
  },
59
  {
60
- "epoch": 0.32,
61
  "learning_rate": 0.0002,
62
- "loss": 0.9318,
63
  "step": 9
64
  },
65
  {
66
- "epoch": 0.36,
67
  "learning_rate": 0.0002,
68
- "loss": 0.9507,
69
  "step": 10
70
  },
71
  {
72
- "epoch": 0.39,
73
  "learning_rate": 0.0002,
74
- "loss": 0.8816,
75
  "step": 11
76
  },
77
  {
78
- "epoch": 0.43,
79
  "learning_rate": 0.0002,
80
- "loss": 0.8428,
81
  "step": 12
82
  },
83
  {
84
- "epoch": 0.46,
85
  "learning_rate": 0.0002,
86
- "loss": 0.8199,
87
  "step": 13
88
  },
89
  {
90
- "epoch": 0.5,
91
  "learning_rate": 0.0002,
92
- "loss": 0.78,
 
 
 
 
 
 
 
 
93
  "step": 14
94
  },
95
  {
96
- "epoch": 0.54,
97
  "learning_rate": 0.0002,
98
- "loss": 0.7722,
99
  "step": 15
100
  },
101
  {
102
- "epoch": 0.57,
103
  "learning_rate": 0.0002,
104
- "loss": 0.7317,
105
  "step": 16
106
  },
107
  {
108
- "epoch": 0.61,
109
  "learning_rate": 0.0002,
110
- "loss": 0.6472,
111
  "step": 17
112
  },
113
  {
114
- "epoch": 0.64,
115
  "learning_rate": 0.0002,
116
- "loss": 0.7027,
117
  "step": 18
118
  },
119
  {
120
- "epoch": 0.68,
121
  "learning_rate": 0.0002,
122
- "loss": 0.6111,
123
  "step": 19
124
  },
125
  {
126
- "epoch": 0.71,
127
  "learning_rate": 0.0002,
128
- "loss": 0.6577,
129
  "step": 20
130
  },
131
  {
132
- "epoch": 0.75,
133
  "learning_rate": 0.0002,
134
- "loss": 0.5182,
135
  "step": 21
136
  },
137
  {
138
- "epoch": 0.79,
139
  "learning_rate": 0.0002,
140
- "loss": 0.4826,
141
  "step": 22
142
  },
143
  {
144
- "epoch": 0.82,
145
  "learning_rate": 0.0002,
146
- "loss": 0.4477,
147
  "step": 23
148
  },
149
  {
150
- "epoch": 0.86,
151
  "learning_rate": 0.0002,
152
- "loss": 0.3735,
153
  "step": 24
154
  },
155
  {
156
- "epoch": 0.89,
157
  "learning_rate": 0.0002,
158
- "loss": 0.4257,
159
  "step": 25
160
  },
161
  {
162
- "epoch": 0.93,
163
  "learning_rate": 0.0002,
164
- "loss": 0.3523,
165
  "step": 26
166
  },
167
  {
168
- "epoch": 0.96,
169
  "learning_rate": 0.0002,
170
- "loss": 0.3972,
171
  "step": 27
172
  },
173
  {
174
- "epoch": 1.0,
175
  "learning_rate": 0.0002,
176
- "loss": 0.3234,
177
  "step": 28
178
  },
179
  {
180
- "epoch": 1.0,
181
- "eval_loss": 0.2477385699748993,
182
- "eval_runtime": 54.9686,
183
- "eval_samples_per_second": 15.773,
184
  "eval_steps_per_second": 0.509,
185
  "step": 28
186
  },
187
  {
188
- "epoch": 1.04,
189
  "learning_rate": 0.0002,
190
- "loss": 0.2312,
191
  "step": 29
192
  },
193
  {
194
- "epoch": 1.07,
195
  "learning_rate": 0.0002,
196
- "loss": 0.2441,
197
  "step": 30
198
  },
199
  {
200
- "epoch": 1.11,
201
  "learning_rate": 0.0002,
202
- "loss": 0.2217,
203
  "step": 31
204
  },
205
  {
206
- "epoch": 1.14,
207
  "learning_rate": 0.0002,
208
- "loss": 0.1936,
209
  "step": 32
210
  },
211
  {
212
- "epoch": 1.18,
213
  "learning_rate": 0.0002,
214
- "loss": 0.2099,
215
  "step": 33
216
  },
217
  {
218
- "epoch": 1.21,
219
  "learning_rate": 0.0002,
220
- "loss": 0.1898,
221
  "step": 34
222
  },
223
  {
224
- "epoch": 1.25,
225
  "learning_rate": 0.0002,
226
- "loss": 0.2296,
227
  "step": 35
228
  },
229
  {
230
- "epoch": 1.29,
231
  "learning_rate": 0.0002,
232
- "loss": 0.1523,
233
  "step": 36
234
  },
235
  {
236
- "epoch": 1.32,
237
  "learning_rate": 0.0002,
238
- "loss": 0.151,
239
  "step": 37
240
  },
241
  {
242
- "epoch": 1.36,
243
  "learning_rate": 0.0002,
244
- "loss": 0.0952,
245
  "step": 38
246
  },
247
  {
248
- "epoch": 1.39,
249
  "learning_rate": 0.0002,
250
- "loss": 0.1017,
251
  "step": 39
252
  },
253
  {
254
- "epoch": 1.43,
255
  "learning_rate": 0.0002,
256
- "loss": 0.1736,
257
  "step": 40
258
  },
259
  {
260
- "epoch": 1.46,
261
  "learning_rate": 0.0002,
262
- "loss": 0.1143,
263
  "step": 41
264
  },
265
  {
266
- "epoch": 1.5,
267
  "learning_rate": 0.0002,
268
- "loss": 0.1064,
 
 
 
 
 
 
 
 
269
  "step": 42
270
  },
271
  {
272
- "epoch": 1.54,
273
  "learning_rate": 0.0002,
274
- "loss": 0.1315,
275
  "step": 43
276
  },
277
  {
278
- "epoch": 1.57,
279
  "learning_rate": 0.0002,
280
- "loss": 0.1297,
281
  "step": 44
282
  },
283
  {
284
- "epoch": 1.61,
285
  "learning_rate": 0.0002,
286
- "loss": 0.0686,
287
  "step": 45
288
  },
289
  {
290
- "epoch": 1.64,
291
  "learning_rate": 0.0002,
292
- "loss": 0.0729,
293
  "step": 46
294
  },
295
  {
296
- "epoch": 1.68,
297
  "learning_rate": 0.0002,
298
- "loss": 0.0909,
299
  "step": 47
300
  },
301
  {
302
- "epoch": 1.71,
303
  "learning_rate": 0.0002,
304
- "loss": 0.1128,
305
  "step": 48
306
  },
307
  {
308
- "epoch": 1.75,
309
  "learning_rate": 0.0002,
310
- "loss": 0.0989,
311
  "step": 49
312
  },
313
  {
314
- "epoch": 1.79,
315
  "learning_rate": 0.0002,
316
- "loss": 0.0551,
317
  "step": 50
318
  },
319
  {
320
- "epoch": 1.82,
321
  "learning_rate": 0.0002,
322
- "loss": 0.1288,
323
  "step": 51
324
  },
325
  {
326
- "epoch": 1.86,
327
  "learning_rate": 0.0002,
328
- "loss": 0.0469,
329
  "step": 52
330
  },
331
  {
332
- "epoch": 1.89,
333
  "learning_rate": 0.0002,
334
- "loss": 0.1091,
335
  "step": 53
336
  },
337
  {
338
- "epoch": 1.93,
339
  "learning_rate": 0.0002,
340
- "loss": 0.1593,
341
  "step": 54
342
  },
343
  {
344
- "epoch": 1.96,
345
  "learning_rate": 0.0002,
346
- "loss": 0.1075,
347
  "step": 55
348
  },
349
  {
350
- "epoch": 2.0,
351
  "learning_rate": 0.0002,
352
- "loss": 0.0391,
353
  "step": 56
354
  },
355
  {
356
- "epoch": 2.0,
357
- "eval_loss": 0.04489068686962128,
358
- "eval_runtime": 55.0922,
359
- "eval_samples_per_second": 15.737,
360
- "eval_steps_per_second": 0.508,
361
  "step": 56
362
  },
363
  {
364
- "epoch": 2.04,
365
  "learning_rate": 0.0002,
366
- "loss": 0.0387,
367
  "step": 57
368
  },
369
  {
370
- "epoch": 2.07,
371
  "learning_rate": 0.0002,
372
- "loss": 0.0306,
373
  "step": 58
374
  },
375
  {
376
- "epoch": 2.11,
377
  "learning_rate": 0.0002,
378
- "loss": 0.0307,
379
  "step": 59
380
  },
381
  {
382
- "epoch": 2.14,
383
  "learning_rate": 0.0002,
384
- "loss": 0.0451,
385
  "step": 60
386
  },
387
  {
388
- "epoch": 2.18,
389
  "learning_rate": 0.0002,
390
- "loss": 0.0245,
391
  "step": 61
392
  },
393
  {
394
- "epoch": 2.21,
395
  "learning_rate": 0.0002,
396
- "loss": 0.0351,
397
  "step": 62
398
  },
399
  {
400
- "epoch": 2.25,
401
  "learning_rate": 0.0002,
402
- "loss": 0.0248,
403
  "step": 63
404
  },
405
  {
406
- "epoch": 2.29,
407
  "learning_rate": 0.0002,
408
- "loss": 0.0244,
409
  "step": 64
410
  },
411
  {
412
- "epoch": 2.32,
413
  "learning_rate": 0.0002,
414
- "loss": 0.0309,
415
  "step": 65
416
  },
417
  {
418
- "epoch": 2.36,
419
  "learning_rate": 0.0002,
420
- "loss": 0.0272,
421
  "step": 66
422
  },
423
  {
424
- "epoch": 2.39,
425
  "learning_rate": 0.0002,
426
- "loss": 0.0136,
427
  "step": 67
428
  },
429
  {
430
- "epoch": 2.43,
431
  "learning_rate": 0.0002,
432
- "loss": 0.0297,
433
  "step": 68
434
  },
435
  {
436
- "epoch": 2.46,
437
  "learning_rate": 0.0002,
438
- "loss": 0.0356,
439
  "step": 69
440
  },
441
  {
442
- "epoch": 2.5,
443
  "learning_rate": 0.0002,
444
- "loss": 0.0284,
 
 
 
 
 
 
 
 
445
  "step": 70
446
  },
447
  {
448
- "epoch": 2.54,
449
  "learning_rate": 0.0002,
450
- "loss": 0.0189,
451
  "step": 71
452
  },
453
  {
454
- "epoch": 2.57,
455
  "learning_rate": 0.0002,
456
- "loss": 0.0199,
457
  "step": 72
458
  },
459
  {
460
- "epoch": 2.61,
461
  "learning_rate": 0.0002,
462
- "loss": 0.0121,
463
  "step": 73
464
  },
465
  {
466
- "epoch": 2.64,
467
  "learning_rate": 0.0002,
468
- "loss": 0.0389,
469
  "step": 74
470
  },
471
  {
472
- "epoch": 2.68,
473
  "learning_rate": 0.0002,
474
- "loss": 0.0298,
475
  "step": 75
476
  },
477
  {
478
- "epoch": 2.71,
479
  "learning_rate": 0.0002,
480
- "loss": 0.0176,
481
  "step": 76
482
  },
483
  {
484
- "epoch": 2.75,
485
  "learning_rate": 0.0002,
486
- "loss": 0.0165,
487
  "step": 77
488
  },
489
  {
490
- "epoch": 2.79,
491
  "learning_rate": 0.0002,
492
- "loss": 0.0255,
493
  "step": 78
494
  },
495
  {
496
- "epoch": 2.82,
497
  "learning_rate": 0.0002,
498
- "loss": 0.0282,
499
  "step": 79
500
  },
501
  {
502
- "epoch": 2.86,
503
  "learning_rate": 0.0002,
504
- "loss": 0.0247,
505
  "step": 80
506
  },
507
  {
508
- "epoch": 2.89,
509
  "learning_rate": 0.0002,
510
- "loss": 0.021,
511
  "step": 81
512
  },
513
  {
514
- "epoch": 2.93,
515
  "learning_rate": 0.0002,
516
- "loss": 0.024,
517
  "step": 82
518
  },
519
  {
520
- "epoch": 2.96,
521
  "learning_rate": 0.0002,
522
- "loss": 0.0321,
523
  "step": 83
524
  },
525
  {
526
- "epoch": 3.0,
527
  "learning_rate": 0.0002,
528
- "loss": 0.0123,
529
  "step": 84
530
  },
531
  {
532
- "epoch": 3.0,
533
- "eval_loss": 0.012368076480925083,
534
- "eval_runtime": 55.1635,
535
- "eval_samples_per_second": 15.717,
536
- "eval_steps_per_second": 0.508,
537
  "step": 84
538
  },
539
  {
540
- "epoch": 3.04,
541
  "learning_rate": 0.0002,
542
- "loss": 0.0057,
543
  "step": 85
544
  },
545
  {
546
- "epoch": 3.07,
547
  "learning_rate": 0.0002,
548
- "loss": 0.0139,
549
  "step": 86
550
  },
551
  {
552
- "epoch": 3.11,
553
  "learning_rate": 0.0002,
554
- "loss": 0.0095,
555
  "step": 87
556
  },
557
  {
558
- "epoch": 3.14,
559
  "learning_rate": 0.0002,
560
- "loss": 0.009,
561
  "step": 88
562
  },
563
  {
564
- "epoch": 3.18,
565
  "learning_rate": 0.0002,
566
- "loss": 0.0123,
567
  "step": 89
568
  },
569
  {
570
- "epoch": 3.21,
571
  "learning_rate": 0.0002,
572
- "loss": 0.0109,
573
  "step": 90
574
  },
575
  {
576
- "epoch": 3.25,
577
  "learning_rate": 0.0002,
578
- "loss": 0.0098,
579
  "step": 91
580
  },
581
  {
582
- "epoch": 3.29,
583
  "learning_rate": 0.0002,
584
- "loss": 0.0063,
585
  "step": 92
586
  },
587
  {
588
- "epoch": 3.32,
589
  "learning_rate": 0.0002,
590
- "loss": 0.0105,
591
  "step": 93
592
  },
593
  {
594
- "epoch": 3.36,
595
  "learning_rate": 0.0002,
596
- "loss": 0.0062,
597
  "step": 94
598
  },
599
  {
600
- "epoch": 3.39,
601
  "learning_rate": 0.0002,
602
- "loss": 0.0139,
603
  "step": 95
604
  },
605
  {
606
- "epoch": 3.43,
607
  "learning_rate": 0.0002,
608
- "loss": 0.0141,
609
  "step": 96
610
  },
611
  {
612
- "epoch": 3.46,
613
  "learning_rate": 0.0002,
614
- "loss": 0.011,
615
  "step": 97
616
  },
617
  {
618
- "epoch": 3.5,
619
  "learning_rate": 0.0002,
620
- "loss": 0.0106,
 
 
 
 
 
 
 
 
621
  "step": 98
622
  },
623
  {
624
- "epoch": 3.54,
625
  "learning_rate": 0.0002,
626
- "loss": 0.0131,
627
  "step": 99
628
  },
629
  {
630
- "epoch": 3.57,
631
  "learning_rate": 0.0002,
632
- "loss": 0.009,
633
  "step": 100
634
  },
635
  {
636
- "epoch": 3.61,
637
  "learning_rate": 0.0002,
638
- "loss": 0.0204,
639
  "step": 101
640
  },
641
  {
642
- "epoch": 3.64,
643
  "learning_rate": 0.0002,
644
- "loss": 0.0117,
645
  "step": 102
646
  },
647
  {
648
- "epoch": 3.68,
649
  "learning_rate": 0.0002,
650
- "loss": 0.0156,
651
  "step": 103
652
  },
653
  {
654
- "epoch": 3.71,
655
  "learning_rate": 0.0002,
656
- "loss": 0.0137,
657
  "step": 104
658
  },
659
  {
660
- "epoch": 3.75,
661
  "learning_rate": 0.0002,
662
- "loss": 0.0157,
663
  "step": 105
664
  },
665
  {
666
- "epoch": 3.79,
667
  "learning_rate": 0.0002,
668
- "loss": 0.0143,
669
  "step": 106
670
  },
671
  {
672
- "epoch": 3.82,
673
  "learning_rate": 0.0002,
674
- "loss": 0.007,
675
  "step": 107
676
  },
677
  {
678
- "epoch": 3.86,
679
  "learning_rate": 0.0002,
680
- "loss": 0.0092,
681
  "step": 108
682
  },
683
  {
684
- "epoch": 3.89,
685
  "learning_rate": 0.0002,
686
- "loss": 0.0091,
687
  "step": 109
688
  },
689
  {
690
- "epoch": 3.93,
691
  "learning_rate": 0.0002,
692
- "loss": 0.0249,
693
  "step": 110
694
  },
695
  {
696
- "epoch": 3.96,
697
  "learning_rate": 0.0002,
698
- "loss": 0.0055,
699
  "step": 111
700
  },
701
- {
702
- "epoch": 4.0,
703
- "learning_rate": 0.0002,
704
- "loss": 0.0055,
705
- "step": 112
706
- },
707
- {
708
- "epoch": 4.0,
709
- "eval_loss": 0.006369821261614561,
710
- "eval_runtime": 55.1727,
711
- "eval_samples_per_second": 15.714,
712
- "eval_steps_per_second": 0.507,
713
- "step": 112
714
- },
715
- {
716
- "epoch": 4.04,
717
- "learning_rate": 0.0002,
718
- "loss": 0.0069,
719
- "step": 113
720
- },
721
- {
722
- "epoch": 4.07,
723
- "learning_rate": 0.0002,
724
- "loss": 0.0044,
725
- "step": 114
726
- },
727
- {
728
- "epoch": 4.11,
729
- "learning_rate": 0.0002,
730
- "loss": 0.0034,
731
- "step": 115
732
- },
733
- {
734
- "epoch": 4.14,
735
- "learning_rate": 0.0002,
736
- "loss": 0.004,
737
- "step": 116
738
- },
739
- {
740
- "epoch": 4.18,
741
- "learning_rate": 0.0002,
742
- "loss": 0.0049,
743
- "step": 117
744
- },
745
- {
746
- "epoch": 4.21,
747
- "learning_rate": 0.0002,
748
- "loss": 0.0048,
749
- "step": 118
750
- },
751
- {
752
- "epoch": 4.25,
753
- "learning_rate": 0.0002,
754
- "loss": 0.0122,
755
- "step": 119
756
- },
757
- {
758
- "epoch": 4.29,
759
- "learning_rate": 0.0002,
760
- "loss": 0.0052,
761
- "step": 120
762
- },
763
- {
764
- "epoch": 4.32,
765
- "learning_rate": 0.0002,
766
- "loss": 0.005,
767
- "step": 121
768
- },
769
- {
770
- "epoch": 4.36,
771
- "learning_rate": 0.0002,
772
- "loss": 0.006,
773
- "step": 122
774
- },
775
- {
776
- "epoch": 4.39,
777
- "learning_rate": 0.0002,
778
- "loss": 0.0059,
779
- "step": 123
780
- },
781
- {
782
- "epoch": 4.43,
783
- "learning_rate": 0.0002,
784
- "loss": 0.0027,
785
- "step": 124
786
- },
787
- {
788
- "epoch": 4.46,
789
- "learning_rate": 0.0002,
790
- "loss": 0.0025,
791
- "step": 125
792
- },
793
- {
794
- "epoch": 4.5,
795
- "learning_rate": 0.0002,
796
- "loss": 0.004,
797
- "step": 126
798
- },
799
- {
800
- "epoch": 4.54,
801
- "learning_rate": 0.0002,
802
- "loss": 0.0033,
803
- "step": 127
804
- },
805
- {
806
- "epoch": 4.57,
807
- "learning_rate": 0.0002,
808
- "loss": 0.0024,
809
- "step": 128
810
- },
811
- {
812
- "epoch": 4.61,
813
- "learning_rate": 0.0002,
814
- "loss": 0.0034,
815
- "step": 129
816
- },
817
- {
818
- "epoch": 4.64,
819
- "learning_rate": 0.0002,
820
- "loss": 0.0051,
821
- "step": 130
822
- },
823
- {
824
- "epoch": 4.68,
825
- "learning_rate": 0.0002,
826
- "loss": 0.0025,
827
- "step": 131
828
- },
829
- {
830
- "epoch": 4.71,
831
- "learning_rate": 0.0002,
832
- "loss": 0.0039,
833
- "step": 132
834
- },
835
- {
836
- "epoch": 4.75,
837
- "learning_rate": 0.0002,
838
- "loss": 0.005,
839
- "step": 133
840
- },
841
- {
842
- "epoch": 4.79,
843
- "learning_rate": 0.0002,
844
- "loss": 0.0013,
845
- "step": 134
846
- },
847
- {
848
- "epoch": 4.82,
849
- "learning_rate": 0.0002,
850
- "loss": 0.0041,
851
- "step": 135
852
- },
853
- {
854
- "epoch": 4.86,
855
- "learning_rate": 0.0002,
856
- "loss": 0.0045,
857
- "step": 136
858
- },
859
- {
860
- "epoch": 4.89,
861
- "learning_rate": 0.0002,
862
- "loss": 0.0037,
863
- "step": 137
864
- },
865
- {
866
- "epoch": 4.93,
867
- "learning_rate": 0.0002,
868
- "loss": 0.0045,
869
- "step": 138
870
- },
871
- {
872
- "epoch": 4.96,
873
- "learning_rate": 0.0002,
874
- "loss": 0.0029,
875
- "step": 139
876
- },
877
- {
878
- "epoch": 5.0,
879
- "learning_rate": 0.0002,
880
- "loss": 0.0023,
881
- "step": 140
882
- },
883
- {
884
- "epoch": 5.0,
885
- "eval_loss": 0.0037152974400669336,
886
- "eval_runtime": 55.1887,
887
- "eval_samples_per_second": 15.71,
888
- "eval_steps_per_second": 0.507,
889
- "step": 140
890
- },
891
- {
892
- "epoch": 5.04,
893
- "learning_rate": 0.0002,
894
- "loss": 0.001,
895
- "step": 141
896
- },
897
- {
898
- "epoch": 5.07,
899
- "learning_rate": 0.0002,
900
- "loss": 0.0036,
901
- "step": 142
902
- },
903
- {
904
- "epoch": 5.11,
905
- "learning_rate": 0.0002,
906
- "loss": 0.0031,
907
- "step": 143
908
- },
909
- {
910
- "epoch": 5.14,
911
- "learning_rate": 0.0002,
912
- "loss": 0.0085,
913
- "step": 144
914
- },
915
- {
916
- "epoch": 5.18,
917
- "learning_rate": 0.0002,
918
- "loss": 0.0053,
919
- "step": 145
920
- },
921
- {
922
- "epoch": 5.21,
923
- "learning_rate": 0.0002,
924
- "loss": 0.004,
925
- "step": 146
926
- },
927
- {
928
- "epoch": 5.25,
929
- "learning_rate": 0.0002,
930
- "loss": 0.0057,
931
- "step": 147
932
- },
933
- {
934
- "epoch": 5.29,
935
- "learning_rate": 0.0002,
936
- "loss": 0.0057,
937
- "step": 148
938
- },
939
- {
940
- "epoch": 5.32,
941
- "learning_rate": 0.0002,
942
- "loss": 0.0046,
943
- "step": 149
944
- },
945
- {
946
- "epoch": 5.36,
947
- "learning_rate": 0.0002,
948
- "loss": 0.003,
949
- "step": 150
950
- },
951
- {
952
- "epoch": 5.39,
953
- "learning_rate": 0.0002,
954
- "loss": 0.0019,
955
- "step": 151
956
- },
957
- {
958
- "epoch": 5.43,
959
- "learning_rate": 0.0002,
960
- "loss": 0.0043,
961
- "step": 152
962
- },
963
- {
964
- "epoch": 5.46,
965
- "learning_rate": 0.0002,
966
- "loss": 0.0019,
967
- "step": 153
968
- },
969
- {
970
- "epoch": 5.5,
971
- "learning_rate": 0.0002,
972
- "loss": 0.0094,
973
- "step": 154
974
- },
975
- {
976
- "epoch": 5.54,
977
- "learning_rate": 0.0002,
978
- "loss": 0.0033,
979
- "step": 155
980
- },
981
- {
982
- "epoch": 5.57,
983
- "learning_rate": 0.0002,
984
- "loss": 0.0028,
985
- "step": 156
986
- },
987
- {
988
- "epoch": 5.61,
989
- "learning_rate": 0.0002,
990
- "loss": 0.0013,
991
- "step": 157
992
- },
993
- {
994
- "epoch": 5.64,
995
- "learning_rate": 0.0002,
996
- "loss": 0.0073,
997
- "step": 158
998
- },
999
- {
1000
- "epoch": 5.68,
1001
- "learning_rate": 0.0002,
1002
- "loss": 0.0054,
1003
- "step": 159
1004
- },
1005
- {
1006
- "epoch": 5.71,
1007
- "learning_rate": 0.0002,
1008
- "loss": 0.0066,
1009
- "step": 160
1010
- },
1011
- {
1012
- "epoch": 5.75,
1013
- "learning_rate": 0.0002,
1014
- "loss": 0.0019,
1015
- "step": 161
1016
- },
1017
- {
1018
- "epoch": 5.79,
1019
- "learning_rate": 0.0002,
1020
- "loss": 0.0014,
1021
- "step": 162
1022
- },
1023
- {
1024
- "epoch": 5.82,
1025
- "learning_rate": 0.0002,
1026
- "loss": 0.0026,
1027
- "step": 163
1028
- },
1029
- {
1030
- "epoch": 5.86,
1031
- "learning_rate": 0.0002,
1032
- "loss": 0.0016,
1033
- "step": 164
1034
- },
1035
- {
1036
- "epoch": 5.89,
1037
- "learning_rate": 0.0002,
1038
- "loss": 0.0022,
1039
- "step": 165
1040
- },
1041
- {
1042
- "epoch": 5.93,
1043
- "learning_rate": 0.0002,
1044
- "loss": 0.0077,
1045
- "step": 166
1046
- },
1047
- {
1048
- "epoch": 5.96,
1049
- "learning_rate": 0.0002,
1050
- "loss": 0.0054,
1051
- "step": 167
1052
- },
1053
- {
1054
- "epoch": 6.0,
1055
- "learning_rate": 0.0002,
1056
- "loss": 0.0049,
1057
- "step": 168
1058
- },
1059
- {
1060
- "epoch": 6.0,
1061
- "eval_loss": 0.003725806251168251,
1062
- "eval_runtime": 55.2317,
1063
- "eval_samples_per_second": 15.698,
1064
- "eval_steps_per_second": 0.507,
1065
- "step": 168
1066
- },
1067
- {
1068
- "epoch": 6.04,
1069
- "learning_rate": 0.0002,
1070
- "loss": 0.0037,
1071
- "step": 169
1072
- },
1073
- {
1074
- "epoch": 6.07,
1075
- "learning_rate": 0.0002,
1076
- "loss": 0.0022,
1077
- "step": 170
1078
- },
1079
- {
1080
- "epoch": 6.11,
1081
- "learning_rate": 0.0002,
1082
- "loss": 0.0021,
1083
- "step": 171
1084
- },
1085
- {
1086
- "epoch": 6.14,
1087
- "learning_rate": 0.0002,
1088
- "loss": 0.0045,
1089
- "step": 172
1090
- },
1091
- {
1092
- "epoch": 6.18,
1093
- "learning_rate": 0.0002,
1094
- "loss": 0.0026,
1095
- "step": 173
1096
- },
1097
- {
1098
- "epoch": 6.21,
1099
- "learning_rate": 0.0002,
1100
- "loss": 0.0026,
1101
- "step": 174
1102
- },
1103
- {
1104
- "epoch": 6.25,
1105
- "learning_rate": 0.0002,
1106
- "loss": 0.005,
1107
- "step": 175
1108
- },
1109
- {
1110
- "epoch": 6.29,
1111
- "learning_rate": 0.0002,
1112
- "loss": 0.0048,
1113
- "step": 176
1114
- },
1115
- {
1116
- "epoch": 6.32,
1117
- "learning_rate": 0.0002,
1118
- "loss": 0.0066,
1119
- "step": 177
1120
- },
1121
- {
1122
- "epoch": 6.36,
1123
- "learning_rate": 0.0002,
1124
- "loss": 0.0028,
1125
- "step": 178
1126
- },
1127
- {
1128
- "epoch": 6.39,
1129
- "learning_rate": 0.0002,
1130
- "loss": 0.006,
1131
- "step": 179
1132
- },
1133
- {
1134
- "epoch": 6.43,
1135
- "learning_rate": 0.0002,
1136
- "loss": 0.0008,
1137
- "step": 180
1138
- },
1139
- {
1140
- "epoch": 6.46,
1141
- "learning_rate": 0.0002,
1142
- "loss": 0.0037,
1143
- "step": 181
1144
- },
1145
- {
1146
- "epoch": 6.5,
1147
- "learning_rate": 0.0002,
1148
- "loss": 0.0051,
1149
- "step": 182
1150
- },
1151
- {
1152
- "epoch": 6.54,
1153
- "learning_rate": 0.0002,
1154
- "loss": 0.0028,
1155
- "step": 183
1156
- },
1157
- {
1158
- "epoch": 6.57,
1159
- "learning_rate": 0.0002,
1160
- "loss": 0.0064,
1161
- "step": 184
1162
- },
1163
- {
1164
- "epoch": 6.61,
1165
- "learning_rate": 0.0002,
1166
- "loss": 0.0025,
1167
- "step": 185
1168
- },
1169
- {
1170
- "epoch": 6.64,
1171
- "learning_rate": 0.0002,
1172
- "loss": 0.0025,
1173
- "step": 186
1174
- },
1175
- {
1176
- "epoch": 6.68,
1177
- "learning_rate": 0.0002,
1178
- "loss": 0.0043,
1179
- "step": 187
1180
- },
1181
- {
1182
- "epoch": 6.71,
1183
- "learning_rate": 0.0002,
1184
- "loss": 0.0017,
1185
- "step": 188
1186
- },
1187
- {
1188
- "epoch": 6.75,
1189
- "learning_rate": 0.0002,
1190
- "loss": 0.002,
1191
- "step": 189
1192
- },
1193
- {
1194
- "epoch": 6.79,
1195
- "learning_rate": 0.0002,
1196
- "loss": 0.0009,
1197
- "step": 190
1198
- },
1199
- {
1200
- "epoch": 6.82,
1201
- "learning_rate": 0.0002,
1202
- "loss": 0.0018,
1203
- "step": 191
1204
- },
1205
- {
1206
- "epoch": 6.86,
1207
- "learning_rate": 0.0002,
1208
- "loss": 0.0037,
1209
- "step": 192
1210
- },
1211
- {
1212
- "epoch": 6.89,
1213
- "learning_rate": 0.0002,
1214
- "loss": 0.0013,
1215
- "step": 193
1216
- },
1217
- {
1218
- "epoch": 6.93,
1219
- "learning_rate": 0.0002,
1220
- "loss": 0.002,
1221
- "step": 194
1222
- },
1223
- {
1224
- "epoch": 6.96,
1225
- "learning_rate": 0.0002,
1226
- "loss": 0.002,
1227
- "step": 195
1228
- },
1229
- {
1230
- "epoch": 7.0,
1231
- "learning_rate": 0.0002,
1232
- "loss": 0.0009,
1233
- "step": 196
1234
- },
1235
- {
1236
- "epoch": 7.0,
1237
- "eval_loss": 0.0015701488591730595,
1238
- "eval_runtime": 55.1955,
1239
- "eval_samples_per_second": 15.708,
1240
- "eval_steps_per_second": 0.507,
1241
- "step": 196
1242
- },
1243
- {
1244
- "epoch": 7.04,
1245
- "learning_rate": 0.0002,
1246
- "loss": 0.0016,
1247
- "step": 197
1248
- },
1249
- {
1250
- "epoch": 7.07,
1251
- "learning_rate": 0.0002,
1252
- "loss": 0.0018,
1253
- "step": 198
1254
- },
1255
- {
1256
- "epoch": 7.11,
1257
- "learning_rate": 0.0002,
1258
- "loss": 0.0015,
1259
- "step": 199
1260
- },
1261
- {
1262
- "epoch": 7.14,
1263
- "learning_rate": 0.0002,
1264
- "loss": 0.0016,
1265
- "step": 200
1266
- },
1267
- {
1268
- "epoch": 7.18,
1269
- "learning_rate": 0.0002,
1270
- "loss": 0.0014,
1271
- "step": 201
1272
- },
1273
- {
1274
- "epoch": 7.21,
1275
- "learning_rate": 0.0002,
1276
- "loss": 0.0032,
1277
- "step": 202
1278
- },
1279
- {
1280
- "epoch": 7.25,
1281
- "learning_rate": 0.0002,
1282
- "loss": 0.0006,
1283
- "step": 203
1284
- },
1285
- {
1286
- "epoch": 7.29,
1287
- "learning_rate": 0.0002,
1288
- "loss": 0.0018,
1289
- "step": 204
1290
- },
1291
- {
1292
- "epoch": 7.32,
1293
- "learning_rate": 0.0002,
1294
- "loss": 0.0005,
1295
- "step": 205
1296
- },
1297
- {
1298
- "epoch": 7.36,
1299
- "learning_rate": 0.0002,
1300
- "loss": 0.0024,
1301
- "step": 206
1302
- },
1303
- {
1304
- "epoch": 7.39,
1305
- "learning_rate": 0.0002,
1306
- "loss": 0.0027,
1307
- "step": 207
1308
- },
1309
- {
1310
- "epoch": 7.43,
1311
- "learning_rate": 0.0002,
1312
- "loss": 0.0005,
1313
- "step": 208
1314
- },
1315
- {
1316
- "epoch": 7.46,
1317
- "learning_rate": 0.0002,
1318
- "loss": 0.0056,
1319
- "step": 209
1320
- },
1321
- {
1322
- "epoch": 7.5,
1323
- "learning_rate": 0.0002,
1324
- "loss": 0.0015,
1325
- "step": 210
1326
- },
1327
- {
1328
- "epoch": 7.54,
1329
- "learning_rate": 0.0002,
1330
- "loss": 0.0019,
1331
- "step": 211
1332
- },
1333
- {
1334
- "epoch": 7.57,
1335
- "learning_rate": 0.0002,
1336
- "loss": 0.0014,
1337
- "step": 212
1338
- },
1339
- {
1340
- "epoch": 7.61,
1341
- "learning_rate": 0.0002,
1342
- "loss": 0.0009,
1343
- "step": 213
1344
- },
1345
- {
1346
- "epoch": 7.64,
1347
- "learning_rate": 0.0002,
1348
- "loss": 0.0017,
1349
- "step": 214
1350
- },
1351
- {
1352
- "epoch": 7.68,
1353
- "learning_rate": 0.0002,
1354
- "loss": 0.0037,
1355
- "step": 215
1356
- },
1357
- {
1358
- "epoch": 7.71,
1359
- "learning_rate": 0.0002,
1360
- "loss": 0.0009,
1361
- "step": 216
1362
- },
1363
- {
1364
- "epoch": 7.75,
1365
- "learning_rate": 0.0002,
1366
- "loss": 0.0021,
1367
- "step": 217
1368
- },
1369
- {
1370
- "epoch": 7.79,
1371
- "learning_rate": 0.0002,
1372
- "loss": 0.0024,
1373
- "step": 218
1374
- },
1375
- {
1376
- "epoch": 7.82,
1377
- "learning_rate": 0.0002,
1378
- "loss": 0.002,
1379
- "step": 219
1380
- },
1381
- {
1382
- "epoch": 7.86,
1383
- "learning_rate": 0.0002,
1384
- "loss": 0.0011,
1385
- "step": 220
1386
- },
1387
- {
1388
- "epoch": 7.89,
1389
- "learning_rate": 0.0002,
1390
- "loss": 0.002,
1391
- "step": 221
1392
- },
1393
- {
1394
- "epoch": 7.93,
1395
- "learning_rate": 0.0002,
1396
- "loss": 0.0042,
1397
- "step": 222
1398
- },
1399
- {
1400
- "epoch": 7.96,
1401
- "learning_rate": 0.0002,
1402
- "loss": 0.0046,
1403
- "step": 223
1404
- },
1405
  {
1406
  "epoch": 8.0,
1407
  "learning_rate": 0.0002,
1408
- "loss": 0.0009,
1409
- "step": 224
1410
  },
1411
  {
1412
  "epoch": 8.0,
1413
- "eval_loss": 0.00099816860165447,
1414
- "eval_runtime": 55.2372,
1415
- "eval_samples_per_second": 15.696,
1416
- "eval_steps_per_second": 0.507,
1417
- "step": 224
1418
- },
1419
- {
1420
- "epoch": 8.04,
1421
- "learning_rate": 0.0002,
1422
- "loss": 0.0005,
1423
- "step": 225
1424
  },
1425
  {
1426
  "epoch": 8.07,
1427
  "learning_rate": 0.0002,
1428
  "loss": 0.0007,
1429
- "step": 226
1430
- },
1431
- {
1432
- "epoch": 8.11,
1433
- "learning_rate": 0.0002,
1434
- "loss": 0.003,
1435
- "step": 227
1436
  },
1437
  {
1438
  "epoch": 8.14,
1439
  "learning_rate": 0.0002,
1440
- "loss": 0.0024,
1441
- "step": 228
1442
- },
1443
- {
1444
- "epoch": 8.18,
1445
- "learning_rate": 0.0002,
1446
- "loss": 0.0023,
1447
- "step": 229
1448
  },
1449
  {
1450
  "epoch": 8.21,
1451
  "learning_rate": 0.0002,
1452
- "loss": 0.0024,
1453
- "step": 230
1454
- },
1455
- {
1456
- "epoch": 8.25,
1457
- "learning_rate": 0.0002,
1458
- "loss": 0.0018,
1459
- "step": 231
1460
  },
1461
  {
1462
  "epoch": 8.29,
1463
  "learning_rate": 0.0002,
1464
- "loss": 0.0031,
1465
- "step": 232
1466
- },
1467
- {
1468
- "epoch": 8.32,
1469
- "learning_rate": 0.0002,
1470
- "loss": 0.001,
1471
- "step": 233
1472
  },
1473
  {
1474
  "epoch": 8.36,
1475
  "learning_rate": 0.0002,
1476
- "loss": 0.0011,
1477
- "step": 234
1478
- },
1479
- {
1480
- "epoch": 8.39,
1481
- "learning_rate": 0.0002,
1482
- "loss": 0.0022,
1483
- "step": 235
1484
  },
1485
  {
1486
  "epoch": 8.43,
1487
  "learning_rate": 0.0002,
1488
- "loss": 0.0008,
1489
- "step": 236
1490
- },
1491
- {
1492
- "epoch": 8.46,
1493
- "learning_rate": 0.0002,
1494
- "loss": 0.0006,
1495
- "step": 237
1496
  },
1497
  {
1498
  "epoch": 8.5,
1499
  "learning_rate": 0.0002,
1500
- "loss": 0.0016,
1501
- "step": 238
1502
- },
1503
- {
1504
- "epoch": 8.54,
1505
- "learning_rate": 0.0002,
1506
- "loss": 0.0016,
1507
- "step": 239
1508
  },
1509
  {
1510
  "epoch": 8.57,
1511
  "learning_rate": 0.0002,
1512
- "loss": 0.0012,
1513
- "step": 240
1514
- },
1515
- {
1516
- "epoch": 8.61,
1517
- "learning_rate": 0.0002,
1518
- "loss": 0.0011,
1519
- "step": 241
1520
  },
1521
  {
1522
  "epoch": 8.64,
1523
  "learning_rate": 0.0002,
1524
- "loss": 0.0024,
1525
- "step": 242
1526
- },
1527
- {
1528
- "epoch": 8.68,
1529
- "learning_rate": 0.0002,
1530
- "loss": 0.0013,
1531
- "step": 243
1532
  },
1533
  {
1534
  "epoch": 8.71,
1535
  "learning_rate": 0.0002,
1536
- "loss": 0.0006,
1537
- "step": 244
1538
- },
1539
- {
1540
- "epoch": 8.75,
1541
- "learning_rate": 0.0002,
1542
- "loss": 0.0005,
1543
- "step": 245
1544
  },
1545
  {
1546
  "epoch": 8.79,
1547
  "learning_rate": 0.0002,
1548
- "loss": 0.0025,
1549
- "step": 246
1550
- },
1551
- {
1552
- "epoch": 8.82,
1553
- "learning_rate": 0.0002,
1554
- "loss": 0.0026,
1555
- "step": 247
1556
  },
1557
  {
1558
  "epoch": 8.86,
1559
  "learning_rate": 0.0002,
1560
- "loss": 0.0015,
1561
- "step": 248
1562
- },
1563
- {
1564
- "epoch": 8.89,
1565
- "learning_rate": 0.0002,
1566
- "loss": 0.0012,
1567
- "step": 249
1568
  },
1569
  {
1570
  "epoch": 8.93,
1571
  "learning_rate": 0.0002,
1572
- "loss": 0.0012,
1573
- "step": 250
1574
- },
1575
- {
1576
- "epoch": 8.96,
1577
- "learning_rate": 0.0002,
1578
- "loss": 0.0011,
1579
- "step": 251
1580
  },
1581
  {
1582
  "epoch": 9.0,
1583
  "learning_rate": 0.0002,
1584
- "loss": 0.0007,
1585
- "step": 252
1586
  },
1587
  {
1588
  "epoch": 9.0,
1589
- "eval_loss": 0.0009729066514410079,
1590
- "eval_runtime": 55.7144,
1591
- "eval_samples_per_second": 15.562,
1592
- "eval_steps_per_second": 0.503,
1593
- "step": 252
1594
- },
1595
- {
1596
- "epoch": 9.04,
1597
- "learning_rate": 0.0002,
1598
- "loss": 0.0019,
1599
- "step": 253
1600
  },
1601
  {
1602
  "epoch": 9.07,
1603
  "learning_rate": 0.0002,
1604
- "loss": 0.001,
1605
- "step": 254
1606
- },
1607
- {
1608
- "epoch": 9.11,
1609
- "learning_rate": 0.0002,
1610
  "loss": 0.0004,
1611
- "step": 255
1612
  },
1613
  {
1614
  "epoch": 9.14,
1615
  "learning_rate": 0.0002,
1616
- "loss": 0.0004,
1617
- "step": 256
1618
- },
1619
- {
1620
- "epoch": 9.18,
1621
- "learning_rate": 0.0002,
1622
- "loss": 0.0022,
1623
- "step": 257
1624
  },
1625
  {
1626
  "epoch": 9.21,
1627
  "learning_rate": 0.0002,
1628
- "loss": 0.001,
1629
- "step": 258
1630
- },
1631
- {
1632
- "epoch": 9.25,
1633
- "learning_rate": 0.0002,
1634
- "loss": 0.0013,
1635
- "step": 259
1636
  },
1637
  {
1638
  "epoch": 9.29,
1639
  "learning_rate": 0.0002,
1640
- "loss": 0.0012,
1641
- "step": 260
1642
- },
1643
- {
1644
- "epoch": 9.32,
1645
- "learning_rate": 0.0002,
1646
- "loss": 0.0028,
1647
- "step": 261
1648
  },
1649
  {
1650
  "epoch": 9.36,
1651
  "learning_rate": 0.0002,
1652
- "loss": 0.0004,
1653
- "step": 262
1654
- },
1655
- {
1656
- "epoch": 9.39,
1657
- "learning_rate": 0.0002,
1658
- "loss": 0.0006,
1659
- "step": 263
1660
  },
1661
  {
1662
  "epoch": 9.43,
1663
  "learning_rate": 0.0002,
1664
- "loss": 0.0053,
1665
- "step": 264
1666
- },
1667
- {
1668
- "epoch": 9.46,
1669
- "learning_rate": 0.0002,
1670
- "loss": 0.001,
1671
- "step": 265
1672
  },
1673
  {
1674
  "epoch": 9.5,
1675
  "learning_rate": 0.0002,
1676
- "loss": 0.0029,
1677
- "step": 266
1678
- },
1679
- {
1680
- "epoch": 9.54,
1681
- "learning_rate": 0.0002,
1682
- "loss": 0.0008,
1683
- "step": 267
1684
  },
1685
  {
1686
  "epoch": 9.57,
1687
  "learning_rate": 0.0002,
1688
- "loss": 0.0044,
1689
- "step": 268
1690
- },
1691
- {
1692
- "epoch": 9.61,
1693
- "learning_rate": 0.0002,
1694
- "loss": 0.0012,
1695
- "step": 269
1696
  },
1697
  {
1698
  "epoch": 9.64,
1699
  "learning_rate": 0.0002,
1700
- "loss": 0.0017,
1701
- "step": 270
1702
- },
1703
- {
1704
- "epoch": 9.68,
1705
- "learning_rate": 0.0002,
1706
- "loss": 0.0033,
1707
- "step": 271
1708
  },
1709
  {
1710
  "epoch": 9.71,
1711
  "learning_rate": 0.0002,
1712
- "loss": 0.0021,
1713
- "step": 272
1714
- },
1715
- {
1716
- "epoch": 9.75,
1717
- "learning_rate": 0.0002,
1718
- "loss": 0.0019,
1719
- "step": 273
1720
  },
1721
  {
1722
  "epoch": 9.79,
1723
  "learning_rate": 0.0002,
1724
- "loss": 0.001,
1725
- "step": 274
1726
- },
1727
- {
1728
- "epoch": 9.82,
1729
- "learning_rate": 0.0002,
1730
- "loss": 0.0045,
1731
- "step": 275
1732
  },
1733
  {
1734
  "epoch": 9.86,
1735
  "learning_rate": 0.0002,
1736
- "loss": 0.0032,
1737
- "step": 276
1738
- },
1739
- {
1740
- "epoch": 9.89,
1741
- "learning_rate": 0.0002,
1742
- "loss": 0.0039,
1743
- "step": 277
1744
  },
1745
  {
1746
  "epoch": 9.93,
1747
  "learning_rate": 0.0002,
1748
- "loss": 0.0048,
1749
- "step": 278
1750
- },
1751
- {
1752
- "epoch": 9.96,
1753
- "learning_rate": 0.0002,
1754
- "loss": 0.0035,
1755
- "step": 279
1756
  },
1757
  {
1758
  "epoch": 10.0,
1759
  "learning_rate": 0.0002,
1760
- "loss": 0.0018,
1761
- "step": 280
1762
  },
1763
  {
1764
  "epoch": 10.0,
1765
- "eval_loss": 0.0019094761228188872,
1766
- "eval_runtime": 55.2125,
1767
- "eval_samples_per_second": 15.703,
1768
- "eval_steps_per_second": 0.507,
1769
- "step": 280
1770
  },
1771
  {
1772
  "epoch": 10.0,
1773
- "step": 280,
1774
- "total_flos": 8.298694499798876e+17,
1775
- "train_loss": 0.09645156941977413,
1776
- "train_runtime": 3386.4389,
1777
- "train_samples_per_second": 2.56,
1778
- "train_steps_per_second": 0.083
1779
  }
1780
  ],
1781
  "logging_steps": 1.0,
1782
- "max_steps": 280,
1783
  "num_input_tokens_seen": 0,
1784
  "num_train_epochs": 10,
1785
  "save_steps": 50000,
1786
- "total_flos": 8.298694499798876e+17,
1787
  "train_batch_size": 4,
1788
  "trial_name": null,
1789
  "trial_params": null
 
3
  "best_model_checkpoint": null,
4
  "epoch": 10.0,
5
  "eval_steps": 500,
6
+ "global_step": 140,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.07,
13
  "learning_rate": 0.0,
14
+ "loss": 1.4966,
15
  "step": 1
16
  },
17
  {
18
+ "epoch": 0.14,
19
+ "learning_rate": 8.613531161467861e-05,
20
+ "loss": 1.0914,
21
  "step": 2
22
  },
23
  {
24
+ "epoch": 0.21,
25
+ "learning_rate": 0.00013652123889719707,
26
+ "loss": 1.1815,
27
  "step": 3
28
  },
29
  {
30
+ "epoch": 0.29,
31
+ "learning_rate": 0.00017227062322935723,
32
+ "loss": 1.1374,
33
  "step": 4
34
  },
35
  {
36
+ "epoch": 0.36,
37
+ "learning_rate": 0.0002,
38
+ "loss": 1.0049,
39
  "step": 5
40
  },
41
  {
42
+ "epoch": 0.43,
43
+ "learning_rate": 0.0002,
44
+ "loss": 0.9757,
45
  "step": 6
46
  },
47
  {
48
+ "epoch": 0.5,
49
+ "learning_rate": 0.0002,
50
+ "loss": 0.9526,
51
  "step": 7
52
  },
53
  {
54
+ "epoch": 0.57,
55
+ "learning_rate": 0.0002,
56
+ "loss": 0.8746,
57
  "step": 8
58
  },
59
  {
60
+ "epoch": 0.64,
61
  "learning_rate": 0.0002,
62
+ "loss": 0.7538,
63
  "step": 9
64
  },
65
  {
66
+ "epoch": 0.71,
67
  "learning_rate": 0.0002,
68
+ "loss": 0.7817,
69
  "step": 10
70
  },
71
  {
72
+ "epoch": 0.79,
73
  "learning_rate": 0.0002,
74
+ "loss": 0.7263,
75
  "step": 11
76
  },
77
  {
78
+ "epoch": 0.86,
79
  "learning_rate": 0.0002,
80
+ "loss": 0.6033,
81
  "step": 12
82
  },
83
  {
84
+ "epoch": 0.93,
85
  "learning_rate": 0.0002,
86
+ "loss": 0.582,
87
  "step": 13
88
  },
89
  {
90
+ "epoch": 1.0,
91
  "learning_rate": 0.0002,
92
+ "loss": 0.4256,
93
+ "step": 14
94
+ },
95
+ {
96
+ "epoch": 1.0,
97
+ "eval_loss": 0.3590076267719269,
98
+ "eval_runtime": 27.2669,
99
+ "eval_samples_per_second": 15.587,
100
+ "eval_steps_per_second": 0.513,
101
  "step": 14
102
  },
103
  {
104
+ "epoch": 1.07,
105
  "learning_rate": 0.0002,
106
+ "loss": 0.2531,
107
  "step": 15
108
  },
109
  {
110
+ "epoch": 1.14,
111
  "learning_rate": 0.0002,
112
+ "loss": 0.3431,
113
  "step": 16
114
  },
115
  {
116
+ "epoch": 1.21,
117
  "learning_rate": 0.0002,
118
+ "loss": 0.2548,
119
  "step": 17
120
  },
121
  {
122
+ "epoch": 1.29,
123
  "learning_rate": 0.0002,
124
+ "loss": 0.2814,
125
  "step": 18
126
  },
127
  {
128
+ "epoch": 1.36,
129
  "learning_rate": 0.0002,
130
+ "loss": 0.1958,
131
  "step": 19
132
  },
133
  {
134
+ "epoch": 1.43,
135
  "learning_rate": 0.0002,
136
+ "loss": 0.2286,
137
  "step": 20
138
  },
139
  {
140
+ "epoch": 1.5,
141
  "learning_rate": 0.0002,
142
+ "loss": 0.2332,
143
  "step": 21
144
  },
145
  {
146
+ "epoch": 1.57,
147
  "learning_rate": 0.0002,
148
+ "loss": 0.1755,
149
  "step": 22
150
  },
151
  {
152
+ "epoch": 1.64,
153
  "learning_rate": 0.0002,
154
+ "loss": 0.1312,
155
  "step": 23
156
  },
157
  {
158
+ "epoch": 1.71,
159
  "learning_rate": 0.0002,
160
+ "loss": 0.0845,
161
  "step": 24
162
  },
163
  {
164
+ "epoch": 1.79,
165
  "learning_rate": 0.0002,
166
+ "loss": 0.1214,
167
  "step": 25
168
  },
169
  {
170
+ "epoch": 1.86,
171
  "learning_rate": 0.0002,
172
+ "loss": 0.0884,
173
  "step": 26
174
  },
175
  {
176
+ "epoch": 1.93,
177
  "learning_rate": 0.0002,
178
+ "loss": 0.1381,
179
  "step": 27
180
  },
181
  {
182
+ "epoch": 2.0,
183
  "learning_rate": 0.0002,
184
+ "loss": 0.0643,
185
  "step": 28
186
  },
187
  {
188
+ "epoch": 2.0,
189
+ "eval_loss": 0.07109997421503067,
190
+ "eval_runtime": 27.4843,
191
+ "eval_samples_per_second": 15.463,
192
  "eval_steps_per_second": 0.509,
193
  "step": 28
194
  },
195
  {
196
+ "epoch": 2.07,
197
  "learning_rate": 0.0002,
198
+ "loss": 0.0533,
199
  "step": 29
200
  },
201
  {
202
+ "epoch": 2.14,
203
  "learning_rate": 0.0002,
204
+ "loss": 0.0566,
205
  "step": 30
206
  },
207
  {
208
+ "epoch": 2.21,
209
  "learning_rate": 0.0002,
210
+ "loss": 0.0457,
211
  "step": 31
212
  },
213
  {
214
+ "epoch": 2.29,
215
  "learning_rate": 0.0002,
216
+ "loss": 0.041,
217
  "step": 32
218
  },
219
  {
220
+ "epoch": 2.36,
221
  "learning_rate": 0.0002,
222
+ "loss": 0.0447,
223
  "step": 33
224
  },
225
  {
226
+ "epoch": 2.43,
227
  "learning_rate": 0.0002,
228
+ "loss": 0.0206,
229
  "step": 34
230
  },
231
  {
232
+ "epoch": 2.5,
233
  "learning_rate": 0.0002,
234
+ "loss": 0.0221,
235
  "step": 35
236
  },
237
  {
238
+ "epoch": 2.57,
239
  "learning_rate": 0.0002,
240
+ "loss": 0.0541,
241
  "step": 36
242
  },
243
  {
244
+ "epoch": 2.64,
245
  "learning_rate": 0.0002,
246
+ "loss": 0.0274,
247
  "step": 37
248
  },
249
  {
250
+ "epoch": 2.71,
251
  "learning_rate": 0.0002,
252
+ "loss": 0.0169,
253
  "step": 38
254
  },
255
  {
256
+ "epoch": 2.79,
257
  "learning_rate": 0.0002,
258
+ "loss": 0.0358,
259
  "step": 39
260
  },
261
  {
262
+ "epoch": 2.86,
263
  "learning_rate": 0.0002,
264
+ "loss": 0.0375,
265
  "step": 40
266
  },
267
  {
268
+ "epoch": 2.93,
269
  "learning_rate": 0.0002,
270
+ "loss": 0.0233,
271
  "step": 41
272
  },
273
  {
274
+ "epoch": 3.0,
275
  "learning_rate": 0.0002,
276
+ "loss": 0.0114,
277
+ "step": 42
278
+ },
279
+ {
280
+ "epoch": 3.0,
281
+ "eval_loss": 0.01424238458275795,
282
+ "eval_runtime": 27.4614,
283
+ "eval_samples_per_second": 15.476,
284
+ "eval_steps_per_second": 0.51,
285
  "step": 42
286
  },
287
  {
288
+ "epoch": 3.07,
289
  "learning_rate": 0.0002,
290
+ "loss": 0.0095,
291
  "step": 43
292
  },
293
  {
294
+ "epoch": 3.14,
295
  "learning_rate": 0.0002,
296
+ "loss": 0.0093,
297
  "step": 44
298
  },
299
  {
300
+ "epoch": 3.21,
301
  "learning_rate": 0.0002,
302
+ "loss": 0.0068,
303
  "step": 45
304
  },
305
  {
306
+ "epoch": 3.29,
307
  "learning_rate": 0.0002,
308
+ "loss": 0.0069,
309
  "step": 46
310
  },
311
  {
312
+ "epoch": 3.36,
313
  "learning_rate": 0.0002,
314
+ "loss": 0.0067,
315
  "step": 47
316
  },
317
  {
318
+ "epoch": 3.43,
319
  "learning_rate": 0.0002,
320
+ "loss": 0.0161,
321
  "step": 48
322
  },
323
  {
324
+ "epoch": 3.5,
325
  "learning_rate": 0.0002,
326
+ "loss": 0.0123,
327
  "step": 49
328
  },
329
  {
330
+ "epoch": 3.57,
331
  "learning_rate": 0.0002,
332
+ "loss": 0.0074,
333
  "step": 50
334
  },
335
  {
336
+ "epoch": 3.64,
337
  "learning_rate": 0.0002,
338
+ "loss": 0.0052,
339
  "step": 51
340
  },
341
  {
342
+ "epoch": 3.71,
343
  "learning_rate": 0.0002,
344
+ "loss": 0.0088,
345
  "step": 52
346
  },
347
  {
348
+ "epoch": 3.79,
349
  "learning_rate": 0.0002,
350
+ "loss": 0.0147,
351
  "step": 53
352
  },
353
  {
354
+ "epoch": 3.86,
355
  "learning_rate": 0.0002,
356
+ "loss": 0.008,
357
  "step": 54
358
  },
359
  {
360
+ "epoch": 3.93,
361
  "learning_rate": 0.0002,
362
+ "loss": 0.0062,
363
  "step": 55
364
  },
365
  {
366
+ "epoch": 4.0,
367
  "learning_rate": 0.0002,
368
+ "loss": 0.0089,
369
  "step": 56
370
  },
371
  {
372
+ "epoch": 4.0,
373
+ "eval_loss": 0.006587002892047167,
374
+ "eval_runtime": 27.6072,
375
+ "eval_samples_per_second": 15.395,
376
+ "eval_steps_per_second": 0.507,
377
  "step": 56
378
  },
379
  {
380
+ "epoch": 4.07,
381
  "learning_rate": 0.0002,
382
+ "loss": 0.0047,
383
  "step": 57
384
  },
385
  {
386
+ "epoch": 4.14,
387
  "learning_rate": 0.0002,
388
+ "loss": 0.0043,
389
  "step": 58
390
  },
391
  {
392
+ "epoch": 4.21,
393
  "learning_rate": 0.0002,
394
+ "loss": 0.0061,
395
  "step": 59
396
  },
397
  {
398
+ "epoch": 4.29,
399
  "learning_rate": 0.0002,
400
+ "loss": 0.0036,
401
  "step": 60
402
  },
403
  {
404
+ "epoch": 4.36,
405
  "learning_rate": 0.0002,
406
+ "loss": 0.0019,
407
  "step": 61
408
  },
409
  {
410
+ "epoch": 4.43,
411
  "learning_rate": 0.0002,
412
+ "loss": 0.0032,
413
  "step": 62
414
  },
415
  {
416
+ "epoch": 4.5,
417
  "learning_rate": 0.0002,
418
+ "loss": 0.0056,
419
  "step": 63
420
  },
421
  {
422
+ "epoch": 4.57,
423
  "learning_rate": 0.0002,
424
+ "loss": 0.0063,
425
  "step": 64
426
  },
427
  {
428
+ "epoch": 4.64,
429
  "learning_rate": 0.0002,
430
+ "loss": 0.002,
431
  "step": 65
432
  },
433
  {
434
+ "epoch": 4.71,
435
  "learning_rate": 0.0002,
436
+ "loss": 0.0057,
437
  "step": 66
438
  },
439
  {
440
+ "epoch": 4.79,
441
  "learning_rate": 0.0002,
442
+ "loss": 0.0031,
443
  "step": 67
444
  },
445
  {
446
+ "epoch": 4.86,
447
  "learning_rate": 0.0002,
448
+ "loss": 0.0065,
449
  "step": 68
450
  },
451
  {
452
+ "epoch": 4.93,
453
  "learning_rate": 0.0002,
454
+ "loss": 0.0022,
455
  "step": 69
456
  },
457
  {
458
+ "epoch": 5.0,
459
  "learning_rate": 0.0002,
460
+ "loss": 0.0021,
461
+ "step": 70
462
+ },
463
+ {
464
+ "epoch": 5.0,
465
+ "eval_loss": 0.0026798362378031015,
466
+ "eval_runtime": 27.5909,
467
+ "eval_samples_per_second": 15.404,
468
+ "eval_steps_per_second": 0.507,
469
  "step": 70
470
  },
471
  {
472
+ "epoch": 5.07,
473
  "learning_rate": 0.0002,
474
+ "loss": 0.0009,
475
  "step": 71
476
  },
477
  {
478
+ "epoch": 5.14,
479
  "learning_rate": 0.0002,
480
+ "loss": 0.0013,
481
  "step": 72
482
  },
483
  {
484
+ "epoch": 5.21,
485
  "learning_rate": 0.0002,
486
+ "loss": 0.0024,
487
  "step": 73
488
  },
489
  {
490
+ "epoch": 5.29,
491
  "learning_rate": 0.0002,
492
+ "loss": 0.0011,
493
  "step": 74
494
  },
495
  {
496
+ "epoch": 5.36,
497
  "learning_rate": 0.0002,
498
+ "loss": 0.0008,
499
  "step": 75
500
  },
501
  {
502
+ "epoch": 5.43,
503
  "learning_rate": 0.0002,
504
+ "loss": 0.0019,
505
  "step": 76
506
  },
507
  {
508
+ "epoch": 5.5,
509
  "learning_rate": 0.0002,
510
+ "loss": 0.0009,
511
  "step": 77
512
  },
513
  {
514
+ "epoch": 5.57,
515
  "learning_rate": 0.0002,
516
+ "loss": 0.0016,
517
  "step": 78
518
  },
519
  {
520
+ "epoch": 5.64,
521
  "learning_rate": 0.0002,
522
+ "loss": 0.0021,
523
  "step": 79
524
  },
525
  {
526
+ "epoch": 5.71,
527
  "learning_rate": 0.0002,
528
+ "loss": 0.0017,
529
  "step": 80
530
  },
531
  {
532
+ "epoch": 5.79,
533
  "learning_rate": 0.0002,
534
+ "loss": 0.0028,
535
  "step": 81
536
  },
537
  {
538
+ "epoch": 5.86,
539
  "learning_rate": 0.0002,
540
+ "loss": 0.0028,
541
  "step": 82
542
  },
543
  {
544
+ "epoch": 5.93,
545
  "learning_rate": 0.0002,
546
+ "loss": 0.0013,
547
  "step": 83
548
  },
549
  {
550
+ "epoch": 6.0,
551
  "learning_rate": 0.0002,
552
+ "loss": 0.0012,
553
  "step": 84
554
  },
555
  {
556
+ "epoch": 6.0,
557
+ "eval_loss": 0.0026673530228435993,
558
+ "eval_runtime": 27.6491,
559
+ "eval_samples_per_second": 15.371,
560
+ "eval_steps_per_second": 0.506,
561
  "step": 84
562
  },
563
  {
564
+ "epoch": 6.07,
565
  "learning_rate": 0.0002,
566
+ "loss": 0.0027,
567
  "step": 85
568
  },
569
  {
570
+ "epoch": 6.14,
571
  "learning_rate": 0.0002,
572
+ "loss": 0.0037,
573
  "step": 86
574
  },
575
  {
576
+ "epoch": 6.21,
577
  "learning_rate": 0.0002,
578
+ "loss": 0.0022,
579
  "step": 87
580
  },
581
  {
582
+ "epoch": 6.29,
583
  "learning_rate": 0.0002,
584
+ "loss": 0.002,
585
  "step": 88
586
  },
587
  {
588
+ "epoch": 6.36,
589
  "learning_rate": 0.0002,
590
+ "loss": 0.0004,
591
  "step": 89
592
  },
593
  {
594
+ "epoch": 6.43,
595
  "learning_rate": 0.0002,
596
+ "loss": 0.0009,
597
  "step": 90
598
  },
599
  {
600
+ "epoch": 6.5,
601
  "learning_rate": 0.0002,
602
+ "loss": 0.0009,
603
  "step": 91
604
  },
605
  {
606
+ "epoch": 6.57,
607
  "learning_rate": 0.0002,
608
+ "loss": 0.0014,
609
  "step": 92
610
  },
611
  {
612
+ "epoch": 6.64,
613
  "learning_rate": 0.0002,
614
+ "loss": 0.0014,
615
  "step": 93
616
  },
617
  {
618
+ "epoch": 6.71,
619
  "learning_rate": 0.0002,
620
+ "loss": 0.002,
621
  "step": 94
622
  },
623
  {
624
+ "epoch": 6.79,
625
  "learning_rate": 0.0002,
626
+ "loss": 0.001,
627
  "step": 95
628
  },
629
  {
630
+ "epoch": 6.86,
631
  "learning_rate": 0.0002,
632
+ "loss": 0.0005,
633
  "step": 96
634
  },
635
  {
636
+ "epoch": 6.93,
637
  "learning_rate": 0.0002,
638
+ "loss": 0.0031,
639
  "step": 97
640
  },
641
  {
642
+ "epoch": 7.0,
643
  "learning_rate": 0.0002,
644
+ "loss": 0.0008,
645
+ "step": 98
646
+ },
647
+ {
648
+ "epoch": 7.0,
649
+ "eval_loss": 0.0014789514243602753,
650
+ "eval_runtime": 27.6117,
651
+ "eval_samples_per_second": 15.392,
652
+ "eval_steps_per_second": 0.507,
653
  "step": 98
654
  },
655
  {
656
+ "epoch": 7.07,
657
  "learning_rate": 0.0002,
658
+ "loss": 0.0005,
659
  "step": 99
660
  },
661
  {
662
+ "epoch": 7.14,
663
  "learning_rate": 0.0002,
664
+ "loss": 0.0003,
665
  "step": 100
666
  },
667
  {
668
+ "epoch": 7.21,
669
  "learning_rate": 0.0002,
670
+ "loss": 0.0011,
671
  "step": 101
672
  },
673
  {
674
+ "epoch": 7.29,
675
  "learning_rate": 0.0002,
676
+ "loss": 0.0005,
677
  "step": 102
678
  },
679
  {
680
+ "epoch": 7.36,
681
  "learning_rate": 0.0002,
682
+ "loss": 0.0005,
683
  "step": 103
684
  },
685
  {
686
+ "epoch": 7.43,
687
  "learning_rate": 0.0002,
688
+ "loss": 0.0003,
689
  "step": 104
690
  },
691
  {
692
+ "epoch": 7.5,
693
  "learning_rate": 0.0002,
694
+ "loss": 0.0004,
695
  "step": 105
696
  },
697
  {
698
+ "epoch": 7.57,
699
  "learning_rate": 0.0002,
700
+ "loss": 0.0003,
701
  "step": 106
702
  },
703
  {
704
+ "epoch": 7.64,
705
  "learning_rate": 0.0002,
706
+ "loss": 0.002,
707
  "step": 107
708
  },
709
  {
710
+ "epoch": 7.71,
711
  "learning_rate": 0.0002,
712
+ "loss": 0.001,
713
  "step": 108
714
  },
715
  {
716
+ "epoch": 7.79,
717
  "learning_rate": 0.0002,
718
+ "loss": 0.0005,
719
  "step": 109
720
  },
721
  {
722
+ "epoch": 7.86,
723
  "learning_rate": 0.0002,
724
+ "loss": 0.0025,
725
  "step": 110
726
  },
727
  {
728
+ "epoch": 7.93,
729
  "learning_rate": 0.0002,
730
+ "loss": 0.0005,
731
  "step": 111
732
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
733
  {
734
  "epoch": 8.0,
735
  "learning_rate": 0.0002,
736
+ "loss": 0.0002,
737
+ "step": 112
738
  },
739
  {
740
  "epoch": 8.0,
741
+ "eval_loss": 0.0006328822346404195,
742
+ "eval_runtime": 27.661,
743
+ "eval_samples_per_second": 15.365,
744
+ "eval_steps_per_second": 0.506,
745
+ "step": 112
 
 
 
 
 
 
746
  },
747
  {
748
  "epoch": 8.07,
749
  "learning_rate": 0.0002,
750
  "loss": 0.0007,
751
+ "step": 113
 
 
 
 
 
 
752
  },
753
  {
754
  "epoch": 8.14,
755
  "learning_rate": 0.0002,
756
+ "loss": 0.0002,
757
+ "step": 114
 
 
 
 
 
 
758
  },
759
  {
760
  "epoch": 8.21,
761
  "learning_rate": 0.0002,
762
+ "loss": 0.0002,
763
+ "step": 115
 
 
 
 
 
 
764
  },
765
  {
766
  "epoch": 8.29,
767
  "learning_rate": 0.0002,
768
+ "loss": 0.0002,
769
+ "step": 116
 
 
 
 
 
 
770
  },
771
  {
772
  "epoch": 8.36,
773
  "learning_rate": 0.0002,
774
+ "loss": 0.0005,
775
+ "step": 117
 
 
 
 
 
 
776
  },
777
  {
778
  "epoch": 8.43,
779
  "learning_rate": 0.0002,
780
+ "loss": 0.0002,
781
+ "step": 118
 
 
 
 
 
 
782
  },
783
  {
784
  "epoch": 8.5,
785
  "learning_rate": 0.0002,
786
+ "loss": 0.0007,
787
+ "step": 119
 
 
 
 
 
 
788
  },
789
  {
790
  "epoch": 8.57,
791
  "learning_rate": 0.0002,
792
+ "loss": 0.0002,
793
+ "step": 120
 
 
 
 
 
 
794
  },
795
  {
796
  "epoch": 8.64,
797
  "learning_rate": 0.0002,
798
+ "loss": 0.0025,
799
+ "step": 121
 
 
 
 
 
 
800
  },
801
  {
802
  "epoch": 8.71,
803
  "learning_rate": 0.0002,
804
+ "loss": 0.0008,
805
+ "step": 122
 
 
 
 
 
 
806
  },
807
  {
808
  "epoch": 8.79,
809
  "learning_rate": 0.0002,
810
+ "loss": 0.0012,
811
+ "step": 123
 
 
 
 
 
 
812
  },
813
  {
814
  "epoch": 8.86,
815
  "learning_rate": 0.0002,
816
+ "loss": 0.0002,
817
+ "step": 124
 
 
 
 
 
 
818
  },
819
  {
820
  "epoch": 8.93,
821
  "learning_rate": 0.0002,
822
+ "loss": 0.0003,
823
+ "step": 125
 
 
 
 
 
 
824
  },
825
  {
826
  "epoch": 9.0,
827
  "learning_rate": 0.0002,
828
+ "loss": 0.0002,
829
+ "step": 126
830
  },
831
  {
832
  "epoch": 9.0,
833
+ "eval_loss": 0.0004491883155424148,
834
+ "eval_runtime": 27.6221,
835
+ "eval_samples_per_second": 15.386,
836
+ "eval_steps_per_second": 0.507,
837
+ "step": 126
 
 
 
 
 
 
838
  },
839
  {
840
  "epoch": 9.07,
841
  "learning_rate": 0.0002,
 
 
 
 
 
 
842
  "loss": 0.0004,
843
+ "step": 127
844
  },
845
  {
846
  "epoch": 9.14,
847
  "learning_rate": 0.0002,
848
+ "loss": 0.0001,
849
+ "step": 128
 
 
 
 
 
 
850
  },
851
  {
852
  "epoch": 9.21,
853
  "learning_rate": 0.0002,
854
+ "loss": 0.0001,
855
+ "step": 129
 
 
 
 
 
 
856
  },
857
  {
858
  "epoch": 9.29,
859
  "learning_rate": 0.0002,
860
+ "loss": 0.0003,
861
+ "step": 130
 
 
 
 
 
 
862
  },
863
  {
864
  "epoch": 9.36,
865
  "learning_rate": 0.0002,
866
+ "loss": 0.0011,
867
+ "step": 131
 
 
 
 
 
 
868
  },
869
  {
870
  "epoch": 9.43,
871
  "learning_rate": 0.0002,
872
+ "loss": 0.0002,
873
+ "step": 132
 
 
 
 
 
 
874
  },
875
  {
876
  "epoch": 9.5,
877
  "learning_rate": 0.0002,
878
+ "loss": 0.0002,
879
+ "step": 133
 
 
 
 
 
 
880
  },
881
  {
882
  "epoch": 9.57,
883
  "learning_rate": 0.0002,
884
+ "loss": 0.0007,
885
+ "step": 134
 
 
 
 
 
 
886
  },
887
  {
888
  "epoch": 9.64,
889
  "learning_rate": 0.0002,
890
+ "loss": 0.0022,
891
+ "step": 135
 
 
 
 
 
 
892
  },
893
  {
894
  "epoch": 9.71,
895
  "learning_rate": 0.0002,
896
+ "loss": 0.0016,
897
+ "step": 136
 
 
 
 
 
 
898
  },
899
  {
900
  "epoch": 9.79,
901
  "learning_rate": 0.0002,
902
+ "loss": 0.0001,
903
+ "step": 137
 
 
 
 
 
 
904
  },
905
  {
906
  "epoch": 9.86,
907
  "learning_rate": 0.0002,
908
+ "loss": 0.0001,
909
+ "step": 138
 
 
 
 
 
 
910
  },
911
  {
912
  "epoch": 9.93,
913
  "learning_rate": 0.0002,
914
+ "loss": 0.0003,
915
+ "step": 139
 
 
 
 
 
 
916
  },
917
  {
918
  "epoch": 10.0,
919
  "learning_rate": 0.0002,
920
+ "loss": 0.0005,
921
+ "step": 140
922
  },
923
  {
924
  "epoch": 10.0,
925
+ "eval_loss": 0.00029718142468482256,
926
+ "eval_runtime": 27.6542,
927
+ "eval_samples_per_second": 15.368,
928
+ "eval_steps_per_second": 0.506,
929
+ "step": 140
930
  },
931
  {
932
  "epoch": 10.0,
933
+ "step": 140,
934
+ "total_flos": 4.2897933864638874e+17,
935
+ "train_loss": 0.11377025729951648,
936
+ "train_runtime": 2129.3536,
937
+ "train_samples_per_second": 1.996,
938
+ "train_steps_per_second": 0.066
939
  }
940
  ],
941
  "logging_steps": 1.0,
942
+ "max_steps": 140,
943
  "num_input_tokens_seen": 0,
944
  "num_train_epochs": 10,
945
  "save_steps": 50000,
946
+ "total_flos": 4.2897933864638874e+17,
947
  "train_batch_size": 4,
948
  "trial_name": null,
949
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ab60061c6db4983ad6a9334f47864cfc12b2212c4796c83a0d247a55439133a6
3
  size 6840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa90b394534132b7b403551766766cdfe27216265cc6d0e380d159dcb58549e3
3
  size 6840