ccore commited on
Commit
86f875a
1 Parent(s): 79fdfb7

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- base_model: ./core2
3
  tags:
4
  - generated_from_trainer
5
  metrics:
@@ -14,10 +14,10 @@ should probably proofread and complete it, then remove this comment. -->
14
 
15
  # core2
16
 
17
- This model is a fine-tuned version of [./core2](https://huggingface.co/./core2) on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
- - Loss: 2.7997
20
- - Accuracy: 0.4028
21
 
22
  ## Model description
23
 
@@ -36,12 +36,12 @@ More information needed
36
  ### Training hyperparameters
37
 
38
  The following hyperparameters were used during training:
39
- - learning_rate: 0.0001
40
  - train_batch_size: 1
41
  - eval_batch_size: 8
42
  - seed: 42
43
- - gradient_accumulation_steps: 64
44
- - total_train_batch_size: 64
45
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
46
  - lr_scheduler_type: linear
47
  - num_epochs: 1.0
 
1
  ---
2
+ base_model: ./core2/
3
  tags:
4
  - generated_from_trainer
5
  metrics:
 
14
 
15
  # core2
16
 
17
+ This model is a fine-tuned version of [./core2/](https://huggingface.co/./core2/) on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
+ - Loss: 2.5534
20
+ - Accuracy: 0.4330
21
 
22
  ## Model description
23
 
 
36
  ### Training hyperparameters
37
 
38
  The following hyperparameters were used during training:
39
+ - learning_rate: 0.001
40
  - train_batch_size: 1
41
  - eval_batch_size: 8
42
  - seed: 42
43
+ - gradient_accumulation_steps: 8
44
+ - total_train_batch_size: 8
45
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
46
  - lr_scheduler_type: linear
47
  - num_epochs: 1.0
all_results.json CHANGED
@@ -1,15 +1,15 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_accuracy": 0.40275220320231575,
4
- "eval_loss": 2.7997472286224365,
5
- "eval_runtime": 1.756,
6
  "eval_samples": 129,
7
- "eval_samples_per_second": 73.461,
8
- "eval_steps_per_second": 9.681,
9
- "perplexity": 16.44049056038686,
10
- "train_loss": 2.7764307147395,
11
- "train_runtime": 11751.6473,
12
- "train_samples": 455985,
13
- "train_samples_per_second": 38.802,
14
- "train_steps_per_second": 0.606
15
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_accuracy": 0.43297187933346976,
4
+ "eval_loss": 2.553417444229126,
5
+ "eval_runtime": 1.7282,
6
  "eval_samples": 129,
7
+ "eval_samples_per_second": 74.646,
8
+ "eval_steps_per_second": 9.837,
9
+ "perplexity": 12.850946217445413,
10
+ "train_loss": 3.3343640817306177,
11
+ "train_runtime": 1458.5581,
12
+ "train_samples": 51585,
13
+ "train_samples_per_second": 35.367,
14
+ "train_steps_per_second": 4.421
15
  }
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "./core2",
3
  "architectures": [
4
  "LlamaForCausalLM"
5
  ],
 
1
  {
2
+ "_name_or_path": "./core2/",
3
  "architectures": [
4
  "LlamaForCausalLM"
5
  ],
eval_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_accuracy": 0.40275220320231575,
4
- "eval_loss": 2.7997472286224365,
5
- "eval_runtime": 1.756,
6
  "eval_samples": 129,
7
- "eval_samples_per_second": 73.461,
8
- "eval_steps_per_second": 9.681,
9
- "perplexity": 16.44049056038686
10
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_accuracy": 0.43297187933346976,
4
+ "eval_loss": 2.553417444229126,
5
+ "eval_runtime": 1.7282,
6
  "eval_samples": 129,
7
+ "eval_samples_per_second": 74.646,
8
+ "eval_steps_per_second": 9.837,
9
+ "perplexity": 12.850946217445413
10
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:32cd03779b9057848bc65624f196a82bb6be278b4cda59ee3d6b7e610feb3991
3
  size 929067029
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6dd324b6cf685a7b5f40c8fbea96175df25641cafbbe0135d2c0bd3da3ac73e3
3
  size 929067029
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 2.7764307147395,
4
- "train_runtime": 11751.6473,
5
- "train_samples": 455985,
6
- "train_samples_per_second": 38.802,
7
- "train_steps_per_second": 0.606
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 3.3343640817306177,
4
+ "train_runtime": 1458.5581,
5
+ "train_samples": 51585,
6
+ "train_samples_per_second": 35.367,
7
+ "train_steps_per_second": 4.421
8
  }
trainer_state.json CHANGED
@@ -1,880 +1,796 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9998925403247914,
5
  "eval_steps": 500,
6
- "global_step": 7124,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.01,
13
- "learning_rate": 9.92981471083661e-05,
14
- "loss": 2.9958,
15
  "step": 50
16
  },
17
  {
18
- "epoch": 0.01,
19
- "learning_rate": 9.859629421673217e-05,
20
- "loss": 2.9669,
21
  "step": 100
22
  },
23
  {
24
  "epoch": 0.02,
25
- "learning_rate": 9.789444132509826e-05,
26
- "loss": 2.958,
27
  "step": 150
28
  },
29
  {
30
  "epoch": 0.03,
31
- "learning_rate": 9.719258843346435e-05,
32
- "loss": 2.9459,
33
  "step": 200
34
  },
35
  {
36
  "epoch": 0.04,
37
- "learning_rate": 9.649073554183044e-05,
38
- "loss": 2.9461,
39
  "step": 250
40
  },
41
  {
42
- "epoch": 0.04,
43
- "learning_rate": 9.578888265019651e-05,
44
- "loss": 2.9109,
45
  "step": 300
46
  },
47
  {
48
  "epoch": 0.05,
49
- "learning_rate": 9.50870297585626e-05,
50
- "loss": 2.9316,
51
  "step": 350
52
  },
53
  {
54
  "epoch": 0.06,
55
- "learning_rate": 9.43851768669287e-05,
56
- "loss": 2.8693,
57
  "step": 400
58
  },
59
  {
60
- "epoch": 0.06,
61
- "learning_rate": 9.368332397529478e-05,
62
- "loss": 2.8988,
63
  "step": 450
64
  },
65
  {
66
- "epoch": 0.07,
67
- "learning_rate": 9.298147108366086e-05,
68
- "loss": 2.8858,
69
  "step": 500
70
  },
71
  {
72
- "epoch": 0.08,
73
- "learning_rate": 9.227961819202695e-05,
74
- "loss": 2.8717,
75
  "step": 550
76
  },
77
  {
78
- "epoch": 0.08,
79
- "learning_rate": 9.157776530039304e-05,
80
- "loss": 2.8852,
81
  "step": 600
82
  },
83
  {
84
- "epoch": 0.09,
85
- "learning_rate": 9.087591240875913e-05,
86
- "loss": 2.8702,
87
  "step": 650
88
  },
89
  {
90
- "epoch": 0.1,
91
- "learning_rate": 9.017405951712522e-05,
92
- "loss": 2.8506,
93
  "step": 700
94
  },
95
  {
96
- "epoch": 0.11,
97
- "learning_rate": 8.947220662549131e-05,
98
- "loss": 2.8793,
99
  "step": 750
100
  },
101
  {
102
- "epoch": 0.11,
103
- "learning_rate": 8.87703537338574e-05,
104
- "loss": 2.857,
105
  "step": 800
106
  },
107
  {
108
- "epoch": 0.12,
109
- "learning_rate": 8.806850084222347e-05,
110
- "loss": 2.8538,
111
  "step": 850
112
  },
113
  {
114
- "epoch": 0.13,
115
- "learning_rate": 8.736664795058956e-05,
116
- "loss": 2.8592,
117
  "step": 900
118
  },
119
  {
120
- "epoch": 0.13,
121
- "learning_rate": 8.666479505895565e-05,
122
- "loss": 2.859,
123
  "step": 950
124
  },
125
  {
126
- "epoch": 0.14,
127
- "learning_rate": 8.596294216732174e-05,
128
- "loss": 2.8183,
129
  "step": 1000
130
  },
131
  {
132
- "epoch": 0.15,
133
- "learning_rate": 8.526108927568782e-05,
134
- "loss": 2.8456,
135
  "step": 1050
136
  },
137
  {
138
- "epoch": 0.15,
139
- "learning_rate": 8.455923638405391e-05,
140
- "loss": 2.803,
141
  "step": 1100
142
  },
143
  {
144
- "epoch": 0.16,
145
- "learning_rate": 8.385738349242e-05,
146
- "loss": 2.8414,
147
  "step": 1150
148
  },
149
  {
150
- "epoch": 0.17,
151
- "learning_rate": 8.315553060078609e-05,
152
- "loss": 2.8162,
153
  "step": 1200
154
  },
155
  {
156
- "epoch": 0.18,
157
- "learning_rate": 8.245367770915216e-05,
158
- "loss": 2.8404,
159
  "step": 1250
160
  },
161
  {
162
- "epoch": 0.18,
163
- "learning_rate": 8.175182481751825e-05,
164
- "loss": 2.801,
165
  "step": 1300
166
  },
167
  {
168
- "epoch": 0.19,
169
- "learning_rate": 8.104997192588434e-05,
170
- "loss": 2.818,
171
  "step": 1350
172
  },
173
  {
174
- "epoch": 0.2,
175
- "learning_rate": 8.034811903425043e-05,
176
- "loss": 2.8279,
177
  "step": 1400
178
  },
179
  {
180
- "epoch": 0.2,
181
- "learning_rate": 7.96462661426165e-05,
182
- "loss": 2.8261,
183
  "step": 1450
184
  },
185
  {
186
- "epoch": 0.21,
187
- "learning_rate": 7.89444132509826e-05,
188
- "loss": 2.7759,
189
  "step": 1500
190
  },
191
  {
192
- "epoch": 0.22,
193
- "learning_rate": 7.824256035934869e-05,
194
- "loss": 2.7991,
195
  "step": 1550
196
  },
197
  {
198
- "epoch": 0.22,
199
- "learning_rate": 7.754070746771476e-05,
200
- "loss": 2.7823,
201
  "step": 1600
202
  },
203
  {
204
- "epoch": 0.23,
205
- "learning_rate": 7.683885457608085e-05,
206
- "loss": 2.7843,
207
  "step": 1650
208
  },
209
  {
210
- "epoch": 0.24,
211
- "learning_rate": 7.613700168444694e-05,
212
- "loss": 2.7736,
213
  "step": 1700
214
  },
215
  {
216
- "epoch": 0.25,
217
- "learning_rate": 7.543514879281303e-05,
218
- "loss": 2.8135,
219
  "step": 1750
220
  },
221
  {
222
- "epoch": 0.25,
223
- "learning_rate": 7.47332959011791e-05,
224
- "loss": 2.7873,
225
  "step": 1800
226
  },
227
  {
228
- "epoch": 0.26,
229
- "learning_rate": 7.40314430095452e-05,
230
- "loss": 2.777,
231
  "step": 1850
232
  },
233
  {
234
- "epoch": 0.27,
235
- "learning_rate": 7.332959011791129e-05,
236
- "loss": 2.7542,
237
  "step": 1900
238
  },
239
  {
240
- "epoch": 0.27,
241
- "learning_rate": 7.262773722627737e-05,
242
- "loss": 2.7759,
243
  "step": 1950
244
  },
245
  {
246
- "epoch": 0.28,
247
- "learning_rate": 7.192588433464346e-05,
248
- "loss": 2.7772,
249
  "step": 2000
250
  },
251
  {
252
- "epoch": 0.29,
253
- "learning_rate": 7.122403144300955e-05,
254
- "loss": 2.7689,
255
  "step": 2050
256
  },
257
  {
258
- "epoch": 0.29,
259
- "learning_rate": 7.052217855137564e-05,
260
- "loss": 2.8015,
261
  "step": 2100
262
  },
263
  {
264
- "epoch": 0.3,
265
- "learning_rate": 6.982032565974172e-05,
266
- "loss": 2.7655,
267
  "step": 2150
268
  },
269
  {
270
- "epoch": 0.31,
271
- "learning_rate": 6.911847276810781e-05,
272
- "loss": 2.7728,
273
  "step": 2200
274
  },
275
  {
276
- "epoch": 0.32,
277
- "learning_rate": 6.84166198764739e-05,
278
- "loss": 2.7686,
279
  "step": 2250
280
  },
281
  {
282
- "epoch": 0.32,
283
- "learning_rate": 6.771476698483999e-05,
284
- "loss": 2.7651,
285
  "step": 2300
286
  },
287
  {
288
- "epoch": 0.33,
289
- "learning_rate": 6.701291409320606e-05,
290
- "loss": 2.7594,
291
  "step": 2350
292
  },
293
  {
294
- "epoch": 0.34,
295
- "learning_rate": 6.631106120157215e-05,
296
- "loss": 2.8137,
297
  "step": 2400
298
  },
299
  {
300
- "epoch": 0.34,
301
- "learning_rate": 6.560920830993824e-05,
302
- "loss": 2.7566,
303
  "step": 2450
304
  },
305
  {
306
- "epoch": 0.35,
307
- "learning_rate": 6.490735541830433e-05,
308
- "loss": 2.7705,
309
  "step": 2500
310
  },
311
  {
312
- "epoch": 0.36,
313
- "learning_rate": 6.420550252667041e-05,
314
- "loss": 2.7873,
315
  "step": 2550
316
  },
317
  {
318
- "epoch": 0.36,
319
- "learning_rate": 6.35036496350365e-05,
320
- "loss": 2.7841,
321
  "step": 2600
322
  },
323
  {
324
- "epoch": 0.37,
325
- "learning_rate": 6.280179674340259e-05,
326
- "loss": 2.7643,
327
  "step": 2650
328
  },
329
  {
330
- "epoch": 0.38,
331
- "learning_rate": 6.209994385176868e-05,
332
- "loss": 2.7838,
333
  "step": 2700
334
  },
335
  {
336
- "epoch": 0.39,
337
- "learning_rate": 6.139809096013475e-05,
338
- "loss": 2.7849,
339
  "step": 2750
340
  },
341
  {
342
- "epoch": 0.39,
343
- "learning_rate": 6.069623806850084e-05,
344
- "loss": 2.789,
345
  "step": 2800
346
  },
347
  {
348
- "epoch": 0.4,
349
- "learning_rate": 5.999438517686693e-05,
350
- "loss": 2.7397,
351
  "step": 2850
352
  },
353
  {
354
- "epoch": 0.41,
355
- "learning_rate": 5.9292532285233015e-05,
356
- "loss": 2.7426,
357
  "step": 2900
358
  },
359
  {
360
- "epoch": 0.41,
361
- "learning_rate": 5.8590679393599104e-05,
362
- "loss": 2.7395,
363
  "step": 2950
364
  },
365
  {
366
- "epoch": 0.42,
367
- "learning_rate": 5.788882650196519e-05,
368
- "loss": 2.7462,
369
  "step": 3000
370
  },
371
  {
372
- "epoch": 0.43,
373
- "learning_rate": 5.718697361033128e-05,
374
- "loss": 2.7756,
375
  "step": 3050
376
  },
377
  {
378
- "epoch": 0.44,
379
- "learning_rate": 5.648512071869736e-05,
380
- "loss": 2.7218,
381
  "step": 3100
382
  },
383
  {
384
- "epoch": 0.44,
385
- "learning_rate": 5.578326782706345e-05,
386
- "loss": 2.7712,
387
  "step": 3150
388
  },
389
  {
390
- "epoch": 0.45,
391
- "learning_rate": 5.508141493542953e-05,
392
- "loss": 2.768,
393
  "step": 3200
394
  },
395
  {
396
- "epoch": 0.46,
397
- "learning_rate": 5.4379562043795615e-05,
398
- "loss": 2.7623,
399
  "step": 3250
400
  },
401
  {
402
- "epoch": 0.46,
403
- "learning_rate": 5.367770915216172e-05,
404
- "loss": 2.7365,
405
  "step": 3300
406
  },
407
  {
408
- "epoch": 0.47,
409
- "learning_rate": 5.29758562605278e-05,
410
- "loss": 2.7875,
411
  "step": 3350
412
  },
413
  {
414
- "epoch": 0.48,
415
- "learning_rate": 5.227400336889389e-05,
416
- "loss": 2.6979,
417
  "step": 3400
418
  },
419
  {
420
- "epoch": 0.48,
421
- "learning_rate": 5.157215047725997e-05,
422
- "loss": 2.7315,
423
  "step": 3450
424
  },
425
  {
426
- "epoch": 0.49,
427
- "learning_rate": 5.087029758562606e-05,
428
- "loss": 2.7189,
429
  "step": 3500
430
  },
431
  {
432
- "epoch": 0.5,
433
- "learning_rate": 5.0168444693992145e-05,
434
- "loss": 2.7283,
435
  "step": 3550
436
  },
437
  {
438
- "epoch": 0.51,
439
- "learning_rate": 4.946659180235823e-05,
440
- "loss": 2.7685,
441
  "step": 3600
442
  },
443
  {
444
- "epoch": 0.51,
445
- "learning_rate": 4.876473891072432e-05,
446
- "loss": 2.7724,
447
  "step": 3650
448
  },
449
  {
450
- "epoch": 0.52,
451
- "learning_rate": 4.80628860190904e-05,
452
- "loss": 2.773,
453
  "step": 3700
454
  },
455
  {
456
- "epoch": 0.53,
457
- "learning_rate": 4.736103312745649e-05,
458
- "loss": 2.7564,
459
  "step": 3750
460
  },
461
  {
462
- "epoch": 0.53,
463
- "learning_rate": 4.665918023582257e-05,
464
- "loss": 2.769,
465
  "step": 3800
466
  },
467
  {
468
- "epoch": 0.54,
469
- "learning_rate": 4.595732734418866e-05,
470
- "loss": 2.7544,
471
  "step": 3850
472
  },
473
  {
474
- "epoch": 0.55,
475
- "learning_rate": 4.5255474452554745e-05,
476
- "loss": 2.742,
477
  "step": 3900
478
  },
479
  {
480
- "epoch": 0.55,
481
- "learning_rate": 4.4553621560920834e-05,
482
- "loss": 2.7558,
483
  "step": 3950
484
  },
485
  {
486
- "epoch": 0.56,
487
- "learning_rate": 4.385176866928692e-05,
488
- "loss": 2.752,
489
  "step": 4000
490
  },
491
  {
492
- "epoch": 0.57,
493
- "learning_rate": 4.3149915777653006e-05,
494
- "loss": 2.7577,
495
  "step": 4050
496
  },
497
  {
498
- "epoch": 0.58,
499
- "learning_rate": 4.244806288601909e-05,
500
- "loss": 2.7248,
501
  "step": 4100
502
  },
503
  {
504
- "epoch": 0.58,
505
- "learning_rate": 4.174620999438518e-05,
506
- "loss": 2.7314,
507
  "step": 4150
508
  },
509
  {
510
- "epoch": 0.59,
511
- "learning_rate": 4.104435710275126e-05,
512
- "loss": 2.7308,
513
  "step": 4200
514
  },
515
  {
516
- "epoch": 0.6,
517
- "learning_rate": 4.034250421111735e-05,
518
- "loss": 2.7252,
519
  "step": 4250
520
  },
521
  {
522
- "epoch": 0.6,
523
- "learning_rate": 3.964065131948344e-05,
524
- "loss": 2.7444,
525
  "step": 4300
526
  },
527
  {
528
- "epoch": 0.61,
529
- "learning_rate": 3.893879842784952e-05,
530
- "loss": 2.7529,
531
  "step": 4350
532
  },
533
  {
534
- "epoch": 0.62,
535
- "learning_rate": 3.823694553621561e-05,
536
- "loss": 2.7312,
537
  "step": 4400
538
  },
539
  {
540
- "epoch": 0.62,
541
- "learning_rate": 3.7535092644581696e-05,
542
- "loss": 2.7471,
543
  "step": 4450
544
  },
545
  {
546
- "epoch": 0.63,
547
- "learning_rate": 3.6833239752947785e-05,
548
- "loss": 2.7289,
549
  "step": 4500
550
  },
551
  {
552
- "epoch": 0.64,
553
- "learning_rate": 3.613138686131387e-05,
554
- "loss": 2.7667,
555
  "step": 4550
556
  },
557
  {
558
- "epoch": 0.65,
559
- "learning_rate": 3.542953396967996e-05,
560
- "loss": 2.7553,
561
  "step": 4600
562
  },
563
  {
564
- "epoch": 0.65,
565
- "learning_rate": 3.472768107804604e-05,
566
- "loss": 2.7422,
567
  "step": 4650
568
  },
569
  {
570
- "epoch": 0.66,
571
- "learning_rate": 3.402582818641213e-05,
572
- "loss": 2.7328,
573
  "step": 4700
574
  },
575
  {
576
- "epoch": 0.67,
577
- "learning_rate": 3.332397529477821e-05,
578
- "loss": 2.7449,
579
  "step": 4750
580
  },
581
  {
582
- "epoch": 0.67,
583
- "learning_rate": 3.26221224031443e-05,
584
- "loss": 2.7332,
585
  "step": 4800
586
  },
587
  {
588
- "epoch": 0.68,
589
- "learning_rate": 3.1920269511510385e-05,
590
- "loss": 2.7529,
591
  "step": 4850
592
  },
593
  {
594
- "epoch": 0.69,
595
- "learning_rate": 3.1218416619876474e-05,
596
- "loss": 2.7493,
597
  "step": 4900
598
  },
599
  {
600
- "epoch": 0.69,
601
- "learning_rate": 3.0516563728242564e-05,
602
- "loss": 2.7525,
603
  "step": 4950
604
  },
605
  {
606
- "epoch": 0.7,
607
- "learning_rate": 2.981471083660865e-05,
608
- "loss": 2.7367,
609
  "step": 5000
610
  },
611
  {
612
- "epoch": 0.71,
613
- "learning_rate": 2.9112857944974736e-05,
614
- "loss": 2.726,
615
  "step": 5050
616
  },
617
  {
618
- "epoch": 0.72,
619
- "learning_rate": 2.8411005053340822e-05,
620
- "loss": 2.7327,
621
  "step": 5100
622
  },
623
  {
624
- "epoch": 0.72,
625
- "learning_rate": 2.770915216170691e-05,
626
- "loss": 2.7545,
627
  "step": 5150
628
  },
629
  {
630
- "epoch": 0.73,
631
- "learning_rate": 2.7007299270072995e-05,
632
- "loss": 2.7682,
633
  "step": 5200
634
  },
635
  {
636
- "epoch": 0.74,
637
- "learning_rate": 2.630544637843908e-05,
638
- "loss": 2.7486,
639
  "step": 5250
640
  },
641
  {
642
- "epoch": 0.74,
643
- "learning_rate": 2.5603593486805167e-05,
644
- "loss": 2.7341,
645
  "step": 5300
646
  },
647
  {
648
- "epoch": 0.75,
649
- "learning_rate": 2.4901740595171253e-05,
650
- "loss": 2.7423,
651
  "step": 5350
652
  },
653
  {
654
- "epoch": 0.76,
655
- "learning_rate": 2.419988770353734e-05,
656
- "loss": 2.743,
657
  "step": 5400
658
  },
659
  {
660
- "epoch": 0.76,
661
- "learning_rate": 2.349803481190343e-05,
662
- "loss": 2.7319,
663
  "step": 5450
664
  },
665
- {
666
- "epoch": 0.77,
667
- "learning_rate": 2.279618192026951e-05,
668
- "loss": 2.7555,
669
- "step": 5500
670
- },
671
- {
672
- "epoch": 0.78,
673
- "learning_rate": 2.2094329028635598e-05,
674
- "loss": 2.7349,
675
- "step": 5550
676
- },
677
- {
678
- "epoch": 0.79,
679
- "learning_rate": 2.1392476137001684e-05,
680
- "loss": 2.7447,
681
- "step": 5600
682
- },
683
- {
684
- "epoch": 0.79,
685
- "learning_rate": 2.069062324536777e-05,
686
- "loss": 2.7325,
687
- "step": 5650
688
- },
689
- {
690
- "epoch": 0.8,
691
- "learning_rate": 1.9988770353733856e-05,
692
- "loss": 2.7334,
693
- "step": 5700
694
- },
695
- {
696
- "epoch": 0.81,
697
- "learning_rate": 1.9286917462099946e-05,
698
- "loss": 2.7493,
699
- "step": 5750
700
- },
701
- {
702
- "epoch": 0.81,
703
- "learning_rate": 1.8585064570466032e-05,
704
- "loss": 2.7298,
705
- "step": 5800
706
- },
707
- {
708
- "epoch": 0.82,
709
- "learning_rate": 1.7883211678832118e-05,
710
- "loss": 2.7208,
711
- "step": 5850
712
- },
713
- {
714
- "epoch": 0.83,
715
- "learning_rate": 1.7181358787198204e-05,
716
- "loss": 2.7383,
717
- "step": 5900
718
- },
719
- {
720
- "epoch": 0.84,
721
- "learning_rate": 1.647950589556429e-05,
722
- "loss": 2.714,
723
- "step": 5950
724
- },
725
- {
726
- "epoch": 0.84,
727
- "learning_rate": 1.5777653003930376e-05,
728
- "loss": 2.7646,
729
- "step": 6000
730
- },
731
  {
732
  "epoch": 0.85,
733
- "learning_rate": 1.5075800112296463e-05,
734
- "loss": 2.7601,
735
- "step": 6050
736
- },
737
- {
738
- "epoch": 0.86,
739
- "learning_rate": 1.437394722066255e-05,
740
- "loss": 2.7473,
741
- "step": 6100
742
  },
743
  {
744
  "epoch": 0.86,
745
- "learning_rate": 1.3672094329028636e-05,
746
- "loss": 2.7423,
747
- "step": 6150
748
  },
749
  {
750
  "epoch": 0.87,
751
- "learning_rate": 1.2970241437394723e-05,
752
- "loss": 2.7147,
753
- "step": 6200
754
  },
755
  {
756
  "epoch": 0.88,
757
- "learning_rate": 1.2268388545760809e-05,
758
- "loss": 2.7426,
759
- "step": 6250
760
  },
761
  {
762
  "epoch": 0.88,
763
- "learning_rate": 1.1566535654126895e-05,
764
- "loss": 2.7496,
765
- "step": 6300
766
  },
767
  {
768
  "epoch": 0.89,
769
- "learning_rate": 1.0864682762492983e-05,
770
- "loss": 2.7438,
771
- "step": 6350
772
  },
773
  {
774
  "epoch": 0.9,
775
- "learning_rate": 1.0162829870859069e-05,
776
- "loss": 2.7441,
777
- "step": 6400
778
  },
779
  {
780
  "epoch": 0.91,
781
- "learning_rate": 9.460976979225155e-06,
782
- "loss": 2.7426,
783
- "step": 6450
784
  },
785
  {
786
  "epoch": 0.91,
787
- "learning_rate": 8.759124087591241e-06,
788
- "loss": 2.7334,
789
- "step": 6500
790
  },
791
  {
792
  "epoch": 0.92,
793
- "learning_rate": 8.057271195957327e-06,
794
- "loss": 2.7512,
795
- "step": 6550
796
- },
797
- {
798
- "epoch": 0.93,
799
- "learning_rate": 7.3554183043234135e-06,
800
- "loss": 2.7391,
801
- "step": 6600
802
  },
803
  {
804
  "epoch": 0.93,
805
- "learning_rate": 6.653565412689501e-06,
806
- "loss": 2.7737,
807
- "step": 6650
808
  },
809
  {
810
  "epoch": 0.94,
811
- "learning_rate": 5.9517125210555875e-06,
812
- "loss": 2.7009,
813
- "step": 6700
814
  },
815
  {
816
  "epoch": 0.95,
817
- "learning_rate": 5.249859629421673e-06,
818
- "loss": 2.7149,
819
- "step": 6750
820
  },
821
  {
822
  "epoch": 0.95,
823
- "learning_rate": 4.54800673778776e-06,
824
- "loss": 2.7317,
825
- "step": 6800
826
  },
827
  {
828
  "epoch": 0.96,
829
- "learning_rate": 3.846153846153847e-06,
830
- "loss": 2.7297,
831
- "step": 6850
832
  },
833
  {
834
  "epoch": 0.97,
835
- "learning_rate": 3.1443009545199325e-06,
836
- "loss": 2.7356,
837
- "step": 6900
838
  },
839
  {
840
  "epoch": 0.98,
841
- "learning_rate": 2.442448062886019e-06,
842
- "loss": 2.7414,
843
- "step": 6950
844
  },
845
  {
846
  "epoch": 0.98,
847
- "learning_rate": 1.7405951712521057e-06,
848
- "loss": 2.7362,
849
- "step": 7000
850
  },
851
  {
852
  "epoch": 0.99,
853
- "learning_rate": 1.0387422796181922e-06,
854
- "loss": 2.7303,
855
- "step": 7050
856
- },
857
- {
858
- "epoch": 1.0,
859
- "learning_rate": 3.3688938798427854e-07,
860
- "loss": 2.7418,
861
- "step": 7100
862
  },
863
  {
864
  "epoch": 1.0,
865
- "step": 7124,
866
- "total_flos": 9.341102317413335e+17,
867
- "train_loss": 2.7764307147395,
868
- "train_runtime": 11751.6473,
869
- "train_samples_per_second": 38.802,
870
- "train_steps_per_second": 0.606
871
  }
872
  ],
873
  "logging_steps": 50,
874
- "max_steps": 7124,
875
  "num_train_epochs": 1,
876
- "save_steps": 2000,
877
- "total_flos": 9.341102317413335e+17,
878
  "trial_name": null,
879
  "trial_params": null
880
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9999806145197248,
5
  "eval_steps": 500,
6
+ "global_step": 6448,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.01,
13
+ "learning_rate": 0.0009922456575682382,
14
+ "loss": 4.0749,
15
  "step": 50
16
  },
17
  {
18
+ "epoch": 0.02,
19
+ "learning_rate": 0.0009844913151364765,
20
+ "loss": 3.9181,
21
  "step": 100
22
  },
23
  {
24
  "epoch": 0.02,
25
+ "learning_rate": 0.0009767369727047147,
26
+ "loss": 3.8669,
27
  "step": 150
28
  },
29
  {
30
  "epoch": 0.03,
31
+ "learning_rate": 0.0009689826302729528,
32
+ "loss": 3.8069,
33
  "step": 200
34
  },
35
  {
36
  "epoch": 0.04,
37
+ "learning_rate": 0.000961228287841191,
38
+ "loss": 3.7749,
39
  "step": 250
40
  },
41
  {
42
+ "epoch": 0.05,
43
+ "learning_rate": 0.0009534739454094294,
44
+ "loss": 3.7757,
45
  "step": 300
46
  },
47
  {
48
  "epoch": 0.05,
49
+ "learning_rate": 0.0009457196029776675,
50
+ "loss": 3.7511,
51
  "step": 350
52
  },
53
  {
54
  "epoch": 0.06,
55
+ "learning_rate": 0.0009379652605459057,
56
+ "loss": 3.684,
57
  "step": 400
58
  },
59
  {
60
+ "epoch": 0.07,
61
+ "learning_rate": 0.000930210918114144,
62
+ "loss": 3.6771,
63
  "step": 450
64
  },
65
  {
66
+ "epoch": 0.08,
67
+ "learning_rate": 0.0009224565756823822,
68
+ "loss": 3.6402,
69
  "step": 500
70
  },
71
  {
72
+ "epoch": 0.09,
73
+ "learning_rate": 0.0009147022332506204,
74
+ "loss": 3.6889,
75
  "step": 550
76
  },
77
  {
78
+ "epoch": 0.09,
79
+ "learning_rate": 0.0009069478908188585,
80
+ "loss": 3.6868,
81
  "step": 600
82
  },
83
  {
84
+ "epoch": 0.1,
85
+ "learning_rate": 0.0008991935483870968,
86
+ "loss": 3.6779,
87
  "step": 650
88
  },
89
  {
90
+ "epoch": 0.11,
91
+ "learning_rate": 0.000891439205955335,
92
+ "loss": 3.6314,
93
  "step": 700
94
  },
95
  {
96
+ "epoch": 0.12,
97
+ "learning_rate": 0.0008836848635235732,
98
+ "loss": 3.6376,
99
  "step": 750
100
  },
101
  {
102
+ "epoch": 0.12,
103
+ "learning_rate": 0.0008759305210918114,
104
+ "loss": 3.6291,
105
  "step": 800
106
  },
107
  {
108
+ "epoch": 0.13,
109
+ "learning_rate": 0.0008681761786600497,
110
+ "loss": 3.629,
111
  "step": 850
112
  },
113
  {
114
+ "epoch": 0.14,
115
+ "learning_rate": 0.0008604218362282879,
116
+ "loss": 3.5972,
117
  "step": 900
118
  },
119
  {
120
+ "epoch": 0.15,
121
+ "learning_rate": 0.0008526674937965261,
122
+ "loss": 3.6299,
123
  "step": 950
124
  },
125
  {
126
+ "epoch": 0.16,
127
+ "learning_rate": 0.0008449131513647643,
128
+ "loss": 3.551,
129
  "step": 1000
130
  },
131
  {
132
+ "epoch": 0.16,
133
+ "learning_rate": 0.0008371588089330025,
134
+ "loss": 3.5943,
135
  "step": 1050
136
  },
137
  {
138
+ "epoch": 0.17,
139
+ "learning_rate": 0.0008294044665012407,
140
+ "loss": 3.5458,
141
  "step": 1100
142
  },
143
  {
144
+ "epoch": 0.18,
145
+ "learning_rate": 0.0008216501240694789,
146
+ "loss": 3.581,
147
  "step": 1150
148
  },
149
  {
150
+ "epoch": 0.19,
151
+ "learning_rate": 0.0008138957816377171,
152
+ "loss": 3.542,
153
  "step": 1200
154
  },
155
  {
156
+ "epoch": 0.19,
157
+ "learning_rate": 0.0008061414392059554,
158
+ "loss": 3.5666,
159
  "step": 1250
160
  },
161
  {
162
+ "epoch": 0.2,
163
+ "learning_rate": 0.0007983870967741935,
164
+ "loss": 3.5265,
165
  "step": 1300
166
  },
167
  {
168
+ "epoch": 0.21,
169
+ "learning_rate": 0.0007906327543424317,
170
+ "loss": 3.5315,
171
  "step": 1350
172
  },
173
  {
174
+ "epoch": 0.22,
175
+ "learning_rate": 0.00078287841191067,
176
+ "loss": 3.4934,
177
  "step": 1400
178
  },
179
  {
180
+ "epoch": 0.22,
181
+ "learning_rate": 0.0007751240694789083,
182
+ "loss": 3.5086,
183
  "step": 1450
184
  },
185
  {
186
+ "epoch": 0.23,
187
+ "learning_rate": 0.0007673697270471465,
188
+ "loss": 3.5028,
189
  "step": 1500
190
  },
191
  {
192
+ "epoch": 0.24,
193
+ "learning_rate": 0.0007596153846153846,
194
+ "loss": 3.4822,
195
  "step": 1550
196
  },
197
  {
198
+ "epoch": 0.25,
199
+ "learning_rate": 0.0007518610421836228,
200
+ "loss": 3.4943,
201
  "step": 1600
202
  },
203
  {
204
+ "epoch": 0.26,
205
+ "learning_rate": 0.0007441066997518611,
206
+ "loss": 3.5014,
207
  "step": 1650
208
  },
209
  {
210
+ "epoch": 0.26,
211
+ "learning_rate": 0.0007363523573200993,
212
+ "loss": 3.4705,
213
  "step": 1700
214
  },
215
  {
216
+ "epoch": 0.27,
217
+ "learning_rate": 0.0007285980148883374,
218
+ "loss": 3.4899,
219
  "step": 1750
220
  },
221
  {
222
+ "epoch": 0.28,
223
+ "learning_rate": 0.0007208436724565756,
224
+ "loss": 3.4403,
225
  "step": 1800
226
  },
227
  {
228
+ "epoch": 0.29,
229
+ "learning_rate": 0.0007130893300248139,
230
+ "loss": 3.4123,
231
  "step": 1850
232
  },
233
  {
234
+ "epoch": 0.29,
235
+ "learning_rate": 0.0007053349875930521,
236
+ "loss": 3.4231,
237
  "step": 1900
238
  },
239
  {
240
+ "epoch": 0.3,
241
+ "learning_rate": 0.0006975806451612903,
242
+ "loss": 3.3957,
243
  "step": 1950
244
  },
245
  {
246
+ "epoch": 0.31,
247
+ "learning_rate": 0.0006898263027295286,
248
+ "loss": 3.3722,
249
  "step": 2000
250
  },
251
  {
252
+ "epoch": 0.32,
253
+ "learning_rate": 0.0006820719602977668,
254
+ "loss": 3.4255,
255
  "step": 2050
256
  },
257
  {
258
+ "epoch": 0.33,
259
+ "learning_rate": 0.000674317617866005,
260
+ "loss": 3.4004,
261
  "step": 2100
262
  },
263
  {
264
+ "epoch": 0.33,
265
+ "learning_rate": 0.0006665632754342432,
266
+ "loss": 3.3749,
267
  "step": 2150
268
  },
269
  {
270
+ "epoch": 0.34,
271
+ "learning_rate": 0.0006588089330024815,
272
+ "loss": 3.3498,
273
  "step": 2200
274
  },
275
  {
276
+ "epoch": 0.35,
277
+ "learning_rate": 0.0006510545905707196,
278
+ "loss": 3.4563,
279
  "step": 2250
280
  },
281
  {
282
+ "epoch": 0.36,
283
+ "learning_rate": 0.0006433002481389578,
284
+ "loss": 3.392,
285
  "step": 2300
286
  },
287
  {
288
+ "epoch": 0.36,
289
+ "learning_rate": 0.000635545905707196,
290
+ "loss": 3.3686,
291
  "step": 2350
292
  },
293
  {
294
+ "epoch": 0.37,
295
+ "learning_rate": 0.0006277915632754343,
296
+ "loss": 3.3739,
297
  "step": 2400
298
  },
299
  {
300
+ "epoch": 0.38,
301
+ "learning_rate": 0.0006200372208436724,
302
+ "loss": 3.3357,
303
  "step": 2450
304
  },
305
  {
306
+ "epoch": 0.39,
307
+ "learning_rate": 0.0006122828784119106,
308
+ "loss": 3.3859,
309
  "step": 2500
310
  },
311
  {
312
+ "epoch": 0.4,
313
+ "learning_rate": 0.0006045285359801489,
314
+ "loss": 3.3605,
315
  "step": 2550
316
  },
317
  {
318
+ "epoch": 0.4,
319
+ "learning_rate": 0.0005967741935483872,
320
+ "loss": 3.361,
321
  "step": 2600
322
  },
323
  {
324
+ "epoch": 0.41,
325
+ "learning_rate": 0.0005890198511166254,
326
+ "loss": 3.3729,
327
  "step": 2650
328
  },
329
  {
330
+ "epoch": 0.42,
331
+ "learning_rate": 0.0005812655086848635,
332
+ "loss": 3.3592,
333
  "step": 2700
334
  },
335
  {
336
+ "epoch": 0.43,
337
+ "learning_rate": 0.0005735111662531017,
338
+ "loss": 3.3576,
339
  "step": 2750
340
  },
341
  {
342
+ "epoch": 0.43,
343
+ "learning_rate": 0.00056575682382134,
344
+ "loss": 3.3464,
345
  "step": 2800
346
  },
347
  {
348
+ "epoch": 0.44,
349
+ "learning_rate": 0.0005580024813895782,
350
+ "loss": 3.3225,
351
  "step": 2850
352
  },
353
  {
354
+ "epoch": 0.45,
355
+ "learning_rate": 0.0005502481389578163,
356
+ "loss": 3.3228,
357
  "step": 2900
358
  },
359
  {
360
+ "epoch": 0.46,
361
+ "learning_rate": 0.0005424937965260545,
362
+ "loss": 3.3369,
363
  "step": 2950
364
  },
365
  {
366
+ "epoch": 0.47,
367
+ "learning_rate": 0.0005347394540942928,
368
+ "loss": 3.2912,
369
  "step": 3000
370
  },
371
  {
372
+ "epoch": 0.47,
373
+ "learning_rate": 0.000526985111662531,
374
+ "loss": 3.2722,
375
  "step": 3050
376
  },
377
  {
378
+ "epoch": 0.48,
379
+ "learning_rate": 0.0005192307692307693,
380
+ "loss": 3.2878,
381
  "step": 3100
382
  },
383
  {
384
+ "epoch": 0.49,
385
+ "learning_rate": 0.0005114764267990075,
386
+ "loss": 3.2558,
387
  "step": 3150
388
  },
389
  {
390
+ "epoch": 0.5,
391
+ "learning_rate": 0.0005037220843672457,
392
+ "loss": 3.3018,
393
  "step": 3200
394
  },
395
  {
396
+ "epoch": 0.5,
397
+ "learning_rate": 0.0004959677419354839,
398
+ "loss": 3.2785,
399
  "step": 3250
400
  },
401
  {
402
+ "epoch": 0.51,
403
+ "learning_rate": 0.00048821339950372213,
404
+ "loss": 3.2249,
405
  "step": 3300
406
  },
407
  {
408
+ "epoch": 0.52,
409
+ "learning_rate": 0.0004804590570719603,
410
+ "loss": 3.2703,
411
  "step": 3350
412
  },
413
  {
414
+ "epoch": 0.53,
415
+ "learning_rate": 0.00047270471464019853,
416
+ "loss": 3.2871,
417
  "step": 3400
418
  },
419
  {
420
+ "epoch": 0.54,
421
+ "learning_rate": 0.0004649503722084367,
422
+ "loss": 3.2357,
423
  "step": 3450
424
  },
425
  {
426
+ "epoch": 0.54,
427
+ "learning_rate": 0.000457196029776675,
428
+ "loss": 3.2428,
429
  "step": 3500
430
  },
431
  {
432
+ "epoch": 0.55,
433
+ "learning_rate": 0.00044944168734491316,
434
+ "loss": 3.2125,
435
  "step": 3550
436
  },
437
  {
438
+ "epoch": 0.56,
439
+ "learning_rate": 0.0004416873449131514,
440
+ "loss": 3.2338,
441
  "step": 3600
442
  },
443
  {
444
+ "epoch": 0.57,
445
+ "learning_rate": 0.00043393300248138956,
446
+ "loss": 3.288,
447
  "step": 3650
448
  },
449
  {
450
+ "epoch": 0.57,
451
+ "learning_rate": 0.0004261786600496278,
452
+ "loss": 3.2564,
453
  "step": 3700
454
  },
455
  {
456
+ "epoch": 0.58,
457
+ "learning_rate": 0.000418424317617866,
458
+ "loss": 3.1859,
459
  "step": 3750
460
  },
461
  {
462
+ "epoch": 0.59,
463
+ "learning_rate": 0.00041066997518610424,
464
+ "loss": 3.2368,
465
  "step": 3800
466
  },
467
  {
468
+ "epoch": 0.6,
469
+ "learning_rate": 0.00040291563275434247,
470
+ "loss": 3.2368,
471
  "step": 3850
472
  },
473
  {
474
+ "epoch": 0.6,
475
+ "learning_rate": 0.00039516129032258064,
476
+ "loss": 3.1754,
477
  "step": 3900
478
  },
479
  {
480
+ "epoch": 0.61,
481
+ "learning_rate": 0.00038740694789081887,
482
+ "loss": 3.2392,
483
  "step": 3950
484
  },
485
  {
486
+ "epoch": 0.62,
487
+ "learning_rate": 0.00037965260545905704,
488
+ "loss": 3.2382,
489
  "step": 4000
490
  },
491
  {
492
+ "epoch": 0.63,
493
+ "learning_rate": 0.00037189826302729527,
494
+ "loss": 3.2545,
495
  "step": 4050
496
  },
497
  {
498
+ "epoch": 0.64,
499
+ "learning_rate": 0.00036414392059553355,
500
+ "loss": 3.1993,
501
  "step": 4100
502
  },
503
  {
504
+ "epoch": 0.64,
505
+ "learning_rate": 0.0003563895781637717,
506
+ "loss": 3.1565,
507
  "step": 4150
508
  },
509
  {
510
+ "epoch": 0.65,
511
+ "learning_rate": 0.00034863523573200995,
512
+ "loss": 3.2112,
513
  "step": 4200
514
  },
515
  {
516
+ "epoch": 0.66,
517
+ "learning_rate": 0.0003408808933002481,
518
+ "loss": 3.1617,
519
  "step": 4250
520
  },
521
  {
522
+ "epoch": 0.67,
523
+ "learning_rate": 0.00033312655086848635,
524
+ "loss": 3.1602,
525
  "step": 4300
526
  },
527
  {
528
+ "epoch": 0.67,
529
+ "learning_rate": 0.0003253722084367246,
530
+ "loss": 3.1477,
531
  "step": 4350
532
  },
533
  {
534
+ "epoch": 0.68,
535
+ "learning_rate": 0.0003176178660049628,
536
+ "loss": 3.1583,
537
  "step": 4400
538
  },
539
  {
540
+ "epoch": 0.69,
541
+ "learning_rate": 0.00030986352357320103,
542
+ "loss": 3.207,
543
  "step": 4450
544
  },
545
  {
546
+ "epoch": 0.7,
547
+ "learning_rate": 0.0003021091811414392,
548
+ "loss": 3.1408,
549
  "step": 4500
550
  },
551
  {
552
+ "epoch": 0.71,
553
+ "learning_rate": 0.00029435483870967743,
554
+ "loss": 3.0965,
555
  "step": 4550
556
  },
557
  {
558
+ "epoch": 0.71,
559
+ "learning_rate": 0.0002866004962779156,
560
+ "loss": 3.154,
561
  "step": 4600
562
  },
563
  {
564
+ "epoch": 0.72,
565
+ "learning_rate": 0.0002788461538461539,
566
+ "loss": 3.1413,
567
  "step": 4650
568
  },
569
  {
570
+ "epoch": 0.73,
571
+ "learning_rate": 0.00027109181141439205,
572
+ "loss": 3.1373,
573
  "step": 4700
574
  },
575
  {
576
+ "epoch": 0.74,
577
+ "learning_rate": 0.0002633374689826303,
578
+ "loss": 3.1198,
579
  "step": 4750
580
  },
581
  {
582
+ "epoch": 0.74,
583
+ "learning_rate": 0.0002555831265508685,
584
+ "loss": 3.1047,
585
  "step": 4800
586
  },
587
  {
588
+ "epoch": 0.75,
589
+ "learning_rate": 0.0002478287841191067,
590
+ "loss": 3.1728,
591
  "step": 4850
592
  },
593
  {
594
+ "epoch": 0.76,
595
+ "learning_rate": 0.0002400744416873449,
596
+ "loss": 3.1262,
597
  "step": 4900
598
  },
599
  {
600
+ "epoch": 0.77,
601
+ "learning_rate": 0.00023232009925558313,
602
+ "loss": 3.1111,
603
  "step": 4950
604
  },
605
  {
606
+ "epoch": 0.78,
607
+ "learning_rate": 0.00022456575682382136,
608
+ "loss": 3.1426,
609
  "step": 5000
610
  },
611
  {
612
+ "epoch": 0.78,
613
+ "learning_rate": 0.00021681141439205956,
614
+ "loss": 3.1709,
615
  "step": 5050
616
  },
617
  {
618
+ "epoch": 0.79,
619
+ "learning_rate": 0.00020905707196029776,
620
+ "loss": 3.1666,
621
  "step": 5100
622
  },
623
  {
624
+ "epoch": 0.8,
625
+ "learning_rate": 0.000201302729528536,
626
+ "loss": 3.0971,
627
  "step": 5150
628
  },
629
  {
630
+ "epoch": 0.81,
631
+ "learning_rate": 0.0001935483870967742,
632
+ "loss": 3.1585,
633
  "step": 5200
634
  },
635
  {
636
+ "epoch": 0.81,
637
+ "learning_rate": 0.00018579404466501241,
638
+ "loss": 3.0784,
639
  "step": 5250
640
  },
641
  {
642
+ "epoch": 0.82,
643
+ "learning_rate": 0.00017803970223325061,
644
+ "loss": 3.1064,
645
  "step": 5300
646
  },
647
  {
648
+ "epoch": 0.83,
649
+ "learning_rate": 0.00017028535980148884,
650
+ "loss": 3.1601,
651
  "step": 5350
652
  },
653
  {
654
+ "epoch": 0.84,
655
+ "learning_rate": 0.00016253101736972707,
656
+ "loss": 3.1306,
657
  "step": 5400
658
  },
659
  {
660
+ "epoch": 0.85,
661
+ "learning_rate": 0.00015477667493796527,
662
+ "loss": 3.1461,
663
  "step": 5450
664
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
665
  {
666
  "epoch": 0.85,
667
+ "learning_rate": 0.00014702233250620347,
668
+ "loss": 3.1139,
669
+ "step": 5500
 
 
 
 
 
 
670
  },
671
  {
672
  "epoch": 0.86,
673
+ "learning_rate": 0.0001392679900744417,
674
+ "loss": 3.0884,
675
+ "step": 5550
676
  },
677
  {
678
  "epoch": 0.87,
679
+ "learning_rate": 0.0001315136476426799,
680
+ "loss": 3.1221,
681
+ "step": 5600
682
  },
683
  {
684
  "epoch": 0.88,
685
+ "learning_rate": 0.00012375930521091812,
686
+ "loss": 3.0781,
687
+ "step": 5650
688
  },
689
  {
690
  "epoch": 0.88,
691
+ "learning_rate": 0.00011600496277915632,
692
+ "loss": 3.156,
693
+ "step": 5700
694
  },
695
  {
696
  "epoch": 0.89,
697
+ "learning_rate": 0.00010825062034739454,
698
+ "loss": 3.1047,
699
+ "step": 5750
700
  },
701
  {
702
  "epoch": 0.9,
703
+ "learning_rate": 0.00010049627791563276,
704
+ "loss": 3.1811,
705
+ "step": 5800
706
  },
707
  {
708
  "epoch": 0.91,
709
+ "learning_rate": 9.274193548387098e-05,
710
+ "loss": 3.1423,
711
+ "step": 5850
712
  },
713
  {
714
  "epoch": 0.91,
715
+ "learning_rate": 8.498759305210918e-05,
716
+ "loss": 3.1057,
717
+ "step": 5900
718
  },
719
  {
720
  "epoch": 0.92,
721
+ "learning_rate": 7.723325062034739e-05,
722
+ "loss": 3.0984,
723
+ "step": 5950
 
 
 
 
 
 
724
  },
725
  {
726
  "epoch": 0.93,
727
+ "learning_rate": 6.947890818858562e-05,
728
+ "loss": 3.1098,
729
+ "step": 6000
730
  },
731
  {
732
  "epoch": 0.94,
733
+ "learning_rate": 6.172456575682382e-05,
734
+ "loss": 3.1232,
735
+ "step": 6050
736
  },
737
  {
738
  "epoch": 0.95,
739
+ "learning_rate": 5.3970223325062036e-05,
740
+ "loss": 3.129,
741
+ "step": 6100
742
  },
743
  {
744
  "epoch": 0.95,
745
+ "learning_rate": 4.621588089330025e-05,
746
+ "loss": 3.1156,
747
+ "step": 6150
748
  },
749
  {
750
  "epoch": 0.96,
751
+ "learning_rate": 3.846153846153846e-05,
752
+ "loss": 3.1295,
753
+ "step": 6200
754
  },
755
  {
756
  "epoch": 0.97,
757
+ "learning_rate": 3.0707196029776676e-05,
758
+ "loss": 3.0784,
759
+ "step": 6250
760
  },
761
  {
762
  "epoch": 0.98,
763
+ "learning_rate": 2.295285359801489e-05,
764
+ "loss": 3.1048,
765
+ "step": 6300
766
  },
767
  {
768
  "epoch": 0.98,
769
+ "learning_rate": 1.5198511166253101e-05,
770
+ "loss": 3.1145,
771
+ "step": 6350
772
  },
773
  {
774
  "epoch": 0.99,
775
+ "learning_rate": 7.444168734491316e-06,
776
+ "loss": 3.185,
777
+ "step": 6400
 
 
 
 
 
 
778
  },
779
  {
780
  "epoch": 1.0,
781
+ "step": 6448,
782
+ "total_flos": 1.0568400432109978e+17,
783
+ "train_loss": 3.3343640817306177,
784
+ "train_runtime": 1458.5581,
785
+ "train_samples_per_second": 35.367,
786
+ "train_steps_per_second": 4.421
787
  }
788
  ],
789
  "logging_steps": 50,
790
+ "max_steps": 6448,
791
  "num_train_epochs": 1,
792
+ "save_steps": -6448,
793
+ "total_flos": 1.0568400432109978e+17,
794
  "trial_name": null,
795
  "trial_params": null
796
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0be34ece13733d42cb106498be612b92d1ed5dd757900d449ecef81b9dfbb415
3
  size 4027
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:164c4b37b97a054742e9ab666c8f07d9e7fda1d553be739f57042a8801d6d49b
3
  size 4027