CreatorPhan committed on
Commit
3f18f31
1 Parent(s): 7c86f55

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -5,7 +5,6 @@ library_name: peft
5
 
6
 
7
  The following `bitsandbytes` quantization config was used during training:
8
- - quant_method: bitsandbytes
9
  - load_in_8bit: True
10
  - load_in_4bit: False
11
  - llm_int8_threshold: 6.0
@@ -18,4 +17,4 @@ The following `bitsandbytes` quantization config was used during training:
18
  ### Framework versions
19
 
20
 
21
- - PEFT 0.6.0.dev0
 
5
 
6
 
7
  The following `bitsandbytes` quantization config was used during training:
 
8
  - load_in_8bit: True
9
  - load_in_4bit: False
10
  - llm_int8_threshold: 6.0
 
17
  ### Framework versions
18
 
19
 
20
+ - PEFT 0.4.0
adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c9965124964cb337825ddfaf69b302836615f6c9078be280e07c56ca53a3dee2
3
  size 39409357
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c57a457d1ce624c042cf5f076804ef0ce9ab9b3e07fef0a29dce7b58311b6c6
3
  size 39409357
checkpoint-100/README.md CHANGED
@@ -5,7 +5,6 @@ library_name: peft
5
 
6
 
7
  The following `bitsandbytes` quantization config was used during training:
8
- - quant_method: bitsandbytes
9
  - load_in_8bit: True
10
  - load_in_4bit: False
11
  - llm_int8_threshold: 6.0
@@ -17,7 +16,6 @@ The following `bitsandbytes` quantization config was used during training:
17
  - bnb_4bit_compute_dtype: float32
18
 
19
  The following `bitsandbytes` quantization config was used during training:
20
- - quant_method: bitsandbytes
21
  - load_in_8bit: True
22
  - load_in_4bit: False
23
  - llm_int8_threshold: 6.0
@@ -29,6 +27,6 @@ The following `bitsandbytes` quantization config was used during training:
29
  - bnb_4bit_compute_dtype: float32
30
  ### Framework versions
31
 
32
- - PEFT 0.6.0.dev0
33
 
34
- - PEFT 0.6.0.dev0
 
5
 
6
 
7
  The following `bitsandbytes` quantization config was used during training:
 
8
  - load_in_8bit: True
9
  - load_in_4bit: False
10
  - llm_int8_threshold: 6.0
 
16
  - bnb_4bit_compute_dtype: float32
17
 
18
  The following `bitsandbytes` quantization config was used during training:
 
19
  - load_in_8bit: True
20
  - load_in_4bit: False
21
  - llm_int8_threshold: 6.0
 
27
  - bnb_4bit_compute_dtype: float32
28
  ### Framework versions
29
 
30
+ - PEFT 0.4.0
31
 
32
+ - PEFT 0.4.0
checkpoint-100/adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7162cb89a36eade56456564472b7286d9710cbd215038fb634105d465db2572a
3
  size 39409357
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:044e3971abe97a7f3b70569e01834ebc3e8f0ca21682fb1b3a18c6f125bb4e8c
3
  size 39409357
checkpoint-100/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b21aaf3d05278b35d430ba855cffc90597c9a169b3ca6c2bddee123f1ec461a8
3
- size 78844421
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ea58d9361a5148e15d11e309e14ba6ee6dba57f3cc0ab6da8fd515b70f6ce64
3
+ size 78784709
checkpoint-100/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:972139d83957a9cf2600cb6eeca17287d7a5377c33a53500ae7e13fe830ad36b
3
  size 14575
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7508d4b8dd267de5cc58e972da25236687927651336a28f292c92f7f23951475
3
  size 14575
checkpoint-100/trainer_state.json CHANGED
@@ -1,619 +1,616 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.244167962674961,
5
- "eval_steps": 500,
6
  "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.01,
13
  "learning_rate": 0.00099375,
14
- "loss": 2.9311,
15
  "step": 1
16
  },
17
  {
18
- "epoch": 0.02,
19
  "learning_rate": 0.0009875,
20
- "loss": 2.9309,
21
  "step": 2
22
  },
23
  {
24
- "epoch": 0.04,
25
  "learning_rate": 0.00098125,
26
- "loss": 2.8566,
27
  "step": 3
28
  },
29
  {
30
- "epoch": 0.05,
31
  "learning_rate": 0.000975,
32
- "loss": 2.773,
33
  "step": 4
34
  },
35
  {
36
- "epoch": 0.06,
37
  "learning_rate": 0.00096875,
38
- "loss": 2.7367,
39
  "step": 5
40
  },
41
  {
42
- "epoch": 0.07,
43
  "learning_rate": 0.0009625,
44
- "loss": 2.7924,
45
  "step": 6
46
  },
47
  {
48
- "epoch": 0.09,
49
  "learning_rate": 0.0009562500000000001,
50
- "loss": 2.7099,
51
  "step": 7
52
  },
53
  {
54
- "epoch": 0.1,
55
  "learning_rate": 0.00095,
56
- "loss": 2.7658,
57
  "step": 8
58
  },
59
  {
60
- "epoch": 0.11,
61
  "learning_rate": 0.00094375,
62
- "loss": 2.7514,
63
  "step": 9
64
  },
65
  {
66
- "epoch": 0.12,
67
  "learning_rate": 0.0009375,
68
- "loss": 2.6899,
69
  "step": 10
70
  },
71
  {
72
- "epoch": 0.14,
73
  "learning_rate": 0.00093125,
74
- "loss": 2.7072,
75
  "step": 11
76
  },
77
  {
78
- "epoch": 0.15,
79
  "learning_rate": 0.000925,
80
- "loss": 2.7687,
81
  "step": 12
82
  },
83
  {
84
- "epoch": 0.16,
85
  "learning_rate": 0.00091875,
86
- "loss": 2.652,
87
  "step": 13
88
  },
89
  {
90
- "epoch": 0.17,
91
  "learning_rate": 0.0009125,
92
- "loss": 2.6991,
93
  "step": 14
94
  },
95
  {
96
- "epoch": 0.19,
97
  "learning_rate": 0.00090625,
98
- "loss": 2.6259,
99
  "step": 15
100
  },
101
  {
102
- "epoch": 0.2,
103
  "learning_rate": 0.0009000000000000001,
104
- "loss": 2.5914,
105
  "step": 16
106
  },
107
  {
108
- "epoch": 0.21,
109
  "learning_rate": 0.00089375,
110
- "loss": 2.7453,
111
  "step": 17
112
  },
113
  {
114
- "epoch": 0.22,
115
  "learning_rate": 0.0008874999999999999,
116
- "loss": 2.6502,
117
  "step": 18
118
  },
119
  {
120
- "epoch": 0.24,
121
  "learning_rate": 0.00088125,
122
- "loss": 2.6558,
123
  "step": 19
124
  },
125
  {
126
- "epoch": 0.25,
127
  "learning_rate": 0.000875,
128
- "loss": 2.6839,
129
  "step": 20
130
  },
131
  {
132
- "epoch": 0.26,
133
  "learning_rate": 0.0008687500000000001,
134
- "loss": 2.5649,
135
  "step": 21
136
  },
137
  {
138
- "epoch": 0.27,
139
  "learning_rate": 0.0008625000000000001,
140
- "loss": 2.724,
141
  "step": 22
142
  },
143
  {
144
- "epoch": 0.29,
145
  "learning_rate": 0.00085625,
146
- "loss": 2.6601,
147
  "step": 23
148
  },
149
  {
150
- "epoch": 0.3,
151
  "learning_rate": 0.00085,
152
- "loss": 2.573,
153
  "step": 24
154
  },
155
  {
156
- "epoch": 0.31,
157
  "learning_rate": 0.00084375,
158
- "loss": 2.5586,
159
  "step": 25
160
  },
161
  {
162
- "epoch": 0.32,
163
  "learning_rate": 0.0008375,
164
- "loss": 2.6952,
165
  "step": 26
166
  },
167
  {
168
- "epoch": 0.34,
169
  "learning_rate": 0.0008312500000000001,
170
- "loss": 2.6349,
171
  "step": 27
172
  },
173
  {
174
- "epoch": 0.35,
175
  "learning_rate": 0.000825,
176
- "loss": 2.7674,
177
  "step": 28
178
  },
179
  {
180
- "epoch": 0.36,
181
  "learning_rate": 0.00081875,
182
- "loss": 2.6728,
183
  "step": 29
184
  },
185
  {
186
- "epoch": 0.37,
187
  "learning_rate": 0.0008125000000000001,
188
- "loss": 2.7608,
189
  "step": 30
190
  },
191
  {
192
- "epoch": 0.39,
193
  "learning_rate": 0.00080625,
194
- "loss": 2.5593,
195
  "step": 31
196
  },
197
  {
198
- "epoch": 0.4,
199
  "learning_rate": 0.0008,
200
- "loss": 2.6426,
201
  "step": 32
202
  },
203
  {
204
- "epoch": 0.41,
205
  "learning_rate": 0.00079375,
206
- "loss": 2.7065,
207
  "step": 33
208
  },
209
  {
210
- "epoch": 0.42,
211
  "learning_rate": 0.0007875,
212
- "loss": 2.7022,
213
  "step": 34
214
  },
215
  {
216
- "epoch": 0.44,
217
  "learning_rate": 0.00078125,
218
- "loss": 2.6398,
219
  "step": 35
220
  },
221
  {
222
- "epoch": 0.45,
223
  "learning_rate": 0.0007750000000000001,
224
- "loss": 2.5589,
225
  "step": 36
226
  },
227
  {
228
- "epoch": 0.46,
229
  "learning_rate": 0.00076875,
230
- "loss": 2.7217,
231
  "step": 37
232
  },
233
  {
234
- "epoch": 0.47,
235
  "learning_rate": 0.0007624999999999999,
236
- "loss": 2.7152,
237
  "step": 38
238
  },
239
  {
240
- "epoch": 0.49,
241
  "learning_rate": 0.00075625,
242
- "loss": 2.6679,
243
  "step": 39
244
  },
245
  {
246
- "epoch": 0.5,
247
  "learning_rate": 0.00075,
248
- "loss": 2.5681,
249
  "step": 40
250
  },
251
  {
252
- "epoch": 0.51,
253
  "learning_rate": 0.00074375,
254
- "loss": 2.6518,
255
  "step": 41
256
  },
257
  {
258
- "epoch": 0.52,
259
  "learning_rate": 0.0007375000000000001,
260
- "loss": 2.6684,
261
  "step": 42
262
  },
263
  {
264
- "epoch": 0.53,
265
  "learning_rate": 0.00073125,
266
- "loss": 2.6757,
267
  "step": 43
268
  },
269
  {
270
- "epoch": 0.55,
271
  "learning_rate": 0.000725,
272
- "loss": 2.6836,
273
  "step": 44
274
  },
275
  {
276
- "epoch": 0.56,
277
  "learning_rate": 0.00071875,
278
- "loss": 2.685,
279
  "step": 45
280
  },
281
  {
282
- "epoch": 0.57,
283
  "learning_rate": 0.0007125,
284
- "loss": 2.6512,
285
  "step": 46
286
  },
287
  {
288
- "epoch": 0.58,
289
  "learning_rate": 0.0007062500000000001,
290
- "loss": 2.6766,
291
  "step": 47
292
  },
293
  {
294
- "epoch": 0.6,
295
  "learning_rate": 0.0007,
296
- "loss": 2.5773,
297
  "step": 48
298
  },
299
  {
300
- "epoch": 0.61,
301
  "learning_rate": 0.00069375,
302
- "loss": 2.7344,
303
  "step": 49
304
  },
305
  {
306
- "epoch": 0.62,
307
  "learning_rate": 0.0006875,
308
- "loss": 2.6468,
309
  "step": 50
310
  },
311
  {
312
- "epoch": 0.63,
313
  "learning_rate": 0.00068125,
314
- "loss": 2.6663,
315
  "step": 51
316
  },
317
  {
318
- "epoch": 0.65,
319
  "learning_rate": 0.000675,
320
- "loss": 2.6363,
321
  "step": 52
322
  },
323
  {
324
- "epoch": 0.66,
325
  "learning_rate": 0.00066875,
326
- "loss": 2.7287,
327
  "step": 53
328
  },
329
  {
330
- "epoch": 0.67,
331
  "learning_rate": 0.0006625,
332
- "loss": 2.7259,
333
  "step": 54
334
  },
335
  {
336
- "epoch": 0.68,
337
  "learning_rate": 0.00065625,
338
- "loss": 2.6912,
339
  "step": 55
340
  },
341
  {
342
- "epoch": 0.7,
343
  "learning_rate": 0.0006500000000000001,
344
- "loss": 2.5956,
345
  "step": 56
346
  },
347
  {
348
- "epoch": 0.71,
349
  "learning_rate": 0.00064375,
350
- "loss": 2.7398,
351
  "step": 57
352
  },
353
  {
354
- "epoch": 0.72,
355
  "learning_rate": 0.0006374999999999999,
356
- "loss": 2.7265,
357
  "step": 58
358
  },
359
  {
360
- "epoch": 0.73,
361
  "learning_rate": 0.00063125,
362
- "loss": 2.6031,
363
  "step": 59
364
  },
365
  {
366
- "epoch": 0.75,
367
  "learning_rate": 0.000625,
368
- "loss": 2.6263,
369
  "step": 60
370
  },
371
  {
372
- "epoch": 0.76,
373
  "learning_rate": 0.00061875,
374
- "loss": 2.6544,
375
  "step": 61
376
  },
377
  {
378
- "epoch": 0.77,
379
  "learning_rate": 0.0006125000000000001,
380
- "loss": 2.7403,
381
  "step": 62
382
  },
383
  {
384
- "epoch": 0.78,
385
  "learning_rate": 0.00060625,
386
- "loss": 2.6565,
387
  "step": 63
388
  },
389
  {
390
- "epoch": 0.8,
391
  "learning_rate": 0.0006,
392
- "loss": 2.7286,
393
  "step": 64
394
  },
395
  {
396
- "epoch": 0.81,
397
  "learning_rate": 0.00059375,
398
- "loss": 2.7061,
399
  "step": 65
400
  },
401
  {
402
- "epoch": 0.82,
403
  "learning_rate": 0.0005875,
404
- "loss": 2.5849,
405
  "step": 66
406
  },
407
  {
408
- "epoch": 0.83,
409
  "learning_rate": 0.0005812500000000001,
410
- "loss": 2.6668,
411
  "step": 67
412
  },
413
  {
414
- "epoch": 0.85,
415
  "learning_rate": 0.000575,
416
- "loss": 2.6454,
417
  "step": 68
418
  },
419
  {
420
- "epoch": 0.86,
421
  "learning_rate": 0.00056875,
422
- "loss": 2.635,
423
  "step": 69
424
  },
425
  {
426
- "epoch": 0.87,
427
  "learning_rate": 0.0005625000000000001,
428
- "loss": 2.6358,
429
  "step": 70
430
  },
431
  {
432
- "epoch": 0.88,
433
  "learning_rate": 0.00055625,
434
- "loss": 2.6379,
435
  "step": 71
436
  },
437
  {
438
- "epoch": 0.9,
439
  "learning_rate": 0.00055,
440
- "loss": 2.6447,
441
  "step": 72
442
  },
443
  {
444
- "epoch": 0.91,
445
  "learning_rate": 0.00054375,
446
- "loss": 2.6083,
447
  "step": 73
448
  },
449
  {
450
- "epoch": 0.92,
451
  "learning_rate": 0.0005375,
452
- "loss": 2.6816,
453
  "step": 74
454
  },
455
  {
456
- "epoch": 0.93,
457
  "learning_rate": 0.00053125,
458
- "loss": 2.6415,
459
  "step": 75
460
  },
461
  {
462
- "epoch": 0.95,
463
  "learning_rate": 0.0005250000000000001,
464
- "loss": 2.6407,
465
  "step": 76
466
  },
467
  {
468
- "epoch": 0.96,
469
  "learning_rate": 0.00051875,
470
- "loss": 2.6056,
471
  "step": 77
472
  },
473
  {
474
- "epoch": 0.97,
475
  "learning_rate": 0.0005124999999999999,
476
- "loss": 2.5552,
477
  "step": 78
478
  },
479
  {
480
- "epoch": 0.98,
481
  "learning_rate": 0.00050625,
482
- "loss": 2.6193,
483
  "step": 79
484
  },
485
  {
486
- "epoch": 1.0,
487
  "learning_rate": 0.0005,
488
- "loss": 2.5949,
489
  "step": 80
490
  },
491
  {
492
- "epoch": 1.01,
493
  "learning_rate": 0.00049375,
494
- "loss": 2.5104,
495
  "step": 81
496
  },
497
  {
498
- "epoch": 1.02,
499
  "learning_rate": 0.0004875,
500
- "loss": 2.4372,
501
  "step": 82
502
  },
503
  {
504
- "epoch": 1.03,
505
  "learning_rate": 0.00048125,
506
- "loss": 2.3689,
507
  "step": 83
508
  },
509
  {
510
- "epoch": 1.05,
511
  "learning_rate": 0.000475,
512
- "loss": 2.5339,
513
  "step": 84
514
  },
515
  {
516
- "epoch": 1.06,
517
  "learning_rate": 0.00046875,
518
- "loss": 2.5443,
519
  "step": 85
520
  },
521
  {
522
- "epoch": 1.07,
523
  "learning_rate": 0.0004625,
524
- "loss": 2.4477,
525
  "step": 86
526
  },
527
  {
528
- "epoch": 1.08,
529
  "learning_rate": 0.00045625,
530
- "loss": 2.4904,
531
  "step": 87
532
  },
533
  {
534
- "epoch": 1.09,
535
  "learning_rate": 0.00045000000000000004,
536
- "loss": 2.43,
537
  "step": 88
538
  },
539
  {
540
- "epoch": 1.11,
541
  "learning_rate": 0.00044374999999999997,
542
- "loss": 2.476,
543
  "step": 89
544
  },
545
  {
546
- "epoch": 1.12,
547
  "learning_rate": 0.0004375,
548
- "loss": 2.5321,
549
  "step": 90
550
  },
551
  {
552
- "epoch": 1.13,
553
  "learning_rate": 0.00043125000000000005,
554
- "loss": 2.4374,
555
  "step": 91
556
  },
557
  {
558
- "epoch": 1.14,
559
  "learning_rate": 0.000425,
560
- "loss": 2.4265,
561
  "step": 92
562
  },
563
  {
564
- "epoch": 1.16,
565
  "learning_rate": 0.00041875,
566
- "loss": 2.4505,
567
  "step": 93
568
  },
569
  {
570
- "epoch": 1.17,
571
  "learning_rate": 0.0004125,
572
- "loss": 2.4275,
573
  "step": 94
574
  },
575
  {
576
- "epoch": 1.18,
577
  "learning_rate": 0.00040625000000000004,
578
- "loss": 2.4458,
579
  "step": 95
580
  },
581
  {
582
- "epoch": 1.19,
583
  "learning_rate": 0.0004,
584
- "loss": 2.5226,
585
  "step": 96
586
  },
587
  {
588
- "epoch": 1.21,
589
  "learning_rate": 0.00039375,
590
- "loss": 2.4048,
591
  "step": 97
592
  },
593
  {
594
- "epoch": 1.22,
595
  "learning_rate": 0.00038750000000000004,
596
- "loss": 2.5091,
597
  "step": 98
598
  },
599
  {
600
- "epoch": 1.23,
601
  "learning_rate": 0.00038124999999999997,
602
- "loss": 2.4583,
603
  "step": 99
604
  },
605
  {
606
- "epoch": 1.24,
607
  "learning_rate": 0.000375,
608
- "loss": 2.4678,
609
  "step": 100
610
  }
611
  ],
612
- "logging_steps": 1,
613
  "max_steps": 160,
614
- "num_train_epochs": 2,
615
- "save_steps": 100,
616
- "total_flos": 2.588289853513728e+16,
617
  "trial_name": null,
618
  "trial_params": null
619
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.488335925349922,
 
5
  "global_step": 100,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
+ "epoch": 0.02,
12
  "learning_rate": 0.00099375,
13
+ "loss": 2.9857,
14
  "step": 1
15
  },
16
  {
17
+ "epoch": 0.05,
18
  "learning_rate": 0.0009875,
19
+ "loss": 2.8497,
20
  "step": 2
21
  },
22
  {
23
+ "epoch": 0.07,
24
  "learning_rate": 0.00098125,
25
+ "loss": 2.8853,
26
  "step": 3
27
  },
28
  {
29
+ "epoch": 0.1,
30
  "learning_rate": 0.000975,
31
+ "loss": 2.7689,
32
  "step": 4
33
  },
34
  {
35
+ "epoch": 0.12,
36
  "learning_rate": 0.00096875,
37
+ "loss": 2.7474,
38
  "step": 5
39
  },
40
  {
41
+ "epoch": 0.15,
42
  "learning_rate": 0.0009625,
43
+ "loss": 2.7695,
44
  "step": 6
45
  },
46
  {
47
+ "epoch": 0.17,
48
  "learning_rate": 0.0009562500000000001,
49
+ "loss": 2.6897,
50
  "step": 7
51
  },
52
  {
53
+ "epoch": 0.2,
54
  "learning_rate": 0.00095,
55
+ "loss": 2.6285,
56
  "step": 8
57
  },
58
  {
59
+ "epoch": 0.22,
60
  "learning_rate": 0.00094375,
61
+ "loss": 2.7107,
62
  "step": 9
63
  },
64
  {
65
+ "epoch": 0.25,
66
  "learning_rate": 0.0009375,
67
+ "loss": 2.6897,
68
  "step": 10
69
  },
70
  {
71
+ "epoch": 0.27,
72
  "learning_rate": 0.00093125,
73
+ "loss": 2.6634,
74
  "step": 11
75
  },
76
  {
77
+ "epoch": 0.3,
78
  "learning_rate": 0.000925,
79
+ "loss": 2.635,
80
  "step": 12
81
  },
82
  {
83
+ "epoch": 0.32,
84
  "learning_rate": 0.00091875,
85
+ "loss": 2.643,
86
  "step": 13
87
  },
88
  {
89
+ "epoch": 0.35,
90
  "learning_rate": 0.0009125,
91
+ "loss": 2.7166,
92
  "step": 14
93
  },
94
  {
95
+ "epoch": 0.37,
96
  "learning_rate": 0.00090625,
97
+ "loss": 2.7354,
98
  "step": 15
99
  },
100
  {
101
+ "epoch": 0.4,
102
  "learning_rate": 0.0009000000000000001,
103
+ "loss": 2.615,
104
  "step": 16
105
  },
106
  {
107
+ "epoch": 0.42,
108
  "learning_rate": 0.00089375,
109
+ "loss": 2.7225,
110
  "step": 17
111
  },
112
  {
113
+ "epoch": 0.45,
114
  "learning_rate": 0.0008874999999999999,
115
+ "loss": 2.6189,
116
  "step": 18
117
  },
118
  {
119
+ "epoch": 0.47,
120
  "learning_rate": 0.00088125,
121
+ "loss": 2.7269,
122
  "step": 19
123
  },
124
  {
125
+ "epoch": 0.5,
126
  "learning_rate": 0.000875,
127
+ "loss": 2.6312,
128
  "step": 20
129
  },
130
  {
131
+ "epoch": 0.52,
132
  "learning_rate": 0.0008687500000000001,
133
+ "loss": 2.6758,
134
  "step": 21
135
  },
136
  {
137
+ "epoch": 0.55,
138
  "learning_rate": 0.0008625000000000001,
139
+ "loss": 2.6997,
140
  "step": 22
141
  },
142
  {
143
+ "epoch": 0.57,
144
  "learning_rate": 0.00085625,
145
+ "loss": 2.6878,
146
  "step": 23
147
  },
148
  {
149
+ "epoch": 0.6,
150
  "learning_rate": 0.00085,
151
+ "loss": 2.6433,
152
  "step": 24
153
  },
154
  {
155
+ "epoch": 0.62,
156
  "learning_rate": 0.00084375,
157
+ "loss": 2.7053,
158
  "step": 25
159
  },
160
  {
161
+ "epoch": 0.65,
162
  "learning_rate": 0.0008375,
163
+ "loss": 2.6745,
164
  "step": 26
165
  },
166
  {
167
+ "epoch": 0.67,
168
  "learning_rate": 0.0008312500000000001,
169
+ "loss": 2.7376,
170
  "step": 27
171
  },
172
  {
173
+ "epoch": 0.7,
174
  "learning_rate": 0.000825,
175
+ "loss": 2.6577,
176
  "step": 28
177
  },
178
  {
179
+ "epoch": 0.72,
180
  "learning_rate": 0.00081875,
181
+ "loss": 2.7428,
182
  "step": 29
183
  },
184
  {
185
+ "epoch": 0.75,
186
  "learning_rate": 0.0008125000000000001,
187
+ "loss": 2.6231,
188
  "step": 30
189
  },
190
  {
191
+ "epoch": 0.77,
192
  "learning_rate": 0.00080625,
193
+ "loss": 2.7078,
194
  "step": 31
195
  },
196
  {
197
+ "epoch": 0.8,
198
  "learning_rate": 0.0008,
199
+ "loss": 2.7057,
200
  "step": 32
201
  },
202
  {
203
+ "epoch": 0.82,
204
  "learning_rate": 0.00079375,
205
+ "loss": 2.6594,
206
  "step": 33
207
  },
208
  {
209
+ "epoch": 0.85,
210
  "learning_rate": 0.0007875,
211
+ "loss": 2.6718,
212
  "step": 34
213
  },
214
  {
215
+ "epoch": 0.87,
216
  "learning_rate": 0.00078125,
217
+ "loss": 2.6459,
218
  "step": 35
219
  },
220
  {
221
+ "epoch": 0.9,
222
  "learning_rate": 0.0007750000000000001,
223
+ "loss": 2.6558,
224
  "step": 36
225
  },
226
  {
227
+ "epoch": 0.92,
228
  "learning_rate": 0.00076875,
229
+ "loss": 2.6559,
230
  "step": 37
231
  },
232
  {
233
+ "epoch": 0.95,
234
  "learning_rate": 0.0007624999999999999,
235
+ "loss": 2.6488,
236
  "step": 38
237
  },
238
  {
239
+ "epoch": 0.97,
240
  "learning_rate": 0.00075625,
241
+ "loss": 2.5902,
242
  "step": 39
243
  },
244
  {
245
+ "epoch": 1.0,
246
  "learning_rate": 0.00075,
247
+ "loss": 2.6244,
248
  "step": 40
249
  },
250
  {
251
+ "epoch": 1.02,
252
  "learning_rate": 0.00074375,
253
+ "loss": 2.5278,
254
  "step": 41
255
  },
256
  {
257
+ "epoch": 1.05,
258
  "learning_rate": 0.0007375000000000001,
259
+ "loss": 2.5213,
260
  "step": 42
261
  },
262
  {
263
+ "epoch": 1.07,
264
  "learning_rate": 0.00073125,
265
+ "loss": 2.5515,
266
  "step": 43
267
  },
268
  {
269
+ "epoch": 1.09,
270
  "learning_rate": 0.000725,
271
+ "loss": 2.5196,
272
  "step": 44
273
  },
274
  {
275
+ "epoch": 1.12,
276
  "learning_rate": 0.00071875,
277
+ "loss": 2.5725,
278
  "step": 45
279
  },
280
  {
281
+ "epoch": 1.14,
282
  "learning_rate": 0.0007125,
283
+ "loss": 2.4971,
284
  "step": 46
285
  },
286
  {
287
+ "epoch": 1.17,
288
  "learning_rate": 0.0007062500000000001,
289
+ "loss": 2.4976,
290
  "step": 47
291
  },
292
  {
293
+ "epoch": 1.19,
294
  "learning_rate": 0.0007,
295
+ "loss": 2.5467,
296
  "step": 48
297
  },
298
  {
299
+ "epoch": 1.22,
300
  "learning_rate": 0.00069375,
301
+ "loss": 2.5302,
302
  "step": 49
303
  },
304
  {
305
+ "epoch": 1.24,
306
  "learning_rate": 0.0006875,
307
+ "loss": 2.5224,
308
  "step": 50
309
  },
310
  {
311
+ "epoch": 1.27,
312
  "learning_rate": 0.00068125,
313
+ "loss": 2.5451,
314
  "step": 51
315
  },
316
  {
317
+ "epoch": 1.29,
318
  "learning_rate": 0.000675,
319
+ "loss": 2.4609,
320
  "step": 52
321
  },
322
  {
323
+ "epoch": 1.32,
324
  "learning_rate": 0.00066875,
325
+ "loss": 2.4897,
326
  "step": 53
327
  },
328
  {
329
+ "epoch": 1.34,
330
  "learning_rate": 0.0006625,
331
+ "loss": 2.5583,
332
  "step": 54
333
  },
334
  {
335
+ "epoch": 1.37,
336
  "learning_rate": 0.00065625,
337
+ "loss": 2.5675,
338
  "step": 55
339
  },
340
  {
341
+ "epoch": 1.39,
342
  "learning_rate": 0.0006500000000000001,
343
+ "loss": 2.5598,
344
  "step": 56
345
  },
346
  {
347
+ "epoch": 1.42,
348
  "learning_rate": 0.00064375,
349
+ "loss": 2.48,
350
  "step": 57
351
  },
352
  {
353
+ "epoch": 1.44,
354
  "learning_rate": 0.0006374999999999999,
355
+ "loss": 2.4761,
356
  "step": 58
357
  },
358
  {
359
+ "epoch": 1.47,
360
  "learning_rate": 0.00063125,
361
+ "loss": 2.5599,
362
  "step": 59
363
  },
364
  {
365
+ "epoch": 1.49,
366
  "learning_rate": 0.000625,
367
+ "loss": 2.5199,
368
  "step": 60
369
  },
370
  {
371
+ "epoch": 1.52,
372
  "learning_rate": 0.00061875,
373
+ "loss": 2.5161,
374
  "step": 61
375
  },
376
  {
377
+ "epoch": 1.54,
378
  "learning_rate": 0.0006125000000000001,
379
+ "loss": 2.5797,
380
  "step": 62
381
  },
382
  {
383
+ "epoch": 1.57,
384
  "learning_rate": 0.00060625,
385
+ "loss": 2.5345,
386
  "step": 63
387
  },
388
  {
389
+ "epoch": 1.59,
390
  "learning_rate": 0.0006,
391
+ "loss": 2.4726,
392
  "step": 64
393
  },
394
  {
395
+ "epoch": 1.62,
396
  "learning_rate": 0.00059375,
397
+ "loss": 2.448,
398
  "step": 65
399
  },
400
  {
401
+ "epoch": 1.64,
402
  "learning_rate": 0.0005875,
403
+ "loss": 2.4939,
404
  "step": 66
405
  },
406
  {
407
+ "epoch": 1.67,
408
  "learning_rate": 0.0005812500000000001,
409
+ "loss": 2.4881,
410
  "step": 67
411
  },
412
  {
413
+ "epoch": 1.69,
414
  "learning_rate": 0.000575,
415
+ "loss": 2.5731,
416
  "step": 68
417
  },
418
  {
419
+ "epoch": 1.72,
420
  "learning_rate": 0.00056875,
421
+ "loss": 2.5114,
422
  "step": 69
423
  },
424
  {
425
+ "epoch": 1.74,
426
  "learning_rate": 0.0005625000000000001,
427
+ "loss": 2.5049,
428
  "step": 70
429
  },
430
  {
431
+ "epoch": 1.77,
432
  "learning_rate": 0.00055625,
433
+ "loss": 2.4856,
434
  "step": 71
435
  },
436
  {
437
+ "epoch": 1.79,
438
  "learning_rate": 0.00055,
439
+ "loss": 2.5077,
440
  "step": 72
441
  },
442
  {
443
+ "epoch": 1.82,
444
  "learning_rate": 0.00054375,
445
+ "loss": 2.4801,
446
  "step": 73
447
  },
448
  {
449
+ "epoch": 1.84,
450
  "learning_rate": 0.0005375,
451
+ "loss": 2.5599,
452
  "step": 74
453
  },
454
  {
455
+ "epoch": 1.87,
456
  "learning_rate": 0.00053125,
457
+ "loss": 2.5269,
458
  "step": 75
459
  },
460
  {
461
+ "epoch": 1.89,
462
  "learning_rate": 0.0005250000000000001,
463
+ "loss": 2.4963,
464
  "step": 76
465
  },
466
  {
467
+ "epoch": 1.92,
468
  "learning_rate": 0.00051875,
469
+ "loss": 2.4595,
470
  "step": 77
471
  },
472
  {
473
+ "epoch": 1.94,
474
  "learning_rate": 0.0005124999999999999,
475
+ "loss": 2.4801,
476
  "step": 78
477
  },
478
  {
479
+ "epoch": 1.97,
480
  "learning_rate": 0.00050625,
481
+ "loss": 2.5154,
482
  "step": 79
483
  },
484
  {
485
+ "epoch": 1.99,
486
  "learning_rate": 0.0005,
487
+ "loss": 2.5179,
488
  "step": 80
489
  },
490
  {
491
+ "epoch": 2.02,
492
  "learning_rate": 0.00049375,
493
+ "loss": 2.4121,
494
  "step": 81
495
  },
496
  {
497
+ "epoch": 2.04,
498
  "learning_rate": 0.0004875,
499
+ "loss": 2.354,
500
  "step": 82
501
  },
502
  {
503
+ "epoch": 2.07,
504
  "learning_rate": 0.00048125,
505
+ "loss": 2.3033,
506
  "step": 83
507
  },
508
  {
509
+ "epoch": 2.09,
510
  "learning_rate": 0.000475,
511
+ "loss": 2.4023,
512
  "step": 84
513
  },
514
  {
515
+ "epoch": 2.12,
516
  "learning_rate": 0.00046875,
517
+ "loss": 2.3755,
518
  "step": 85
519
  },
520
  {
521
+ "epoch": 2.14,
522
  "learning_rate": 0.0004625,
523
+ "loss": 2.3892,
524
  "step": 86
525
  },
526
  {
527
+ "epoch": 2.16,
528
  "learning_rate": 0.00045625,
529
+ "loss": 2.3534,
530
  "step": 87
531
  },
532
  {
533
+ "epoch": 2.19,
534
  "learning_rate": 0.00045000000000000004,
535
+ "loss": 2.3634,
536
  "step": 88
537
  },
538
  {
539
+ "epoch": 2.21,
540
  "learning_rate": 0.00044374999999999997,
541
+ "loss": 2.3336,
542
  "step": 89
543
  },
544
  {
545
+ "epoch": 2.24,
546
  "learning_rate": 0.0004375,
547
+ "loss": 2.3646,
548
  "step": 90
549
  },
550
  {
551
+ "epoch": 2.26,
552
  "learning_rate": 0.00043125000000000005,
553
+ "loss": 2.3333,
554
  "step": 91
555
  },
556
  {
557
+ "epoch": 2.29,
558
  "learning_rate": 0.000425,
559
+ "loss": 2.3344,
560
  "step": 92
561
  },
562
  {
563
+ "epoch": 2.31,
564
  "learning_rate": 0.00041875,
565
+ "loss": 2.3308,
566
  "step": 93
567
  },
568
  {
569
+ "epoch": 2.34,
570
  "learning_rate": 0.0004125,
571
+ "loss": 2.3677,
572
  "step": 94
573
  },
574
  {
575
+ "epoch": 2.36,
576
  "learning_rate": 0.00040625000000000004,
577
+ "loss": 2.3755,
578
  "step": 95
579
  },
580
  {
581
+ "epoch": 2.39,
582
  "learning_rate": 0.0004,
583
+ "loss": 2.3721,
584
  "step": 96
585
  },
586
  {
587
+ "epoch": 2.41,
588
  "learning_rate": 0.00039375,
589
+ "loss": 2.3463,
590
  "step": 97
591
  },
592
  {
593
+ "epoch": 2.44,
594
  "learning_rate": 0.00038750000000000004,
595
+ "loss": 2.3536,
596
  "step": 98
597
  },
598
  {
599
+ "epoch": 2.46,
600
  "learning_rate": 0.00038124999999999997,
601
+ "loss": 2.3895,
602
  "step": 99
603
  },
604
  {
605
+ "epoch": 2.49,
606
  "learning_rate": 0.000375,
607
+ "loss": 2.3767,
608
  "step": 100
609
  }
610
  ],
 
611
  "max_steps": 160,
612
+ "num_train_epochs": 4,
613
+ "total_flos": 5.95613039443968e+16,
 
614
  "trial_name": null,
615
  "trial_params": null
616
  }
checkpoint-100/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:81575686b087f7a47dde3ff139848be704047c90bfa69764a56ebff4b68eac6b
3
- size 4027
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29aa9882c4c56a1507dbc7c22ccd2d199e9b5b94a8c3844ada75fa3f9e9d5e41
3
+ size 3963
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:81575686b087f7a47dde3ff139848be704047c90bfa69764a56ebff4b68eac6b
3
- size 4027
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29aa9882c4c56a1507dbc7c22ccd2d199e9b5b94a8c3844ada75fa3f9e9d5e41
3
+ size 3963