CreatorPhan committed on
Commit
9c99b97
1 Parent(s): 514b0e7

Upload folder using huggingface_hub (#5)

Browse files

- Upload folder using huggingface_hub (7bf47f8d245cb9b01e49872c35478b9a319b0fb7)

Files changed (7) hide show
  1. README.md +78 -0
  2. adapter_model.bin +1 -1
  3. optimizer.pt +1 -1
  4. scheduler.pt +1 -1
  5. tokenizer.json +2 -2
  6. trainer_state.json +304 -304
  7. training_args.bin +1 -1
README.md CHANGED
@@ -4,6 +4,78 @@ library_name: peft
4
  ## Training procedure
5
 
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  The following `bitsandbytes` quantization config was used during training:
8
  - quant_method: bitsandbytes
9
  - load_in_8bit: True
@@ -77,6 +149,12 @@ The following `bitsandbytes` quantization config was used during training:
77
  - bnb_4bit_compute_dtype: float32
78
  ### Framework versions
79
 
 
 
 
 
 
 
80
  - PEFT 0.6.0.dev0
81
  - PEFT 0.6.0.dev0
82
  - PEFT 0.6.0.dev0
 
4
  ## Training procedure
5
 
6
 
7
+ The following `bitsandbytes` quantization config was used during training:
8
+ - quant_method: bitsandbytes
9
+ - load_in_8bit: True
10
+ - load_in_4bit: False
11
+ - llm_int8_threshold: 6.0
12
+ - llm_int8_skip_modules: None
13
+ - llm_int8_enable_fp32_cpu_offload: False
14
+ - llm_int8_has_fp16_weight: False
15
+ - bnb_4bit_quant_type: fp4
16
+ - bnb_4bit_use_double_quant: False
17
+ - bnb_4bit_compute_dtype: float32
18
+
19
+ The following `bitsandbytes` quantization config was used during training:
20
+ - quant_method: bitsandbytes
21
+ - load_in_8bit: True
22
+ - load_in_4bit: False
23
+ - llm_int8_threshold: 6.0
24
+ - llm_int8_skip_modules: None
25
+ - llm_int8_enable_fp32_cpu_offload: False
26
+ - llm_int8_has_fp16_weight: False
27
+ - bnb_4bit_quant_type: fp4
28
+ - bnb_4bit_use_double_quant: False
29
+ - bnb_4bit_compute_dtype: float32
30
+
31
+ The following `bitsandbytes` quantization config was used during training:
32
+ - quant_method: bitsandbytes
33
+ - load_in_8bit: True
34
+ - load_in_4bit: False
35
+ - llm_int8_threshold: 6.0
36
+ - llm_int8_skip_modules: None
37
+ - llm_int8_enable_fp32_cpu_offload: False
38
+ - llm_int8_has_fp16_weight: False
39
+ - bnb_4bit_quant_type: fp4
40
+ - bnb_4bit_use_double_quant: False
41
+ - bnb_4bit_compute_dtype: float32
42
+
43
+ The following `bitsandbytes` quantization config was used during training:
44
+ - quant_method: bitsandbytes
45
+ - load_in_8bit: True
46
+ - load_in_4bit: False
47
+ - llm_int8_threshold: 6.0
48
+ - llm_int8_skip_modules: None
49
+ - llm_int8_enable_fp32_cpu_offload: False
50
+ - llm_int8_has_fp16_weight: False
51
+ - bnb_4bit_quant_type: fp4
52
+ - bnb_4bit_use_double_quant: False
53
+ - bnb_4bit_compute_dtype: float32
54
+
55
+ The following `bitsandbytes` quantization config was used during training:
56
+ - quant_method: bitsandbytes
57
+ - load_in_8bit: True
58
+ - load_in_4bit: False
59
+ - llm_int8_threshold: 6.0
60
+ - llm_int8_skip_modules: None
61
+ - llm_int8_enable_fp32_cpu_offload: False
62
+ - llm_int8_has_fp16_weight: False
63
+ - bnb_4bit_quant_type: fp4
64
+ - bnb_4bit_use_double_quant: False
65
+ - bnb_4bit_compute_dtype: float32
66
+
67
+ The following `bitsandbytes` quantization config was used during training:
68
+ - quant_method: bitsandbytes
69
+ - load_in_8bit: True
70
+ - load_in_4bit: False
71
+ - llm_int8_threshold: 6.0
72
+ - llm_int8_skip_modules: None
73
+ - llm_int8_enable_fp32_cpu_offload: False
74
+ - llm_int8_has_fp16_weight: False
75
+ - bnb_4bit_quant_type: fp4
76
+ - bnb_4bit_use_double_quant: False
77
+ - bnb_4bit_compute_dtype: float32
78
+
79
  The following `bitsandbytes` quantization config was used during training:
80
  - quant_method: bitsandbytes
81
  - load_in_8bit: True
 
149
  - bnb_4bit_compute_dtype: float32
150
  ### Framework versions
151
 
152
+ - PEFT 0.6.0.dev0
153
+ - PEFT 0.6.0.dev0
154
+ - PEFT 0.6.0.dev0
155
+ - PEFT 0.6.0.dev0
156
+ - PEFT 0.6.0.dev0
157
+ - PEFT 0.6.0.dev0
158
  - PEFT 0.6.0.dev0
159
  - PEFT 0.6.0.dev0
160
  - PEFT 0.6.0.dev0
adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:55a0094d743e7e13c7a0ebcf10d19f1fe8ff887d02d075e6c5e7b69adb639411
3
  size 39409357
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab7d7332df354d85019ef8dbda22bf275f3e4612ebbe07ca7d3538dd755384e9
3
  size 39409357
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4b9657617f6a4d6bdaeed4dec604e705c37715edc8bcaacfc4f4d9eeeedd18ab
3
  size 78844421
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e7ecf4519b7c7fe30096c1ce0de750678f8cf403d65a166a096e8155b6d1665
3
  size 78844421
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9949a4a87ed7ae3838928a0d8e8fa579957588c0727794513b6eaf6d89de58e5
3
  size 627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02822a64d6ab3629baf2f69c5adf000658973be0bc2532154cc2534085175f34
3
  size 627
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:17a208233d2ee8d8c83b23bc214df737c44806a1919f444e89b31e586cd956ba
3
- size 14500471
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85b00d7db4df5df2e3f01cacc3feda246002a672f3356eec7f4b04a22eb0dfbe
3
+ size 14500570
trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9626955475330926,
5
  "eval_steps": 500,
6
  "global_step": 100,
7
  "is_hyper_param_search": false,
@@ -9,611 +9,611 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.01,
13
- "learning_rate": 0.00019902912621359224,
14
- "loss": 2.6183,
15
  "step": 1
16
  },
17
  {
18
- "epoch": 0.02,
19
- "learning_rate": 0.00019805825242718447,
20
- "loss": 2.2731,
21
  "step": 2
22
  },
23
  {
24
- "epoch": 0.03,
25
- "learning_rate": 0.0001970873786407767,
26
- "loss": 2.4264,
27
  "step": 3
28
  },
29
  {
30
- "epoch": 0.04,
31
- "learning_rate": 0.00019611650485436895,
32
- "loss": 2.3803,
33
  "step": 4
34
  },
35
  {
36
- "epoch": 0.05,
37
- "learning_rate": 0.00019514563106796118,
38
- "loss": 2.2789,
39
  "step": 5
40
  },
41
  {
42
- "epoch": 0.06,
43
- "learning_rate": 0.0001941747572815534,
44
- "loss": 2.5586,
45
  "step": 6
46
  },
47
  {
48
- "epoch": 0.07,
49
- "learning_rate": 0.00019320388349514564,
50
- "loss": 2.3255,
51
  "step": 7
52
  },
53
  {
54
- "epoch": 0.08,
55
- "learning_rate": 0.00019223300970873787,
56
- "loss": 2.2983,
57
  "step": 8
58
  },
59
  {
60
- "epoch": 0.09,
61
- "learning_rate": 0.0001912621359223301,
62
- "loss": 2.1903,
63
  "step": 9
64
  },
65
  {
66
- "epoch": 0.1,
67
- "learning_rate": 0.00019029126213592236,
68
- "loss": 2.3516,
69
  "step": 10
70
  },
71
  {
72
- "epoch": 0.11,
73
- "learning_rate": 0.00018932038834951458,
74
- "loss": 2.215,
75
  "step": 11
76
  },
77
  {
78
- "epoch": 0.12,
79
- "learning_rate": 0.00018834951456310681,
80
- "loss": 2.2354,
81
  "step": 12
82
  },
83
  {
84
- "epoch": 0.13,
85
- "learning_rate": 0.00018737864077669904,
86
- "loss": 2.2487,
87
  "step": 13
88
  },
89
  {
90
- "epoch": 0.13,
91
- "learning_rate": 0.00018640776699029127,
92
- "loss": 2.1957,
93
  "step": 14
94
  },
95
  {
96
- "epoch": 0.14,
97
- "learning_rate": 0.0001854368932038835,
98
- "loss": 2.2036,
99
  "step": 15
100
  },
101
  {
102
- "epoch": 0.15,
103
- "learning_rate": 0.00018446601941747576,
104
- "loss": 2.1787,
105
  "step": 16
106
  },
107
  {
108
- "epoch": 0.16,
109
- "learning_rate": 0.00018349514563106799,
110
- "loss": 2.1839,
111
  "step": 17
112
  },
113
  {
114
- "epoch": 0.17,
115
- "learning_rate": 0.00018252427184466022,
116
- "loss": 2.1533,
117
  "step": 18
118
  },
119
  {
120
- "epoch": 0.18,
121
- "learning_rate": 0.00018155339805825244,
122
- "loss": 2.2554,
123
  "step": 19
124
  },
125
  {
126
- "epoch": 0.19,
127
- "learning_rate": 0.00018058252427184467,
128
- "loss": 2.2778,
129
  "step": 20
130
  },
131
  {
132
- "epoch": 0.2,
133
- "learning_rate": 0.0001796116504854369,
134
- "loss": 2.382,
135
  "step": 21
136
  },
137
  {
138
- "epoch": 0.21,
139
- "learning_rate": 0.00017864077669902913,
140
- "loss": 2.0803,
141
  "step": 22
142
  },
143
  {
144
- "epoch": 0.22,
145
- "learning_rate": 0.0001776699029126214,
146
- "loss": 2.26,
147
  "step": 23
148
  },
149
  {
150
- "epoch": 0.23,
151
- "learning_rate": 0.00017669902912621362,
152
- "loss": 2.2557,
153
  "step": 24
154
  },
155
  {
156
- "epoch": 0.24,
157
- "learning_rate": 0.00017572815533980585,
158
- "loss": 2.0614,
159
  "step": 25
160
  },
161
  {
162
- "epoch": 0.25,
163
- "learning_rate": 0.00017475728155339805,
164
- "loss": 2.1342,
165
  "step": 26
166
  },
167
  {
168
- "epoch": 0.26,
169
- "learning_rate": 0.00017378640776699028,
170
- "loss": 2.1781,
171
  "step": 27
172
  },
173
  {
174
- "epoch": 0.27,
175
- "learning_rate": 0.00017281553398058253,
176
- "loss": 2.1828,
177
  "step": 28
178
  },
179
  {
180
- "epoch": 0.28,
181
- "learning_rate": 0.00017184466019417476,
182
- "loss": 2.0557,
183
  "step": 29
184
  },
185
  {
186
- "epoch": 0.29,
187
- "learning_rate": 0.000170873786407767,
188
- "loss": 2.1914,
189
  "step": 30
190
  },
191
  {
192
- "epoch": 0.3,
193
- "learning_rate": 0.00016990291262135922,
194
- "loss": 2.3306,
195
  "step": 31
196
  },
197
  {
198
- "epoch": 0.31,
199
- "learning_rate": 0.00016893203883495145,
200
- "loss": 2.2901,
201
  "step": 32
202
  },
203
  {
204
- "epoch": 0.32,
205
- "learning_rate": 0.00016796116504854368,
206
- "loss": 2.1166,
207
  "step": 33
208
  },
209
  {
210
- "epoch": 0.33,
211
- "learning_rate": 0.00016699029126213594,
212
- "loss": 2.2927,
213
  "step": 34
214
  },
215
  {
216
- "epoch": 0.34,
217
- "learning_rate": 0.00016601941747572817,
218
- "loss": 2.2732,
219
  "step": 35
220
  },
221
  {
222
- "epoch": 0.35,
223
- "learning_rate": 0.0001650485436893204,
224
- "loss": 2.1614,
225
  "step": 36
226
  },
227
  {
228
- "epoch": 0.36,
229
- "learning_rate": 0.00016407766990291262,
230
- "loss": 2.1986,
231
  "step": 37
232
  },
233
  {
234
- "epoch": 0.37,
235
- "learning_rate": 0.00016310679611650485,
236
- "loss": 2.3506,
237
  "step": 38
238
  },
239
  {
240
- "epoch": 0.38,
241
- "learning_rate": 0.00016213592233009708,
242
- "loss": 2.2425,
243
  "step": 39
244
  },
245
  {
246
- "epoch": 0.39,
247
- "learning_rate": 0.0001611650485436893,
248
- "loss": 2.2483,
249
  "step": 40
250
  },
251
  {
252
- "epoch": 0.39,
253
- "learning_rate": 0.00016019417475728157,
254
- "loss": 2.031,
255
  "step": 41
256
  },
257
  {
258
- "epoch": 0.4,
259
- "learning_rate": 0.0001592233009708738,
260
- "loss": 2.1587,
261
  "step": 42
262
  },
263
  {
264
- "epoch": 0.41,
265
- "learning_rate": 0.00015825242718446603,
266
- "loss": 2.1529,
267
  "step": 43
268
  },
269
  {
270
- "epoch": 0.42,
271
- "learning_rate": 0.00015728155339805825,
272
- "loss": 2.181,
273
  "step": 44
274
  },
275
  {
276
- "epoch": 0.43,
277
- "learning_rate": 0.00015631067961165048,
278
- "loss": 2.1168,
279
  "step": 45
280
  },
281
  {
282
- "epoch": 0.44,
283
- "learning_rate": 0.0001553398058252427,
284
- "loss": 2.2189,
285
  "step": 46
286
  },
287
  {
288
- "epoch": 0.45,
289
- "learning_rate": 0.00015436893203883497,
290
- "loss": 2.1362,
291
  "step": 47
292
  },
293
  {
294
- "epoch": 0.46,
295
- "learning_rate": 0.0001533980582524272,
296
- "loss": 2.0704,
297
  "step": 48
298
  },
299
  {
300
- "epoch": 0.47,
301
- "learning_rate": 0.00015242718446601943,
302
- "loss": 2.1273,
303
  "step": 49
304
  },
305
  {
306
- "epoch": 0.48,
307
- "learning_rate": 0.00015145631067961166,
308
- "loss": 2.1639,
309
  "step": 50
310
  },
311
  {
312
- "epoch": 0.49,
313
- "learning_rate": 0.00015048543689320389,
314
- "loss": 2.1639,
315
  "step": 51
316
  },
317
  {
318
- "epoch": 0.5,
319
- "learning_rate": 0.00014951456310679611,
320
- "loss": 2.0664,
321
  "step": 52
322
  },
323
  {
324
- "epoch": 0.51,
325
- "learning_rate": 0.00014854368932038834,
326
- "loss": 2.0539,
327
  "step": 53
328
  },
329
  {
330
- "epoch": 0.52,
331
- "learning_rate": 0.0001475728155339806,
332
- "loss": 2.206,
333
  "step": 54
334
  },
335
  {
336
- "epoch": 0.53,
337
- "learning_rate": 0.00014660194174757283,
338
- "loss": 2.1366,
339
  "step": 55
340
  },
341
  {
342
- "epoch": 0.54,
343
- "learning_rate": 0.00014563106796116506,
344
- "loss": 2.1016,
345
  "step": 56
346
  },
347
  {
348
- "epoch": 0.55,
349
- "learning_rate": 0.0001446601941747573,
350
- "loss": 2.1042,
351
  "step": 57
352
  },
353
  {
354
- "epoch": 0.56,
355
- "learning_rate": 0.00014368932038834952,
356
- "loss": 2.144,
357
  "step": 58
358
  },
359
  {
360
- "epoch": 0.57,
361
- "learning_rate": 0.00014271844660194175,
362
- "loss": 2.0834,
363
  "step": 59
364
  },
365
  {
366
- "epoch": 0.58,
367
- "learning_rate": 0.000141747572815534,
368
- "loss": 2.0255,
369
  "step": 60
370
  },
371
  {
372
- "epoch": 0.59,
373
- "learning_rate": 0.00014077669902912623,
374
- "loss": 2.131,
375
  "step": 61
376
  },
377
  {
378
- "epoch": 0.6,
379
- "learning_rate": 0.00013980582524271846,
380
- "loss": 2.2428,
381
  "step": 62
382
  },
383
  {
384
- "epoch": 0.61,
385
- "learning_rate": 0.0001388349514563107,
386
- "loss": 2.0831,
387
  "step": 63
388
  },
389
  {
390
- "epoch": 0.62,
391
- "learning_rate": 0.00013786407766990292,
392
- "loss": 2.1633,
393
  "step": 64
394
  },
395
  {
396
- "epoch": 0.63,
397
- "learning_rate": 0.00013689320388349515,
398
- "loss": 2.2224,
399
  "step": 65
400
  },
401
  {
402
- "epoch": 0.64,
403
- "learning_rate": 0.0001359223300970874,
404
- "loss": 2.0999,
405
  "step": 66
406
  },
407
  {
408
- "epoch": 0.65,
409
- "learning_rate": 0.00013495145631067963,
410
- "loss": 2.1749,
411
  "step": 67
412
  },
413
  {
414
- "epoch": 0.65,
415
- "learning_rate": 0.00013398058252427186,
416
- "loss": 1.9726,
417
  "step": 68
418
  },
419
  {
420
- "epoch": 0.66,
421
- "learning_rate": 0.0001330097087378641,
422
- "loss": 2.1678,
423
  "step": 69
424
  },
425
  {
426
- "epoch": 0.67,
427
- "learning_rate": 0.00013203883495145632,
428
- "loss": 2.0646,
429
  "step": 70
430
  },
431
  {
432
- "epoch": 0.68,
433
- "learning_rate": 0.00013106796116504855,
434
- "loss": 2.0049,
435
  "step": 71
436
  },
437
  {
438
- "epoch": 0.69,
439
- "learning_rate": 0.00013009708737864078,
440
- "loss": 2.0944,
441
  "step": 72
442
  },
443
  {
444
- "epoch": 0.7,
445
- "learning_rate": 0.00012912621359223304,
446
- "loss": 2.2013,
447
  "step": 73
448
  },
449
  {
450
- "epoch": 0.71,
451
- "learning_rate": 0.00012815533980582526,
452
- "loss": 2.311,
453
  "step": 74
454
  },
455
  {
456
- "epoch": 0.72,
457
- "learning_rate": 0.0001271844660194175,
458
- "loss": 2.0863,
459
  "step": 75
460
  },
461
  {
462
- "epoch": 0.73,
463
- "learning_rate": 0.00012621359223300972,
464
- "loss": 2.2028,
465
  "step": 76
466
  },
467
  {
468
- "epoch": 0.74,
469
- "learning_rate": 0.00012524271844660195,
470
- "loss": 2.0283,
471
  "step": 77
472
  },
473
  {
474
- "epoch": 0.75,
475
- "learning_rate": 0.00012427184466019418,
476
- "loss": 2.2133,
477
  "step": 78
478
  },
479
  {
480
- "epoch": 0.76,
481
- "learning_rate": 0.0001233009708737864,
482
- "loss": 2.1084,
483
  "step": 79
484
  },
485
  {
486
- "epoch": 0.77,
487
- "learning_rate": 0.00012233009708737864,
488
- "loss": 1.967,
489
  "step": 80
490
  },
491
  {
492
- "epoch": 0.78,
493
- "learning_rate": 0.00012135922330097087,
494
- "loss": 2.3109,
495
  "step": 81
496
  },
497
  {
498
- "epoch": 0.79,
499
- "learning_rate": 0.0001203883495145631,
500
- "loss": 2.2248,
501
  "step": 82
502
  },
503
  {
504
- "epoch": 0.8,
505
- "learning_rate": 0.00011941747572815534,
506
- "loss": 2.1178,
507
  "step": 83
508
  },
509
  {
510
- "epoch": 0.81,
511
- "learning_rate": 0.00011844660194174757,
512
- "loss": 2.161,
513
  "step": 84
514
  },
515
  {
516
- "epoch": 0.82,
517
- "learning_rate": 0.0001174757281553398,
518
- "loss": 2.0778,
519
  "step": 85
520
  },
521
  {
522
- "epoch": 0.83,
523
- "learning_rate": 0.00011650485436893204,
524
- "loss": 2.2326,
525
  "step": 86
526
  },
527
  {
528
- "epoch": 0.84,
529
- "learning_rate": 0.00011553398058252427,
530
- "loss": 2.0262,
531
  "step": 87
532
  },
533
  {
534
- "epoch": 0.85,
535
- "learning_rate": 0.0001145631067961165,
536
- "loss": 2.076,
537
  "step": 88
538
  },
539
  {
540
- "epoch": 0.86,
541
- "learning_rate": 0.00011359223300970874,
542
- "loss": 2.0044,
543
  "step": 89
544
  },
545
  {
546
- "epoch": 0.87,
547
- "learning_rate": 0.00011262135922330097,
548
- "loss": 2.0397,
549
  "step": 90
550
  },
551
  {
552
- "epoch": 0.88,
553
- "learning_rate": 0.0001116504854368932,
554
- "loss": 2.1135,
555
  "step": 91
556
  },
557
  {
558
- "epoch": 0.89,
559
- "learning_rate": 0.00011067961165048544,
560
- "loss": 2.29,
561
  "step": 92
562
  },
563
  {
564
- "epoch": 0.9,
565
- "learning_rate": 0.00010970873786407767,
566
- "loss": 1.949,
567
  "step": 93
568
  },
569
  {
570
- "epoch": 0.9,
571
- "learning_rate": 0.0001087378640776699,
572
- "loss": 2.1177,
573
  "step": 94
574
  },
575
  {
576
- "epoch": 0.91,
577
- "learning_rate": 0.00010776699029126213,
578
- "loss": 2.1405,
579
  "step": 95
580
  },
581
  {
582
- "epoch": 0.92,
583
- "learning_rate": 0.00010679611650485437,
584
- "loss": 2.2089,
585
  "step": 96
586
  },
587
  {
588
- "epoch": 0.93,
589
- "learning_rate": 0.0001058252427184466,
590
- "loss": 2.1267,
591
  "step": 97
592
  },
593
  {
594
- "epoch": 0.94,
595
- "learning_rate": 0.00010485436893203883,
596
- "loss": 2.0522,
597
  "step": 98
598
  },
599
  {
600
- "epoch": 0.95,
601
- "learning_rate": 0.00010388349514563107,
602
- "loss": 2.0273,
603
  "step": 99
604
  },
605
  {
606
- "epoch": 0.96,
607
- "learning_rate": 0.0001029126213592233,
608
- "loss": 2.1042,
609
  "step": 100
610
  }
611
  ],
612
  "logging_steps": 1,
613
- "max_steps": 206,
614
- "num_train_epochs": 2,
615
  "save_steps": 100,
616
- "total_flos": 2.9528801422848e+16,
617
  "trial_name": null,
618
  "trial_params": null
619
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.3429796355841372,
5
  "eval_steps": 500,
6
  "global_step": 100,
7
  "is_hyper_param_search": false,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0,
13
+ "learning_rate": 0.00019931271477663232,
14
+ "loss": 2.5587,
15
  "step": 1
16
  },
17
  {
18
+ "epoch": 0.01,
19
+ "learning_rate": 0.0001986254295532646,
20
+ "loss": 2.3914,
21
  "step": 2
22
  },
23
  {
24
+ "epoch": 0.01,
25
+ "learning_rate": 0.00019793814432989693,
26
+ "loss": 2.4218,
27
  "step": 3
28
  },
29
  {
30
+ "epoch": 0.01,
31
+ "learning_rate": 0.00019725085910652924,
32
+ "loss": 2.3414,
33
  "step": 4
34
  },
35
  {
36
+ "epoch": 0.02,
37
+ "learning_rate": 0.0001965635738831615,
38
+ "loss": 2.2469,
39
  "step": 5
40
  },
41
  {
42
+ "epoch": 0.02,
43
+ "learning_rate": 0.00019587628865979381,
44
+ "loss": 2.3241,
45
  "step": 6
46
  },
47
  {
48
+ "epoch": 0.02,
49
+ "learning_rate": 0.00019518900343642613,
50
+ "loss": 2.3266,
51
  "step": 7
52
  },
53
  {
54
+ "epoch": 0.03,
55
+ "learning_rate": 0.00019450171821305842,
56
+ "loss": 2.1856,
57
  "step": 8
58
  },
59
  {
60
+ "epoch": 0.03,
61
+ "learning_rate": 0.00019381443298969073,
62
+ "loss": 2.3247,
63
  "step": 9
64
  },
65
  {
66
+ "epoch": 0.03,
67
+ "learning_rate": 0.00019312714776632305,
68
+ "loss": 2.3245,
69
  "step": 10
70
  },
71
  {
72
+ "epoch": 0.04,
73
+ "learning_rate": 0.00019243986254295533,
74
+ "loss": 2.2591,
75
  "step": 11
76
  },
77
  {
78
+ "epoch": 0.04,
79
+ "learning_rate": 0.00019175257731958765,
80
+ "loss": 2.1767,
81
  "step": 12
82
  },
83
  {
84
+ "epoch": 0.04,
85
+ "learning_rate": 0.00019106529209621996,
86
+ "loss": 2.3478,
87
  "step": 13
88
  },
89
  {
90
+ "epoch": 0.05,
91
+ "learning_rate": 0.00019037800687285222,
92
+ "loss": 2.3339,
93
  "step": 14
94
  },
95
  {
96
+ "epoch": 0.05,
97
+ "learning_rate": 0.00018969072164948454,
98
+ "loss": 2.234,
99
  "step": 15
100
  },
101
  {
102
+ "epoch": 0.05,
103
+ "learning_rate": 0.00018900343642611685,
104
+ "loss": 2.2651,
105
  "step": 16
106
  },
107
  {
108
+ "epoch": 0.06,
109
+ "learning_rate": 0.00018831615120274914,
110
+ "loss": 2.1831,
111
  "step": 17
112
  },
113
  {
114
+ "epoch": 0.06,
115
+ "learning_rate": 0.00018762886597938145,
116
+ "loss": 2.216,
117
  "step": 18
118
  },
119
  {
120
+ "epoch": 0.07,
121
+ "learning_rate": 0.00018694158075601377,
122
+ "loss": 2.1359,
123
  "step": 19
124
  },
125
  {
126
+ "epoch": 0.07,
127
+ "learning_rate": 0.00018625429553264605,
128
+ "loss": 2.1215,
129
  "step": 20
130
  },
131
  {
132
+ "epoch": 0.07,
133
+ "learning_rate": 0.00018556701030927837,
134
+ "loss": 2.2179,
135
  "step": 21
136
  },
137
  {
138
+ "epoch": 0.08,
139
+ "learning_rate": 0.00018487972508591068,
140
+ "loss": 2.2598,
141
  "step": 22
142
  },
143
  {
144
+ "epoch": 0.08,
145
+ "learning_rate": 0.00018419243986254294,
146
+ "loss": 2.1813,
147
  "step": 23
148
  },
149
  {
150
+ "epoch": 0.08,
151
+ "learning_rate": 0.00018350515463917526,
152
+ "loss": 2.2006,
153
  "step": 24
154
  },
155
  {
156
+ "epoch": 0.09,
157
+ "learning_rate": 0.00018281786941580757,
158
+ "loss": 2.1564,
159
  "step": 25
160
  },
161
  {
162
+ "epoch": 0.09,
163
+ "learning_rate": 0.00018213058419243986,
164
+ "loss": 2.2537,
165
  "step": 26
166
  },
167
  {
168
+ "epoch": 0.09,
169
+ "learning_rate": 0.00018144329896907217,
170
+ "loss": 2.1975,
171
  "step": 27
172
  },
173
  {
174
+ "epoch": 0.1,
175
+ "learning_rate": 0.0001807560137457045,
176
+ "loss": 2.2566,
177
  "step": 28
178
  },
179
  {
180
+ "epoch": 0.1,
181
+ "learning_rate": 0.00018006872852233677,
182
+ "loss": 2.1464,
183
  "step": 29
184
  },
185
  {
186
+ "epoch": 0.1,
187
+ "learning_rate": 0.0001793814432989691,
188
+ "loss": 2.1421,
189
  "step": 30
190
  },
191
  {
192
+ "epoch": 0.11,
193
+ "learning_rate": 0.0001786941580756014,
194
+ "loss": 2.1276,
195
  "step": 31
196
  },
197
  {
198
+ "epoch": 0.11,
199
+ "learning_rate": 0.00017800687285223366,
200
+ "loss": 2.0649,
201
  "step": 32
202
  },
203
  {
204
+ "epoch": 0.11,
205
+ "learning_rate": 0.00017731958762886598,
206
+ "loss": 2.1835,
207
  "step": 33
208
  },
209
  {
210
+ "epoch": 0.12,
211
+ "learning_rate": 0.0001766323024054983,
212
+ "loss": 2.1711,
213
  "step": 34
214
  },
215
  {
216
+ "epoch": 0.12,
217
+ "learning_rate": 0.00017594501718213058,
218
+ "loss": 2.2591,
219
  "step": 35
220
  },
221
  {
222
+ "epoch": 0.12,
223
+ "learning_rate": 0.0001752577319587629,
224
+ "loss": 2.1471,
225
  "step": 36
226
  },
227
  {
228
+ "epoch": 0.13,
229
+ "learning_rate": 0.0001745704467353952,
230
+ "loss": 2.0861,
231
  "step": 37
232
  },
233
  {
234
+ "epoch": 0.13,
235
+ "learning_rate": 0.0001738831615120275,
236
+ "loss": 2.0702,
237
  "step": 38
238
  },
239
  {
240
+ "epoch": 0.13,
241
+ "learning_rate": 0.0001731958762886598,
242
+ "loss": 2.1096,
243
  "step": 39
244
  },
245
  {
246
+ "epoch": 0.14,
247
+ "learning_rate": 0.00017250859106529212,
248
+ "loss": 2.1062,
249
  "step": 40
250
  },
251
  {
252
+ "epoch": 0.14,
253
+ "learning_rate": 0.00017182130584192438,
254
+ "loss": 2.2545,
255
  "step": 41
256
  },
257
  {
258
+ "epoch": 0.14,
259
+ "learning_rate": 0.0001711340206185567,
260
+ "loss": 2.1572,
261
  "step": 42
262
  },
263
  {
264
+ "epoch": 0.15,
265
+ "learning_rate": 0.000170446735395189,
266
+ "loss": 2.0749,
267
  "step": 43
268
  },
269
  {
270
+ "epoch": 0.15,
271
+ "learning_rate": 0.0001697594501718213,
272
+ "loss": 2.1922,
273
  "step": 44
274
  },
275
  {
276
+ "epoch": 0.15,
277
+ "learning_rate": 0.00016907216494845361,
278
+ "loss": 2.1915,
279
  "step": 45
280
  },
281
  {
282
+ "epoch": 0.16,
283
+ "learning_rate": 0.00016838487972508593,
284
+ "loss": 2.1594,
285
  "step": 46
286
  },
287
  {
288
+ "epoch": 0.16,
289
+ "learning_rate": 0.00016769759450171822,
290
+ "loss": 2.176,
291
  "step": 47
292
  },
293
  {
294
+ "epoch": 0.16,
295
+ "learning_rate": 0.00016701030927835053,
296
+ "loss": 2.1223,
297
  "step": 48
298
  },
299
  {
300
+ "epoch": 0.17,
301
+ "learning_rate": 0.00016632302405498285,
302
+ "loss": 2.1263,
303
  "step": 49
304
  },
305
  {
306
+ "epoch": 0.17,
307
+ "learning_rate": 0.00016563573883161513,
308
+ "loss": 2.0481,
309
  "step": 50
310
  },
311
  {
312
+ "epoch": 0.17,
313
+ "learning_rate": 0.00016494845360824742,
314
+ "loss": 2.1043,
315
  "step": 51
316
  },
317
  {
318
+ "epoch": 0.18,
319
+ "learning_rate": 0.00016426116838487973,
320
+ "loss": 2.1678,
321
  "step": 52
322
  },
323
  {
324
+ "epoch": 0.18,
325
+ "learning_rate": 0.00016357388316151202,
326
+ "loss": 2.1602,
327
  "step": 53
328
  },
329
  {
330
+ "epoch": 0.19,
331
+ "learning_rate": 0.00016288659793814434,
332
+ "loss": 2.1448,
333
  "step": 54
334
  },
335
  {
336
+ "epoch": 0.19,
337
+ "learning_rate": 0.00016219931271477665,
338
+ "loss": 2.1536,
339
  "step": 55
340
  },
341
  {
342
+ "epoch": 0.19,
343
+ "learning_rate": 0.00016151202749140894,
344
+ "loss": 2.0339,
345
  "step": 56
346
  },
347
  {
348
+ "epoch": 0.2,
349
+ "learning_rate": 0.00016082474226804125,
350
+ "loss": 2.023,
351
  "step": 57
352
  },
353
  {
354
+ "epoch": 0.2,
355
+ "learning_rate": 0.00016013745704467357,
356
+ "loss": 2.1407,
357
  "step": 58
358
  },
359
  {
360
+ "epoch": 0.2,
361
+ "learning_rate": 0.00015945017182130585,
362
+ "loss": 2.1134,
363
  "step": 59
364
  },
365
  {
366
+ "epoch": 0.21,
367
+ "learning_rate": 0.00015876288659793814,
368
+ "loss": 2.1652,
369
  "step": 60
370
  },
371
  {
372
+ "epoch": 0.21,
373
+ "learning_rate": 0.00015807560137457046,
374
+ "loss": 2.0051,
375
  "step": 61
376
  },
377
  {
378
+ "epoch": 0.21,
379
+ "learning_rate": 0.00015738831615120274,
380
+ "loss": 2.0604,
381
  "step": 62
382
  },
383
  {
384
+ "epoch": 0.22,
385
+ "learning_rate": 0.00015670103092783506,
386
+ "loss": 2.1708,
387
  "step": 63
388
  },
389
  {
390
+ "epoch": 0.22,
391
+ "learning_rate": 0.00015601374570446737,
392
+ "loss": 2.1106,
393
  "step": 64
394
  },
395
  {
396
+ "epoch": 0.22,
397
+ "learning_rate": 0.00015532646048109966,
398
+ "loss": 2.1445,
399
  "step": 65
400
  },
401
  {
402
+ "epoch": 0.23,
403
+ "learning_rate": 0.00015463917525773197,
404
+ "loss": 2.0879,
405
  "step": 66
406
  },
407
  {
408
+ "epoch": 0.23,
409
+ "learning_rate": 0.0001539518900343643,
410
+ "loss": 2.1498,
411
  "step": 67
412
  },
413
  {
414
+ "epoch": 0.23,
415
+ "learning_rate": 0.00015326460481099657,
416
+ "loss": 2.0719,
417
  "step": 68
418
  },
419
  {
420
+ "epoch": 0.24,
421
+ "learning_rate": 0.00015257731958762886,
422
+ "loss": 2.2167,
423
  "step": 69
424
  },
425
  {
426
+ "epoch": 0.24,
427
+ "learning_rate": 0.00015189003436426118,
428
+ "loss": 2.0811,
429
  "step": 70
430
  },
431
  {
432
+ "epoch": 0.24,
433
+ "learning_rate": 0.00015120274914089346,
434
+ "loss": 2.1058,
435
  "step": 71
436
  },
437
  {
438
+ "epoch": 0.25,
439
+ "learning_rate": 0.00015051546391752578,
440
+ "loss": 2.0392,
441
  "step": 72
442
  },
443
  {
444
+ "epoch": 0.25,
445
+ "learning_rate": 0.0001498281786941581,
446
+ "loss": 2.0957,
447
  "step": 73
448
  },
449
  {
450
+ "epoch": 0.25,
451
+ "learning_rate": 0.00014914089347079038,
452
+ "loss": 1.9994,
453
  "step": 74
454
  },
455
  {
456
+ "epoch": 0.26,
457
+ "learning_rate": 0.0001484536082474227,
458
+ "loss": 2.0464,
459
  "step": 75
460
  },
461
  {
462
+ "epoch": 0.26,
463
+ "learning_rate": 0.000147766323024055,
464
+ "loss": 2.0417,
465
  "step": 76
466
  },
467
  {
468
+ "epoch": 0.26,
469
+ "learning_rate": 0.0001470790378006873,
470
+ "loss": 2.105,
471
  "step": 77
472
  },
473
  {
474
+ "epoch": 0.27,
475
+ "learning_rate": 0.00014639175257731958,
476
+ "loss": 2.1147,
477
  "step": 78
478
  },
479
  {
480
+ "epoch": 0.27,
481
+ "learning_rate": 0.0001457044673539519,
482
+ "loss": 1.9964,
483
  "step": 79
484
  },
485
  {
486
+ "epoch": 0.27,
487
+ "learning_rate": 0.00014501718213058418,
488
+ "loss": 1.9723,
489
  "step": 80
490
  },
491
  {
492
+ "epoch": 0.28,
493
+ "learning_rate": 0.0001443298969072165,
494
+ "loss": 2.0621,
495
  "step": 81
496
  },
497
  {
498
+ "epoch": 0.28,
499
+ "learning_rate": 0.00014364261168384881,
500
+ "loss": 2.2703,
501
  "step": 82
502
  },
503
  {
504
+ "epoch": 0.28,
505
+ "learning_rate": 0.0001429553264604811,
506
+ "loss": 2.0815,
507
  "step": 83
508
  },
509
  {
510
+ "epoch": 0.29,
511
+ "learning_rate": 0.00014226804123711342,
512
+ "loss": 2.0774,
513
  "step": 84
514
  },
515
  {
516
+ "epoch": 0.29,
517
+ "learning_rate": 0.00014158075601374573,
518
+ "loss": 2.066,
519
  "step": 85
520
  },
521
  {
522
+ "epoch": 0.29,
523
+ "learning_rate": 0.00014089347079037802,
524
+ "loss": 2.03,
525
  "step": 86
526
  },
527
  {
528
+ "epoch": 0.3,
529
+ "learning_rate": 0.0001402061855670103,
530
+ "loss": 2.1433,
531
  "step": 87
532
  },
533
  {
534
+ "epoch": 0.3,
535
+ "learning_rate": 0.00013951890034364262,
536
+ "loss": 2.0811,
537
  "step": 88
538
  },
539
  {
540
+ "epoch": 0.31,
541
+ "learning_rate": 0.0001388316151202749,
542
+ "loss": 1.9791,
543
  "step": 89
544
  },
545
  {
546
+ "epoch": 0.31,
547
+ "learning_rate": 0.00013814432989690722,
548
+ "loss": 2.0876,
549
  "step": 90
550
  },
551
  {
552
+ "epoch": 0.31,
553
+ "learning_rate": 0.00013745704467353953,
554
+ "loss": 2.0314,
555
  "step": 91
556
  },
557
  {
558
+ "epoch": 0.32,
559
+ "learning_rate": 0.00013676975945017182,
560
+ "loss": 1.9485,
561
  "step": 92
562
  },
563
  {
564
+ "epoch": 0.32,
565
+ "learning_rate": 0.00013608247422680414,
566
+ "loss": 2.078,
567
  "step": 93
568
  },
569
  {
570
+ "epoch": 0.32,
571
+ "learning_rate": 0.00013539518900343645,
572
+ "loss": 2.1251,
573
  "step": 94
574
  },
575
  {
576
+ "epoch": 0.33,
577
+ "learning_rate": 0.00013470790378006874,
578
+ "loss": 1.9736,
579
  "step": 95
580
  },
581
  {
582
+ "epoch": 0.33,
583
+ "learning_rate": 0.00013402061855670103,
584
+ "loss": 2.0189,
585
  "step": 96
586
  },
587
  {
588
+ "epoch": 0.33,
589
+ "learning_rate": 0.00013333333333333334,
590
+ "loss": 2.0061,
591
  "step": 97
592
  },
593
  {
594
+ "epoch": 0.34,
595
+ "learning_rate": 0.00013264604810996563,
596
+ "loss": 1.9595,
597
  "step": 98
598
  },
599
  {
600
+ "epoch": 0.34,
601
+ "learning_rate": 0.00013195876288659794,
602
+ "loss": 1.9702,
603
  "step": 99
604
  },
605
  {
606
+ "epoch": 0.34,
607
+ "learning_rate": 0.00013127147766323026,
608
+ "loss": 2.0322,
609
  "step": 100
610
  }
611
  ],
612
  "logging_steps": 1,
613
+ "max_steps": 291,
614
+ "num_train_epochs": 1,
615
  "save_steps": 100,
616
+ "total_flos": 6.083398113601536e+16,
617
  "trial_name": null,
618
  "trial_params": null
619
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c85ca10d57026468ea370fc1e96b5bd54f01f7c7e107e8275fff6dab39b89727
3
  size 4027
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e3410eea0ea0eb09ca576511099334880accaab360c0279f3099c9e4d2e877a
3
  size 4027