ccore committed on
Commit
54dca21
1 Parent(s): 86f875a

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -16,8 +16,8 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  This model is a fine-tuned version of [./core2/](https://huggingface.co/./core2/) on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
- - Loss: 2.5534
20
- - Accuracy: 0.4330
21
 
22
  ## Model description
23
 
@@ -40,10 +40,10 @@ The following hyperparameters were used during training:
40
  - train_batch_size: 1
41
  - eval_batch_size: 8
42
  - seed: 42
43
- - gradient_accumulation_steps: 8
44
- - total_train_batch_size: 8
45
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
46
- - lr_scheduler_type: linear
47
  - num_epochs: 1.0
48
 
49
  ### Training results
 
16
 
17
  This model is a fine-tuned version of [./core2/](https://huggingface.co/./core2/) on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
+ - Loss: 2.2227
20
+ - Accuracy: 0.4777
21
 
22
  ## Model description
23
 
 
40
  - train_batch_size: 1
41
  - eval_batch_size: 8
42
  - seed: 42
43
+ - gradient_accumulation_steps: 512
44
+ - total_train_batch_size: 512
45
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
46
+ - lr_scheduler_type: constant
47
  - num_epochs: 1.0
48
 
49
  ### Training results
all_results.json CHANGED
@@ -1,15 +1,15 @@
1
  {
2
- "epoch": 1.0,
3
- "eval_accuracy": 0.43297187933346976,
4
- "eval_loss": 2.553417444229126,
5
- "eval_runtime": 1.7282,
6
  "eval_samples": 129,
7
- "eval_samples_per_second": 74.646,
8
- "eval_steps_per_second": 9.837,
9
- "perplexity": 12.850946217445413,
10
- "train_loss": 3.3343640817306177,
11
- "train_runtime": 1458.5581,
12
- "train_samples": 51585,
13
- "train_samples_per_second": 35.367,
14
- "train_steps_per_second": 4.421
15
  }
 
1
  {
2
+ "epoch": 0.9,
3
+ "eval_accuracy": 0.4777027590230891,
4
+ "eval_loss": 2.222717761993408,
5
+ "eval_runtime": 1.797,
6
  "eval_samples": 129,
7
+ "eval_samples_per_second": 71.786,
8
+ "eval_steps_per_second": 9.46,
9
+ "perplexity": 9.2323882343288,
10
+ "train_loss": 2.689473125669691,
11
+ "train_runtime": 118.9327,
12
+ "train_samples": 5117,
13
+ "train_samples_per_second": 43.024,
14
+ "train_steps_per_second": 0.076
15
  }
eval_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "epoch": 1.0,
3
- "eval_accuracy": 0.43297187933346976,
4
- "eval_loss": 2.553417444229126,
5
- "eval_runtime": 1.7282,
6
  "eval_samples": 129,
7
- "eval_samples_per_second": 74.646,
8
- "eval_steps_per_second": 9.837,
9
- "perplexity": 12.850946217445413
10
  }
 
1
  {
2
+ "epoch": 0.9,
3
+ "eval_accuracy": 0.4777027590230891,
4
+ "eval_loss": 2.222717761993408,
5
+ "eval_runtime": 1.797,
6
  "eval_samples": 129,
7
+ "eval_samples_per_second": 71.786,
8
+ "eval_steps_per_second": 9.46,
9
+ "perplexity": 9.2323882343288
10
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6dd324b6cf685a7b5f40c8fbea96175df25641cafbbe0135d2c0bd3da3ac73e3
3
  size 929067029
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57f122b3164f3ec83b608251f67e4b6d4771c756f4fc5a1e7b9481acfbd9921e
3
  size 929067029
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 1.0,
3
- "train_loss": 3.3343640817306177,
4
- "train_runtime": 1458.5581,
5
- "train_samples": 51585,
6
- "train_samples_per_second": 35.367,
7
- "train_steps_per_second": 4.421
8
  }
 
1
  {
2
+ "epoch": 0.9,
3
+ "train_loss": 2.689473125669691,
4
+ "train_runtime": 118.9327,
5
+ "train_samples": 5117,
6
+ "train_samples_per_second": 43.024,
7
+ "train_steps_per_second": 0.076
8
  }
trainer_state.json CHANGED
@@ -1,796 +1,82 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9999806145197248,
5
  "eval_steps": 500,
6
- "global_step": 6448,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
- {
12
- "epoch": 0.01,
13
- "learning_rate": 0.0009922456575682382,
14
- "loss": 4.0749,
15
- "step": 50
16
- },
17
- {
18
- "epoch": 0.02,
19
- "learning_rate": 0.0009844913151364765,
20
- "loss": 3.9181,
21
- "step": 100
22
- },
23
- {
24
- "epoch": 0.02,
25
- "learning_rate": 0.0009767369727047147,
26
- "loss": 3.8669,
27
- "step": 150
28
- },
29
- {
30
- "epoch": 0.03,
31
- "learning_rate": 0.0009689826302729528,
32
- "loss": 3.8069,
33
- "step": 200
34
- },
35
- {
36
- "epoch": 0.04,
37
- "learning_rate": 0.000961228287841191,
38
- "loss": 3.7749,
39
- "step": 250
40
- },
41
- {
42
- "epoch": 0.05,
43
- "learning_rate": 0.0009534739454094294,
44
- "loss": 3.7757,
45
- "step": 300
46
- },
47
- {
48
- "epoch": 0.05,
49
- "learning_rate": 0.0009457196029776675,
50
- "loss": 3.7511,
51
- "step": 350
52
- },
53
- {
54
- "epoch": 0.06,
55
- "learning_rate": 0.0009379652605459057,
56
- "loss": 3.684,
57
- "step": 400
58
- },
59
- {
60
- "epoch": 0.07,
61
- "learning_rate": 0.000930210918114144,
62
- "loss": 3.6771,
63
- "step": 450
64
- },
65
- {
66
- "epoch": 0.08,
67
- "learning_rate": 0.0009224565756823822,
68
- "loss": 3.6402,
69
- "step": 500
70
- },
71
- {
72
- "epoch": 0.09,
73
- "learning_rate": 0.0009147022332506204,
74
- "loss": 3.6889,
75
- "step": 550
76
- },
77
- {
78
- "epoch": 0.09,
79
- "learning_rate": 0.0009069478908188585,
80
- "loss": 3.6868,
81
- "step": 600
82
- },
83
  {
84
  "epoch": 0.1,
85
- "learning_rate": 0.0008991935483870968,
86
- "loss": 3.6779,
87
- "step": 650
88
- },
89
- {
90
- "epoch": 0.11,
91
- "learning_rate": 0.000891439205955335,
92
- "loss": 3.6314,
93
- "step": 700
94
- },
95
- {
96
- "epoch": 0.12,
97
- "learning_rate": 0.0008836848635235732,
98
- "loss": 3.6376,
99
- "step": 750
100
- },
101
- {
102
- "epoch": 0.12,
103
- "learning_rate": 0.0008759305210918114,
104
- "loss": 3.6291,
105
- "step": 800
106
- },
107
- {
108
- "epoch": 0.13,
109
- "learning_rate": 0.0008681761786600497,
110
- "loss": 3.629,
111
- "step": 850
112
- },
113
- {
114
- "epoch": 0.14,
115
- "learning_rate": 0.0008604218362282879,
116
- "loss": 3.5972,
117
- "step": 900
118
- },
119
- {
120
- "epoch": 0.15,
121
- "learning_rate": 0.0008526674937965261,
122
- "loss": 3.6299,
123
- "step": 950
124
- },
125
- {
126
- "epoch": 0.16,
127
- "learning_rate": 0.0008449131513647643,
128
- "loss": 3.551,
129
- "step": 1000
130
- },
131
- {
132
- "epoch": 0.16,
133
- "learning_rate": 0.0008371588089330025,
134
- "loss": 3.5943,
135
- "step": 1050
136
- },
137
- {
138
- "epoch": 0.17,
139
- "learning_rate": 0.0008294044665012407,
140
- "loss": 3.5458,
141
- "step": 1100
142
- },
143
- {
144
- "epoch": 0.18,
145
- "learning_rate": 0.0008216501240694789,
146
- "loss": 3.581,
147
- "step": 1150
148
- },
149
- {
150
- "epoch": 0.19,
151
- "learning_rate": 0.0008138957816377171,
152
- "loss": 3.542,
153
- "step": 1200
154
- },
155
- {
156
- "epoch": 0.19,
157
- "learning_rate": 0.0008061414392059554,
158
- "loss": 3.5666,
159
- "step": 1250
160
  },
161
  {
162
  "epoch": 0.2,
163
- "learning_rate": 0.0007983870967741935,
164
- "loss": 3.5265,
165
- "step": 1300
166
- },
167
- {
168
- "epoch": 0.21,
169
- "learning_rate": 0.0007906327543424317,
170
- "loss": 3.5315,
171
- "step": 1350
172
- },
173
- {
174
- "epoch": 0.22,
175
- "learning_rate": 0.00078287841191067,
176
- "loss": 3.4934,
177
- "step": 1400
178
- },
179
- {
180
- "epoch": 0.22,
181
- "learning_rate": 0.0007751240694789083,
182
- "loss": 3.5086,
183
- "step": 1450
184
- },
185
- {
186
- "epoch": 0.23,
187
- "learning_rate": 0.0007673697270471465,
188
- "loss": 3.5028,
189
- "step": 1500
190
- },
191
- {
192
- "epoch": 0.24,
193
- "learning_rate": 0.0007596153846153846,
194
- "loss": 3.4822,
195
- "step": 1550
196
- },
197
- {
198
- "epoch": 0.25,
199
- "learning_rate": 0.0007518610421836228,
200
- "loss": 3.4943,
201
- "step": 1600
202
- },
203
- {
204
- "epoch": 0.26,
205
- "learning_rate": 0.0007441066997518611,
206
- "loss": 3.5014,
207
- "step": 1650
208
- },
209
- {
210
- "epoch": 0.26,
211
- "learning_rate": 0.0007363523573200993,
212
- "loss": 3.4705,
213
- "step": 1700
214
- },
215
- {
216
- "epoch": 0.27,
217
- "learning_rate": 0.0007285980148883374,
218
- "loss": 3.4899,
219
- "step": 1750
220
- },
221
- {
222
- "epoch": 0.28,
223
- "learning_rate": 0.0007208436724565756,
224
- "loss": 3.4403,
225
- "step": 1800
226
- },
227
- {
228
- "epoch": 0.29,
229
- "learning_rate": 0.0007130893300248139,
230
- "loss": 3.4123,
231
- "step": 1850
232
- },
233
- {
234
- "epoch": 0.29,
235
- "learning_rate": 0.0007053349875930521,
236
- "loss": 3.4231,
237
- "step": 1900
238
  },
239
  {
240
  "epoch": 0.3,
241
- "learning_rate": 0.0006975806451612903,
242
- "loss": 3.3957,
243
- "step": 1950
244
- },
245
- {
246
- "epoch": 0.31,
247
- "learning_rate": 0.0006898263027295286,
248
- "loss": 3.3722,
249
- "step": 2000
250
- },
251
- {
252
- "epoch": 0.32,
253
- "learning_rate": 0.0006820719602977668,
254
- "loss": 3.4255,
255
- "step": 2050
256
- },
257
- {
258
- "epoch": 0.33,
259
- "learning_rate": 0.000674317617866005,
260
- "loss": 3.4004,
261
- "step": 2100
262
- },
263
- {
264
- "epoch": 0.33,
265
- "learning_rate": 0.0006665632754342432,
266
- "loss": 3.3749,
267
- "step": 2150
268
- },
269
- {
270
- "epoch": 0.34,
271
- "learning_rate": 0.0006588089330024815,
272
- "loss": 3.3498,
273
- "step": 2200
274
- },
275
- {
276
- "epoch": 0.35,
277
- "learning_rate": 0.0006510545905707196,
278
- "loss": 3.4563,
279
- "step": 2250
280
- },
281
- {
282
- "epoch": 0.36,
283
- "learning_rate": 0.0006433002481389578,
284
- "loss": 3.392,
285
- "step": 2300
286
- },
287
- {
288
- "epoch": 0.36,
289
- "learning_rate": 0.000635545905707196,
290
- "loss": 3.3686,
291
- "step": 2350
292
- },
293
- {
294
- "epoch": 0.37,
295
- "learning_rate": 0.0006277915632754343,
296
- "loss": 3.3739,
297
- "step": 2400
298
- },
299
- {
300
- "epoch": 0.38,
301
- "learning_rate": 0.0006200372208436724,
302
- "loss": 3.3357,
303
- "step": 2450
304
- },
305
- {
306
- "epoch": 0.39,
307
- "learning_rate": 0.0006122828784119106,
308
- "loss": 3.3859,
309
- "step": 2500
310
- },
311
- {
312
- "epoch": 0.4,
313
- "learning_rate": 0.0006045285359801489,
314
- "loss": 3.3605,
315
- "step": 2550
316
  },
317
  {
318
  "epoch": 0.4,
319
- "learning_rate": 0.0005967741935483872,
320
- "loss": 3.361,
321
- "step": 2600
322
- },
323
- {
324
- "epoch": 0.41,
325
- "learning_rate": 0.0005890198511166254,
326
- "loss": 3.3729,
327
- "step": 2650
328
- },
329
- {
330
- "epoch": 0.42,
331
- "learning_rate": 0.0005812655086848635,
332
- "loss": 3.3592,
333
- "step": 2700
334
- },
335
- {
336
- "epoch": 0.43,
337
- "learning_rate": 0.0005735111662531017,
338
- "loss": 3.3576,
339
- "step": 2750
340
- },
341
- {
342
- "epoch": 0.43,
343
- "learning_rate": 0.00056575682382134,
344
- "loss": 3.3464,
345
- "step": 2800
346
- },
347
- {
348
- "epoch": 0.44,
349
- "learning_rate": 0.0005580024813895782,
350
- "loss": 3.3225,
351
- "step": 2850
352
- },
353
- {
354
- "epoch": 0.45,
355
- "learning_rate": 0.0005502481389578163,
356
- "loss": 3.3228,
357
- "step": 2900
358
- },
359
- {
360
- "epoch": 0.46,
361
- "learning_rate": 0.0005424937965260545,
362
- "loss": 3.3369,
363
- "step": 2950
364
- },
365
- {
366
- "epoch": 0.47,
367
- "learning_rate": 0.0005347394540942928,
368
- "loss": 3.2912,
369
- "step": 3000
370
- },
371
- {
372
- "epoch": 0.47,
373
- "learning_rate": 0.000526985111662531,
374
- "loss": 3.2722,
375
- "step": 3050
376
- },
377
- {
378
- "epoch": 0.48,
379
- "learning_rate": 0.0005192307692307693,
380
- "loss": 3.2878,
381
- "step": 3100
382
- },
383
- {
384
- "epoch": 0.49,
385
- "learning_rate": 0.0005114764267990075,
386
- "loss": 3.2558,
387
- "step": 3150
388
- },
389
- {
390
- "epoch": 0.5,
391
- "learning_rate": 0.0005037220843672457,
392
- "loss": 3.3018,
393
- "step": 3200
394
  },
395
  {
396
  "epoch": 0.5,
397
- "learning_rate": 0.0004959677419354839,
398
- "loss": 3.2785,
399
- "step": 3250
400
- },
401
- {
402
- "epoch": 0.51,
403
- "learning_rate": 0.00048821339950372213,
404
- "loss": 3.2249,
405
- "step": 3300
406
- },
407
- {
408
- "epoch": 0.52,
409
- "learning_rate": 0.0004804590570719603,
410
- "loss": 3.2703,
411
- "step": 3350
412
- },
413
- {
414
- "epoch": 0.53,
415
- "learning_rate": 0.00047270471464019853,
416
- "loss": 3.2871,
417
- "step": 3400
418
- },
419
- {
420
- "epoch": 0.54,
421
- "learning_rate": 0.0004649503722084367,
422
- "loss": 3.2357,
423
- "step": 3450
424
- },
425
- {
426
- "epoch": 0.54,
427
- "learning_rate": 0.000457196029776675,
428
- "loss": 3.2428,
429
- "step": 3500
430
- },
431
- {
432
- "epoch": 0.55,
433
- "learning_rate": 0.00044944168734491316,
434
- "loss": 3.2125,
435
- "step": 3550
436
- },
437
- {
438
- "epoch": 0.56,
439
- "learning_rate": 0.0004416873449131514,
440
- "loss": 3.2338,
441
- "step": 3600
442
- },
443
- {
444
- "epoch": 0.57,
445
- "learning_rate": 0.00043393300248138956,
446
- "loss": 3.288,
447
- "step": 3650
448
- },
449
- {
450
- "epoch": 0.57,
451
- "learning_rate": 0.0004261786600496278,
452
- "loss": 3.2564,
453
- "step": 3700
454
- },
455
- {
456
- "epoch": 0.58,
457
- "learning_rate": 0.000418424317617866,
458
- "loss": 3.1859,
459
- "step": 3750
460
- },
461
- {
462
- "epoch": 0.59,
463
- "learning_rate": 0.00041066997518610424,
464
- "loss": 3.2368,
465
- "step": 3800
466
  },
467
  {
468
  "epoch": 0.6,
469
- "learning_rate": 0.00040291563275434247,
470
- "loss": 3.2368,
471
- "step": 3850
472
- },
473
- {
474
- "epoch": 0.6,
475
- "learning_rate": 0.00039516129032258064,
476
- "loss": 3.1754,
477
- "step": 3900
478
- },
479
- {
480
- "epoch": 0.61,
481
- "learning_rate": 0.00038740694789081887,
482
- "loss": 3.2392,
483
- "step": 3950
484
- },
485
- {
486
- "epoch": 0.62,
487
- "learning_rate": 0.00037965260545905704,
488
- "loss": 3.2382,
489
- "step": 4000
490
- },
491
- {
492
- "epoch": 0.63,
493
- "learning_rate": 0.00037189826302729527,
494
- "loss": 3.2545,
495
- "step": 4050
496
- },
497
- {
498
- "epoch": 0.64,
499
- "learning_rate": 0.00036414392059553355,
500
- "loss": 3.1993,
501
- "step": 4100
502
- },
503
- {
504
- "epoch": 0.64,
505
- "learning_rate": 0.0003563895781637717,
506
- "loss": 3.1565,
507
- "step": 4150
508
- },
509
- {
510
- "epoch": 0.65,
511
- "learning_rate": 0.00034863523573200995,
512
- "loss": 3.2112,
513
- "step": 4200
514
- },
515
- {
516
- "epoch": 0.66,
517
- "learning_rate": 0.0003408808933002481,
518
- "loss": 3.1617,
519
- "step": 4250
520
- },
521
- {
522
- "epoch": 0.67,
523
- "learning_rate": 0.00033312655086848635,
524
- "loss": 3.1602,
525
- "step": 4300
526
- },
527
- {
528
- "epoch": 0.67,
529
- "learning_rate": 0.0003253722084367246,
530
- "loss": 3.1477,
531
- "step": 4350
532
- },
533
- {
534
- "epoch": 0.68,
535
- "learning_rate": 0.0003176178660049628,
536
- "loss": 3.1583,
537
- "step": 4400
538
- },
539
- {
540
- "epoch": 0.69,
541
- "learning_rate": 0.00030986352357320103,
542
- "loss": 3.207,
543
- "step": 4450
544
  },
545
  {
546
  "epoch": 0.7,
547
- "learning_rate": 0.0003021091811414392,
548
- "loss": 3.1408,
549
- "step": 4500
550
- },
551
- {
552
- "epoch": 0.71,
553
- "learning_rate": 0.00029435483870967743,
554
- "loss": 3.0965,
555
- "step": 4550
556
- },
557
- {
558
- "epoch": 0.71,
559
- "learning_rate": 0.0002866004962779156,
560
- "loss": 3.154,
561
- "step": 4600
562
- },
563
- {
564
- "epoch": 0.72,
565
- "learning_rate": 0.0002788461538461539,
566
- "loss": 3.1413,
567
- "step": 4650
568
- },
569
- {
570
- "epoch": 0.73,
571
- "learning_rate": 0.00027109181141439205,
572
- "loss": 3.1373,
573
- "step": 4700
574
- },
575
- {
576
- "epoch": 0.74,
577
- "learning_rate": 0.0002633374689826303,
578
- "loss": 3.1198,
579
- "step": 4750
580
- },
581
- {
582
- "epoch": 0.74,
583
- "learning_rate": 0.0002555831265508685,
584
- "loss": 3.1047,
585
- "step": 4800
586
- },
587
- {
588
- "epoch": 0.75,
589
- "learning_rate": 0.0002478287841191067,
590
- "loss": 3.1728,
591
- "step": 4850
592
- },
593
- {
594
- "epoch": 0.76,
595
- "learning_rate": 0.0002400744416873449,
596
- "loss": 3.1262,
597
- "step": 4900
598
- },
599
- {
600
- "epoch": 0.77,
601
- "learning_rate": 0.00023232009925558313,
602
- "loss": 3.1111,
603
- "step": 4950
604
- },
605
- {
606
- "epoch": 0.78,
607
- "learning_rate": 0.00022456575682382136,
608
- "loss": 3.1426,
609
- "step": 5000
610
- },
611
- {
612
- "epoch": 0.78,
613
- "learning_rate": 0.00021681141439205956,
614
- "loss": 3.1709,
615
- "step": 5050
616
- },
617
- {
618
- "epoch": 0.79,
619
- "learning_rate": 0.00020905707196029776,
620
- "loss": 3.1666,
621
- "step": 5100
622
  },
623
  {
624
  "epoch": 0.8,
625
- "learning_rate": 0.000201302729528536,
626
- "loss": 3.0971,
627
- "step": 5150
628
- },
629
- {
630
- "epoch": 0.81,
631
- "learning_rate": 0.0001935483870967742,
632
- "loss": 3.1585,
633
- "step": 5200
634
- },
635
- {
636
- "epoch": 0.81,
637
- "learning_rate": 0.00018579404466501241,
638
- "loss": 3.0784,
639
- "step": 5250
640
- },
641
- {
642
- "epoch": 0.82,
643
- "learning_rate": 0.00017803970223325061,
644
- "loss": 3.1064,
645
- "step": 5300
646
- },
647
- {
648
- "epoch": 0.83,
649
- "learning_rate": 0.00017028535980148884,
650
- "loss": 3.1601,
651
- "step": 5350
652
- },
653
- {
654
- "epoch": 0.84,
655
- "learning_rate": 0.00016253101736972707,
656
- "loss": 3.1306,
657
- "step": 5400
658
- },
659
- {
660
- "epoch": 0.85,
661
- "learning_rate": 0.00015477667493796527,
662
- "loss": 3.1461,
663
- "step": 5450
664
- },
665
- {
666
- "epoch": 0.85,
667
- "learning_rate": 0.00014702233250620347,
668
- "loss": 3.1139,
669
- "step": 5500
670
- },
671
- {
672
- "epoch": 0.86,
673
- "learning_rate": 0.0001392679900744417,
674
- "loss": 3.0884,
675
- "step": 5550
676
- },
677
- {
678
- "epoch": 0.87,
679
- "learning_rate": 0.0001315136476426799,
680
- "loss": 3.1221,
681
- "step": 5600
682
- },
683
- {
684
- "epoch": 0.88,
685
- "learning_rate": 0.00012375930521091812,
686
- "loss": 3.0781,
687
- "step": 5650
688
- },
689
- {
690
- "epoch": 0.88,
691
- "learning_rate": 0.00011600496277915632,
692
- "loss": 3.156,
693
- "step": 5700
694
- },
695
- {
696
- "epoch": 0.89,
697
- "learning_rate": 0.00010825062034739454,
698
- "loss": 3.1047,
699
- "step": 5750
700
  },
701
  {
702
  "epoch": 0.9,
703
- "learning_rate": 0.00010049627791563276,
704
- "loss": 3.1811,
705
- "step": 5800
706
- },
707
- {
708
- "epoch": 0.91,
709
- "learning_rate": 9.274193548387098e-05,
710
- "loss": 3.1423,
711
- "step": 5850
712
- },
713
- {
714
- "epoch": 0.91,
715
- "learning_rate": 8.498759305210918e-05,
716
- "loss": 3.1057,
717
- "step": 5900
718
- },
719
- {
720
- "epoch": 0.92,
721
- "learning_rate": 7.723325062034739e-05,
722
- "loss": 3.0984,
723
- "step": 5950
724
- },
725
- {
726
- "epoch": 0.93,
727
- "learning_rate": 6.947890818858562e-05,
728
- "loss": 3.1098,
729
- "step": 6000
730
- },
731
- {
732
- "epoch": 0.94,
733
- "learning_rate": 6.172456575682382e-05,
734
- "loss": 3.1232,
735
- "step": 6050
736
  },
737
  {
738
- "epoch": 0.95,
739
- "learning_rate": 5.3970223325062036e-05,
740
- "loss": 3.129,
741
- "step": 6100
742
- },
743
- {
744
- "epoch": 0.95,
745
- "learning_rate": 4.621588089330025e-05,
746
- "loss": 3.1156,
747
- "step": 6150
748
- },
749
- {
750
- "epoch": 0.96,
751
- "learning_rate": 3.846153846153846e-05,
752
- "loss": 3.1295,
753
- "step": 6200
754
- },
755
- {
756
- "epoch": 0.97,
757
- "learning_rate": 3.0707196029776676e-05,
758
- "loss": 3.0784,
759
- "step": 6250
760
- },
761
- {
762
- "epoch": 0.98,
763
- "learning_rate": 2.295285359801489e-05,
764
- "loss": 3.1048,
765
- "step": 6300
766
- },
767
- {
768
- "epoch": 0.98,
769
- "learning_rate": 1.5198511166253101e-05,
770
- "loss": 3.1145,
771
- "step": 6350
772
- },
773
- {
774
- "epoch": 0.99,
775
- "learning_rate": 7.444168734491316e-06,
776
- "loss": 3.185,
777
- "step": 6400
778
- },
779
- {
780
- "epoch": 1.0,
781
- "step": 6448,
782
- "total_flos": 1.0568400432109978e+17,
783
- "train_loss": 3.3343640817306177,
784
- "train_runtime": 1458.5581,
785
- "train_samples_per_second": 35.367,
786
- "train_steps_per_second": 4.421
787
  }
788
  ],
789
- "logging_steps": 50,
790
- "max_steps": 6448,
791
  "num_train_epochs": 1,
792
- "save_steps": -6448,
793
- "total_flos": 1.0568400432109978e+17,
794
  "trial_name": null,
795
  "trial_params": null
796
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9005276529216337,
5
  "eval_steps": 500,
6
+ "global_step": 9,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  {
12
  "epoch": 0.1,
13
+ "learning_rate": 0.001,
14
+ "loss": 2.3313,
15
+ "step": 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  },
17
  {
18
  "epoch": 0.2,
19
+ "learning_rate": 0.001,
20
+ "loss": 3.851,
21
+ "step": 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  },
23
  {
24
  "epoch": 0.3,
25
+ "learning_rate": 0.001,
26
+ "loss": 3.2287,
27
+ "step": 3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  },
29
  {
30
  "epoch": 0.4,
31
+ "learning_rate": 0.001,
32
+ "loss": 2.7855,
33
+ "step": 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  },
35
  {
36
  "epoch": 0.5,
37
+ "learning_rate": 0.001,
38
+ "loss": 2.6085,
39
+ "step": 5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  },
41
  {
42
  "epoch": 0.6,
43
+ "learning_rate": 0.001,
44
+ "loss": 2.4226,
45
+ "step": 6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  },
47
  {
48
  "epoch": 0.7,
49
+ "learning_rate": 0.001,
50
+ "loss": 2.3728,
51
+ "step": 7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  },
53
  {
54
  "epoch": 0.8,
55
+ "learning_rate": 0.001,
56
+ "loss": 2.3366,
57
+ "step": 8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  },
59
  {
60
  "epoch": 0.9,
61
+ "learning_rate": 0.001,
62
+ "loss": 2.2684,
63
+ "step": 9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  },
65
  {
66
+ "epoch": 0.9,
67
+ "step": 9,
68
+ "total_flos": 9440754728435712.0,
69
+ "train_loss": 2.689473125669691,
70
+ "train_runtime": 118.9327,
71
+ "train_samples_per_second": 43.024,
72
+ "train_steps_per_second": 0.076
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  }
74
  ],
75
+ "logging_steps": 1.0,
76
+ "max_steps": 9,
77
  "num_train_epochs": 1,
78
+ "save_steps": -9,
79
+ "total_flos": 9440754728435712.0,
80
  "trial_name": null,
81
  "trial_params": null
82
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:164c4b37b97a054742e9ab666c8f07d9e7fda1d553be739f57042a8801d6d49b
3
  size 4027
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b86cd19e50153cca84ec432ffe3a736db67de735000a21e789fbd7059d3ac7d6
3
  size 4027