Nadav committed on
Commit
552197f
1 Parent(s): 3eac625

Training in progress, step 1800000

last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c223f21c9f3d69fb40b6ad537a2d1e1726b01ec615931fd84b4f155a73edb6cb
+oid sha256:84eeca699785d889add4fce9e83fcf219cc03b8c3e8612092092ba4f022e339b
 size 893439185
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d4b2f64ee4b8a3f1cf3d86fb133d82c77bc0f7052c00d93cb35fb4180acc8509
+oid sha256:5a47c42cd40edaf177247b0f81cc113941e45da543bcd8075122f86f8a439a53
 size 449471589
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e052c7897af7d62d87b26b3f0036377845bb2408ce5c5d3e7b4078dbe5f611ef
+oid sha256:c22e615daa20a7523bf096df9dcc68366ed60a8151bafc863df6c6b53275a84a
 size 21643
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:af6d04926cbb05a843491ada6b24ca053dbb81e1dc7c6706a5415b4d4cca0e78
+oid sha256:8c4c724e259a52a66e7ae3019ca30f1baaafdcfcaf6dbe949cbda0206af52d55
 size 559
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cd9e8ca586c336641c0b85f2a85288a9eeaaab808e84d3e0180b33f991192ef6
+oid sha256:1a24dd415d95b2d83e758fabab0d2c6d80262a248eda13bb423bd8c9ef9f0d1d
 size 623
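
The checkpoint entries above change only their Git LFS pointers: each pointer records the spec version, the SHA-256 object id, and the byte size of the blob it stands in for (the sizes are unchanged here, only the oids move). As a minimal sketch, not part of this commit, a pulled blob can be checked against its pointer as follows; the helper name and paths are illustrative:

```python
import hashlib
from pathlib import Path


def verify_lfs_pointer(pointer_path: Path, blob_path: Path) -> bool:
    """Compare a downloaded blob against the oid/size recorded in its LFS pointer.

    Illustrative helper (not part of this repository); pass whatever paths
    you have pulled locally.
    """
    # Pointer files are "key value" lines: version, oid sha256:<hex>, size <bytes>.
    fields = dict(
        line.split(" ", 1)
        for line in pointer_path.read_text().splitlines()
        if line.strip()
    )
    expected_oid = fields["oid"].removeprefix("sha256:")
    expected_size = int(fields["size"])

    # Hash the blob in chunks so large checkpoints do not need to fit in memory.
    digest = hashlib.sha256()
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)

    return digest.hexdigest() == expected_oid and blob_path.stat().st_size == expected_size
```

For example, verify_lfs_pointer(Path("last-checkpoint/optimizer.pt"), Path(<local blob>)) should return True for a blob whose SHA-256 matches the new oid shown above.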
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.125,
-  "global_step": 1750000,
+  "epoch": 0.15,
+  "global_step": 1800000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -13306,11 +13306,391 @@
       "eval_samples_per_second": 80.645,
       "eval_steps_per_second": 0.63,
       "step": 1750000
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 1.6540859441048118e-05,
+      "loss": 0.4394,
+      "step": 1751000
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 1.6489389468730806e-05,
+      "loss": 0.439,
+      "step": 1752000
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 1.6438215320582125e-05,
+      "loss": 0.4382,
+      "step": 1753000
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 1.638713204573334e-05,
+      "loss": 0.4381,
+      "step": 1754000
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 1.6336293339368757e-05,
+      "loss": 0.4392,
+      "step": 1755000
+    },
+    {
+      "epoch": 0.13,
+      "eval_loss": 0.420003205537796,
+      "eval_runtime": 78.6855,
+      "eval_samples_per_second": 81.336,
+      "eval_steps_per_second": 0.635,
+      "step": 1755000
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 1.628559757718579e-05,
+      "loss": 0.439,
+      "step": 1756000
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 1.6235095900122255e-05,
+      "loss": 0.4383,
+      "step": 1757000
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 1.61848386723797e-05,
+      "loss": 0.4386,
+      "step": 1758000
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 1.6134725432005385e-05,
+      "loss": 0.4395,
+      "step": 1759000
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 1.6084856557803128e-05,
+      "loss": 0.438,
+      "step": 1760000
+    },
+    {
+      "epoch": 0.13,
+      "eval_loss": 0.4174318015575409,
+      "eval_runtime": 76.8661,
+      "eval_samples_per_second": 83.262,
+      "eval_steps_per_second": 0.65,
+      "step": 1760000
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 1.6035132364121584e-05,
+      "loss": 0.4388,
+      "step": 1761000
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 1.5985603018519935e-05,
+      "loss": 0.4378,
+      "step": 1762000
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 1.5936317908767756e-05,
+      "loss": 0.4368,
+      "step": 1763000
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 1.5887178516132736e-05,
+      "loss": 0.4412,
+      "step": 1764000
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 1.5838283268763148e-05,
+      "loss": 0.4383,
+      "step": 1765000
+    },
+    {
+      "epoch": 0.13,
+      "eval_loss": 0.41864004731178284,
+      "eval_runtime": 86.8232,
+      "eval_samples_per_second": 73.713,
+      "eval_steps_per_second": 0.576,
+      "step": 1765000
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 1.5789583078410045e-05,
+      "loss": 0.4389,
+      "step": 1766000
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 1.574102963743466e-05,
+      "loss": 0.439,
+      "step": 1767000
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 1.5692671940427092e-05,
+      "loss": 0.4385,
+      "step": 1768000
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 1.5644510134693248e-05,
+      "loss": 0.4384,
+      "step": 1769000
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 1.559654436694238e-05,
+      "loss": 0.4392,
+      "step": 1770000
+    },
+    {
+      "epoch": 0.14,
+      "eval_loss": 0.4172964096069336,
+      "eval_runtime": 79.9939,
+      "eval_samples_per_second": 80.006,
+      "eval_steps_per_second": 0.625,
+      "step": 1770000
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 1.5548822454827717e-05,
+      "loss": 0.4393,
+      "step": 1771000
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 1.5501249004379188e-05,
+      "loss": 0.4392,
+      "step": 1772000
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 1.54539193071009e-05,
+      "loss": 0.4386,
+      "step": 1773000
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 1.5406738753042658e-05,
+      "loss": 0.4393,
+      "step": 1774000
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 1.5359754961260252e-05,
+      "loss": 0.4387,
+      "step": 1775000
+    },
+    {
+      "epoch": 0.14,
+      "eval_loss": 0.4169865548610687,
+      "eval_runtime": 77.7982,
+      "eval_samples_per_second": 82.264,
+      "eval_steps_per_second": 0.643,
+      "step": 1775000
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 1.5312968074874446e-05,
+      "loss": 0.4381,
+      "step": 1776000
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 1.5266424727771944e-05,
+      "loss": 0.4396,
+      "step": 1777000
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 1.522003188188146e-05,
+      "loss": 0.4395,
+      "step": 1778000
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 1.517383636700831e-05,
+      "loss": 0.4382,
+      "step": 1779000
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 1.5127838323872036e-05,
+      "loss": 0.4364,
+      "step": 1780000
+    },
+    {
+      "epoch": 0.14,
+      "eval_loss": 0.41849958896636963,
+      "eval_runtime": 76.5418,
+      "eval_samples_per_second": 83.614,
+      "eval_steps_per_second": 0.653,
+      "step": 1780000
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 1.5082037892590664e-05,
+      "loss": 0.439,
+      "step": 1781000
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 1.5036480716537045e-05,
+      "loss": 0.4393,
+      "step": 1782000
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 1.4991121035047137e-05,
+      "loss": 0.4383,
+      "step": 1783000
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 1.4945913877821996e-05,
+      "loss": 0.4383,
+      "step": 1784000
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 1.4900904886625165e-05,
+      "loss": 0.4377,
+      "step": 1785000
+    },
+    {
+      "epoch": 0.14,
+      "eval_loss": 0.4204372465610504,
+      "eval_runtime": 76.9476,
+      "eval_samples_per_second": 83.173,
+      "eval_steps_per_second": 0.65,
+      "step": 1785000
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 1.4856138910151988e-05,
+      "loss": 0.4388,
+      "step": 1786000
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 1.4811526463215664e-05,
+      "loss": 0.4371,
+      "step": 1787000
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 1.476715690631307e-05,
+      "loss": 0.438,
+      "step": 1788000
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 1.4722941546682392e-05,
+      "loss": 0.4381,
+      "step": 1789000
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 1.4678968949438921e-05,
+      "loss": 0.4363,
+      "step": 1790000
+    },
+    {
+      "epoch": 0.14,
+      "eval_loss": 0.4183988869190216,
+      "eval_runtime": 76.9826,
+      "eval_samples_per_second": 83.136,
+      "eval_steps_per_second": 0.649,
+      "step": 1790000
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 1.4635151215325466e-05,
+      "loss": 0.4366,
+      "step": 1791000
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 1.4591576112997706e-05,
+      "loss": 0.4391,
+      "step": 1792000
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 1.4548156537772989e-05,
+      "loss": 0.4391,
+      "step": 1793000
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 1.4504936340214418e-05,
+      "loss": 0.4385,
+      "step": 1794000
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 1.4461958572967858e-05,
+      "loss": 0.4378,
+      "step": 1795000
+    },
+    {
+      "epoch": 0.15,
+      "eval_loss": 0.4223540425300598,
+      "eval_runtime": 77.2417,
+      "eval_samples_per_second": 82.857,
+      "eval_steps_per_second": 0.647,
+      "step": 1795000
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 1.4419137325396865e-05,
+      "loss": 0.4389,
+      "step": 1796000
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 1.437651584850691e-05,
+      "loss": 0.4386,
+      "step": 1797000
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 1.4334094272130413e-05,
+      "loss": 0.4367,
+      "step": 1798000
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 1.4291872725490842e-05,
+      "loss": 0.4384,
+      "step": 1799000
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 1.4249893258568889e-05,
+      "loss": 0.4384,
+      "step": 1800000
+    },
+    {
+      "epoch": 0.15,
+      "eval_loss": 0.4171189069747925,
+      "eval_runtime": 76.9772,
+      "eval_samples_per_second": 83.142,
+      "eval_steps_per_second": 0.65,
+      "step": 1800000
     }
   ],
   "max_steps": 2000000,
   "num_train_epochs": 9223372036854775807,
-  "total_flos": 1.533443229351936e+22,
+  "total_flos": 1.5772558930477056e+22,
   "trial_name": null,
   "trial_params": null
 }
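
The bulk of the change is sixty new log_history records (fifty training-loss entries and ten eval entries) covering steps 1751000 through 1800000, plus the updated total_flos. As a minimal sketch (assuming a local copy of last-checkpoint/trainer_state.json; the script itself is not part of this commit), the loss series can be pulled out of that file like so:

```python
import json

# Minimal sketch: read the checkpoint's trainer state and split log_history
# into a training-loss series and an eval-loss series.
with open("last-checkpoint/trainer_state.json") as f:
    state = json.load(f)

train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(state["global_step"], state["epoch"])  # 1800000 0.15 after this commit
print(train[-1])  # e.g. (1800000, 0.4384)
print(evals[-1])  # e.g. (1800000, 0.4171189069747925)
```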
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d4b2f64ee4b8a3f1cf3d86fb133d82c77bc0f7052c00d93cb35fb4180acc8509
+oid sha256:5a47c42cd40edaf177247b0f81cc113941e45da543bcd8075122f86f8a439a53
 size 449471589