Nadav committed on
Commit ab47c11
1 Parent(s): 6978a14

Training in progress, step 1700000

last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:7b7631287c17591d58dadd04be48644a33077deefb6d1d14f74e2f93c36aec91
+ oid sha256:a6263ea7e43a6acbefa798ce6055706ef15240d94f08fb8faefbf26e23ac3a25
 size 893439185
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:17a32ccd501686e19a8620f7d7e687e1f5c24ab7eeb4d02c97b0602fd3ef6b00
+ oid sha256:abc7a8543a963e582a29e31e1e0c78fea4345a1b73b925ed6cc4d7ab61edbd1e
 size 449471589
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:8d51d69ea58954b7718215d9f72065176cc7336fa6c38b909a08870ba5630677
+ oid sha256:f7eefc1725778458a372a52de0baec705be0fcd52c035947880ee6c60789db03
 size 21643
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:27f622c4d499409c918003e0ceb129f980c9b69f4e403e154b6d10d05411edba
+ oid sha256:9f873a347744a9c52f42be277b16c7300feca4fe83dae00b3348477c6cab3f68
 size 559
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:58f58a35ab7d2ab951aa69a43ad235e65fe40980754d4b9fb70c41c8a8f9f3fb
+ oid sha256:f676f9b0130b013ba493986d64992bf63d68d6bad5cd11e3728c43b657e50e05
 size 623
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
 {
 "best_metric": null,
 "best_model_checkpoint": null,
- "epoch": 0.075,
- "global_step": 1650000,
+ "epoch": 0.1,
+ "global_step": 1700000,
 "is_hyper_param_search": false,
 "is_local_process_zero": true,
 "is_world_process_zero": true,
@@ -12546,11 +12546,391 @@
 "eval_samples_per_second": 80.739,
 "eval_steps_per_second": 0.631,
 "step": 1650000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 2.263295082033955e-05,
+ "loss": 0.4402,
+ "step": 1651000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 2.256302851518958e-05,
+ "loss": 0.4408,
+ "step": 1652000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 2.2493281173015714e-05,
+ "loss": 0.4392,
+ "step": 1653000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 2.242370900628049e-05,
+ "loss": 0.4401,
+ "step": 1654000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 2.235438153601577e-05,
+ "loss": 0.4399,
+ "step": 1655000
+ },
+ {
+ "epoch": 0.08,
+ "eval_loss": 0.42048439383506775,
+ "eval_runtime": 77.8384,
+ "eval_samples_per_second": 82.222,
+ "eval_steps_per_second": 0.642,
+ "step": 1655000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 2.2285160179706007e-05,
+ "loss": 0.44,
+ "step": 1656000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 2.2216114632807524e-05,
+ "loss": 0.4404,
+ "step": 1657000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 2.214731388718044e-05,
+ "loss": 0.4406,
+ "step": 1658000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 2.2078620413208303e-05,
+ "loss": 0.4402,
+ "step": 1659000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 2.201010337780338e-05,
+ "loss": 0.4405,
+ "step": 1660000
+ },
+ {
+ "epoch": 0.08,
+ "eval_loss": 0.4228270649909973,
+ "eval_runtime": 78.2009,
+ "eval_samples_per_second": 81.841,
+ "eval_steps_per_second": 0.639,
+ "step": 1660000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 2.1941831241763897e-05,
+ "loss": 0.4402,
+ "step": 1661000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 2.1873667532140358e-05,
+ "loss": 0.439,
+ "step": 1662000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 2.1805748783540877e-05,
+ "loss": 0.44,
+ "step": 1663000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 2.1737939229421666e-05,
+ "loss": 0.4407,
+ "step": 1664000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 2.167037469500335e-05,
+ "loss": 0.4404,
+ "step": 1665000
+ },
+ {
+ "epoch": 0.08,
+ "eval_loss": 0.41908711194992065,
+ "eval_runtime": 78.8241,
+ "eval_samples_per_second": 81.193,
+ "eval_steps_per_second": 0.634,
+ "step": 1665000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 2.160292012180046e-05,
+ "loss": 0.4405,
+ "step": 1666000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 2.1535643436230335e-05,
+ "loss": 0.4401,
+ "step": 1667000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 2.146854484322948e-05,
+ "loss": 0.4403,
+ "step": 1668000
+ },
+ {
+ "epoch": 0.08,
+ "learning_rate": 2.140162454719184e-05,
+ "loss": 0.4418,
+ "step": 1669000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 2.1334882751968192e-05,
+ "loss": 0.4397,
+ "step": 1670000
+ },
+ {
+ "epoch": 0.09,
+ "eval_loss": 0.42122882604599,
+ "eval_runtime": 76.417,
+ "eval_samples_per_second": 83.751,
+ "eval_steps_per_second": 0.654,
+ "step": 1670000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 2.126838613462656e-05,
+ "loss": 0.4387,
+ "step": 1671000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 2.1202001771399895e-05,
+ "loss": 0.4387,
+ "step": 1672000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 2.1135796517072863e-05,
+ "loss": 0.4394,
+ "step": 1673000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 2.106977057331812e-05,
+ "loss": 0.4398,
+ "step": 1674000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 2.1003989897961326e-05,
+ "loss": 0.44,
+ "step": 1675000
+ },
+ {
+ "epoch": 0.09,
+ "eval_loss": 0.41976797580718994,
+ "eval_runtime": 77.5035,
+ "eval_samples_per_second": 82.577,
+ "eval_steps_per_second": 0.645,
+ "step": 1675000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 2.0938388575438328e-05,
+ "loss": 0.4403,
+ "step": 1676000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 2.0872901407947595e-05,
+ "loss": 0.4413,
+ "step": 1677000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 2.080759435185324e-05,
+ "loss": 0.4397,
+ "step": 1678000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 2.0742467606091935e-05,
+ "loss": 0.4395,
+ "step": 1679000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 2.0677586225058045e-05,
+ "loss": 0.4407,
+ "step": 1680000
+ },
+ {
+ "epoch": 0.09,
+ "eval_loss": 0.42079994082450867,
+ "eval_runtime": 79.6958,
+ "eval_samples_per_second": 80.305,
+ "eval_steps_per_second": 0.627,
+ "step": 1680000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 2.0612885189152567e-05,
+ "loss": 0.4399,
+ "step": 1681000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 2.0548300200510223e-05,
+ "loss": 0.4382,
+ "step": 1682000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 2.048389631205587e-05,
+ "loss": 0.4393,
+ "step": 1683000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 2.041967371997491e-05,
+ "loss": 0.4392,
+ "step": 1684000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 2.0355760520841843e-05,
+ "loss": 0.4403,
+ "step": 1685000
+ },
+ {
+ "epoch": 0.09,
+ "eval_loss": 0.41910338401794434,
+ "eval_runtime": 79.2458,
+ "eval_samples_per_second": 80.761,
+ "eval_steps_per_second": 0.631,
+ "step": 1685000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 2.0291900744285765e-05,
+ "loss": 0.4397,
+ "step": 1686000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 2.022822284895487e-05,
+ "loss": 0.4401,
+ "step": 1687000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 2.016472702882308e-05,
+ "loss": 0.4395,
+ "step": 1688000
+ },
+ {
+ "epoch": 0.09,
+ "learning_rate": 2.0101476699753774e-05,
+ "loss": 0.4394,
+ "step": 1689000
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 2.003840846723428e-05,
+ "loss": 0.4408,
+ "step": 1690000
+ },
+ {
+ "epoch": 0.1,
+ "eval_loss": 0.41959914565086365,
+ "eval_runtime": 79.6028,
+ "eval_samples_per_second": 80.399,
+ "eval_steps_per_second": 0.628,
+ "step": 1690000
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.9975459665494844e-05,
+ "loss": 0.4406,
+ "step": 1691000
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.9912693708915007e-05,
+ "loss": 0.4403,
+ "step": 1692000
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.9850110788690757e-05,
+ "loss": 0.4391,
+ "step": 1693000
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.978771109546051e-05,
+ "loss": 0.4388,
+ "step": 1694000
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.9725681193643978e-05,
+ "loss": 0.439,
+ "step": 1695000
+ },
+ {
+ "epoch": 0.1,
+ "eval_loss": 0.4219348132610321,
+ "eval_runtime": 78.5115,
+ "eval_samples_per_second": 81.517,
+ "eval_steps_per_second": 0.637,
+ "step": 1695000
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.9663647972981225e-05,
+ "loss": 0.4389,
+ "step": 1696000
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.9601798547310563e-05,
+ "loss": 0.4396,
+ "step": 1697000
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.954019467851605e-05,
+ "loss": 0.4405,
+ "step": 1698000
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.9478713223216454e-05,
+ "loss": 0.4403,
+ "step": 1699000
+ },
+ {
+ "epoch": 0.1,
+ "learning_rate": 1.9417416126252245e-05,
+ "loss": 0.4394,
+ "step": 1700000
+ },
+ {
+ "epoch": 0.1,
+ "eval_loss": 0.42123520374298096,
+ "eval_runtime": 78.2147,
+ "eval_samples_per_second": 81.826,
+ "eval_steps_per_second": 0.639,
+ "step": 1700000
 }
 ],
 "max_steps": 2000000,
 "num_train_epochs": 9223372036854775807,
- "total_flos": 1.4458179019603968e+22,
+ "total_flos": 1.4896305656561664e+22,
 "trial_name": null,
 "trial_params": null
 }
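
The trainer_state.json above follows the usual Hugging Face Trainer layout: a "log_history" list of records keyed by "step", with "loss" and "learning_rate" for training logs and "eval_*" fields for evaluation logs, plus top-level "global_step", "max_steps", and "total_flos". As a minimal sketch (assuming the checkpoint has been pulled locally so that last-checkpoint/trainer_state.json exists), the progress recorded in this commit can be inspected with the standard library alone:

```python
import json

# Path as it appears in this commit; adjust to the local checkout.
with open("last-checkpoint/trainer_state.json") as f:
    state = json.load(f)

print(f'global_step={state["global_step"]}  max_steps={state["max_steps"]}')

# Split the log into training records (have "loss") and eval records (have "eval_loss").
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

for e in train_logs[-3:]:
    print(f'step {e["step"]}: loss={e["loss"]}  lr={e["learning_rate"]:.3e}')
for e in eval_logs[-3:]:
    print(f'step {e["step"]}: eval_loss={e["eval_loss"]}')
```

Over steps 1651000 to 1700000 the logged training loss stays near 0.44 and the eval loss near 0.42, matching the entries added in this commit.
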
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:17a32ccd501686e19a8620f7d7e687e1f5c24ab7eeb4d02c97b0602fd3ef6b00
+ oid sha256:abc7a8543a963e582a29e31e1e0c78fea4345a1b73b925ed6cc4d7ab61edbd1e
 size 449471589
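
Every binary in this commit is stored as a Git LFS pointer of the form shown above (a "version" line, an "oid sha256:..." line, and a "size" line). A minimal sketch for checking that a locally downloaded blob matches its pointer, using only the standard library; the file paths in the example call are placeholders, not files in this repository:

```python
import hashlib
from pathlib import Path

def parse_pointer(text: str) -> dict:
    """Parse the key/value lines of a git-lfs spec/v1 pointer file."""
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    return {
        "oid": fields["oid"].removeprefix("sha256:"),
        "size": int(fields["size"]),
    }

def verify(pointer_path: str, blob_path: str) -> bool:
    """Return True if the blob's size and sha256 match the pointer."""
    ptr = parse_pointer(Path(pointer_path).read_text())
    blob = Path(blob_path)
    if blob.stat().st_size != ptr["size"]:
        return False
    digest = hashlib.sha256()
    with blob.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == ptr["oid"]

# Example (placeholder paths):
# verify("pytorch_model.bin.pointer", "pytorch_model.bin")
```
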