Nadav committed
Commit a667843
1 Parent(s): 951fb15

Training in progress, step 1950000

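For orientation, the files touched below are the standard per-checkpoint artifacts written by the Hugging Face Trainer: model weights (pytorch_model.bin), optimizer and LR-scheduler state (optimizer.pt, scheduler.pt), AMP grad-scaler state (scaler.pt), RNG snapshots (rng_state.pth), and the training log (trainer_state.json). A minimal, hypothetical sketch of inspecting them locally follows; it assumes the LFS objects have already been pulled, and nothing about the model or training setup beyond what this commit shows.

import torch

# Sketch only: paths match this commit's layout; run `git lfs pull` first so the
# pointer files below are replaced by the actual binaries.
ckpt = "last-checkpoint"

weights = torch.load(f"{ckpt}/pytorch_model.bin", map_location="cpu")   # model state_dict
optimizer = torch.load(f"{ckpt}/optimizer.pt", map_location="cpu")      # optimizer state_dict
scheduler = torch.load(f"{ckpt}/scheduler.pt", map_location="cpu")      # LR scheduler state_dict
scaler = torch.load(f"{ckpt}/scaler.pt", map_location="cpu")            # AMP GradScaler state
rng = torch.load(f"{ckpt}/rng_state.pth", map_location="cpu")           # python/numpy/torch RNG snapshots

print(f"{len(weights)} weight tensors, {len(optimizer['param_groups'])} optimizer param groups")

Resuming from such a directory is normally done with Trainer.train(resume_from_checkpoint="last-checkpoint"), which is presumably what advanced global_step from 1,900,000 to 1,950,000 in this commit.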
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:013d806030dbfcf7439287a4abebaead32865f32b1a6bb9b00de4deffabc4438
+oid sha256:24480137122a3ca1298b2aa2acbf1d8e05d75ba9f182abd41ff9618c60e00071
 size 893439185
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:768b581a3df87951d3b920a69301f7e3d38ff0a3a3da9d558409072ba37b7784
+oid sha256:4158aaedff079b2378ceb72199c920ad399c00fbc03838dbc3a2204ee0d64219
 size 449471589
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:341c3cc479ea146255e0f1fc73d571d276563929c345e49fa3a47d0d9e217d91
3
  size 21579
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a16c585a386790723cc51bc4a838a254dc71110b475f7ebf887ed7011d90a8f
3
  size 21579
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9efc4ae6f5bc3f2c21d5f173e4d0bb957724e4a8c6f3a076056d590a496511a8
+oid sha256:abaeb1638369c701afb9b3b4e706b5c028681adb6ebf26ba2bfe37402d287efd
 size 559
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:52048f50586841c25e23348acd1399bc0fa9f856ed80d753e4cd61c4863473be
+oid sha256:e2c8322c0057a49117b93f76b6d690bf483c56843cf994e2b3614611effcb47d
 size 623
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.05,
-  "global_step": 1900000,
+  "epoch": 0.075,
+  "global_step": 1950000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -14446,11 +14446,391 @@
       "eval_samples_per_second": 79.467,
       "eval_steps_per_second": 0.621,
       "step": 1900000
+    },
+    {
+      "epoch": 0.05,
+      "learning_rate": 1.105744066188684e-05,
+      "loss": 0.4371,
+      "step": 1901000
+    },
+    {
+      "epoch": 0.05,
+      "learning_rate": 1.1036390191576373e-05,
+      "loss": 0.4379,
+      "step": 1902000
+    },
+    {
+      "epoch": 0.05,
+      "learning_rate": 1.1015549796381372e-05,
+      "loss": 0.4373,
+      "step": 1903000
+    },
+    {
+      "epoch": 0.05,
+      "learning_rate": 1.0994960590538279e-05,
+      "loss": 0.4375,
+      "step": 1904000
+    },
+    {
+      "epoch": 0.05,
+      "learning_rate": 1.0974540114919287e-05,
+      "loss": 0.4363,
+      "step": 1905000
+    },
+    {
+      "epoch": 0.05,
+      "eval_loss": 0.4150693416595459,
+      "eval_runtime": 80.8571,
+      "eval_samples_per_second": 79.152,
+      "eval_steps_per_second": 0.618,
+      "step": 1905000
+    },
+    {
+      "epoch": 0.05,
+      "learning_rate": 1.0954329902821809e-05,
+      "loss": 0.4375,
+      "step": 1906000
+    },
+    {
+      "epoch": 0.05,
+      "learning_rate": 1.0934330015809674e-05,
+      "loss": 0.437,
+      "step": 1907000
+    },
+    {
+      "epoch": 0.05,
+      "learning_rate": 1.0914560199199067e-05,
+      "loss": 0.4379,
+      "step": 1908000
+    },
+    {
+      "epoch": 0.05,
+      "learning_rate": 1.0894980934009906e-05,
+      "loss": 0.4372,
+      "step": 1909000
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 1.0875612174693328e-05,
+      "loss": 0.437,
+      "step": 1910000
+    },
+    {
+      "epoch": 0.06,
+      "eval_loss": 0.4164562225341797,
+      "eval_runtime": 79.4864,
+      "eval_samples_per_second": 80.517,
+      "eval_steps_per_second": 0.629,
+      "step": 1910000
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 1.0856473033247752e-05,
+      "loss": 0.4369,
+      "step": 1911000
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 1.0837525251384567e-05,
+      "loss": 0.4367,
+      "step": 1912000
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 1.0818806782260748e-05,
+      "loss": 0.4367,
+      "step": 1913000
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 1.0800280208492865e-05,
+      "loss": 0.4379,
+      "step": 1914000
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 1.0781982639541429e-05,
+      "loss": 0.4373,
+      "step": 1915000
+    },
+    {
+      "epoch": 0.06,
+      "eval_loss": 0.4175663888454437,
+      "eval_runtime": 80.7249,
+      "eval_samples_per_second": 79.282,
+      "eval_steps_per_second": 0.619,
+      "step": 1915000
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 1.0763895499185767e-05,
+      "loss": 0.4362,
+      "step": 1916000
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 1.0746001053331784e-05,
+      "loss": 0.4367,
+      "step": 1917000
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 1.0728317567168942e-05,
+      "loss": 0.4373,
+      "step": 1918000
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 1.0710845094564199e-05,
+      "loss": 0.4377,
+      "step": 1919000
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 1.0693583688741745e-05,
+      "loss": 0.4364,
+      "step": 1920000
+    },
+    {
+      "epoch": 0.06,
+      "eval_loss": 0.4133068919181824,
+      "eval_runtime": 78.4611,
+      "eval_samples_per_second": 81.569,
+      "eval_steps_per_second": 0.637,
+      "step": 1920000
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 1.0676550347097805e-05,
+      "loss": 0.4376,
+      "step": 1921000
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 1.06597110207435e-05,
+      "loss": 0.437,
+      "step": 1922000
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 1.0643082916934733e-05,
+      "loss": 0.4378,
+      "step": 1923000
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 1.0626682397606544e-05,
+      "loss": 0.4365,
+      "step": 1924000
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 1.0610492778999931e-05,
+      "loss": 0.4366,
+      "step": 1925000
+    },
+    {
+      "epoch": 0.06,
+      "eval_loss": 0.41611722111701965,
+      "eval_runtime": 81.6547,
+      "eval_samples_per_second": 78.379,
+      "eval_steps_per_second": 0.612,
+      "step": 1925000
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 1.059449822137189e-05,
+      "loss": 0.4372,
+      "step": 1926000
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 1.0578715084938887e-05,
+      "loss": 0.4374,
+      "step": 1927000
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 1.0563143417779096e-05,
+      "loss": 0.4366,
+      "step": 1928000
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 1.0547798521808734e-05,
+      "loss": 0.437,
+      "step": 1929000
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 1.0532649723266384e-05,
+      "loss": 0.4365,
+      "step": 1930000
+    },
+    {
+      "epoch": 0.07,
+      "eval_loss": 0.4162156581878662,
+      "eval_runtime": 80.1168,
+      "eval_samples_per_second": 79.883,
+      "eval_steps_per_second": 0.624,
+      "step": 1930000
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 1.0517727365795085e-05,
+      "loss": 0.4369,
+      "step": 1931000
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 1.0503001620268975e-05,
+      "loss": 0.4373,
+      "step": 1932000
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 1.0488487574652423e-05,
+      "loss": 0.4374,
+      "step": 1933000
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 1.0474199469678468e-05,
+      "loss": 0.437,
+      "step": 1934000
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 1.0460108744063674e-05,
+      "loss": 0.4369,
+      "step": 1935000
+    },
+    {
+      "epoch": 0.07,
+      "eval_loss": 0.4142652451992035,
+      "eval_runtime": 77.8865,
+      "eval_samples_per_second": 82.171,
+      "eval_steps_per_second": 0.642,
+      "step": 1935000
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 1.0446243622089129e-05,
+      "loss": 0.4389,
+      "step": 1936000
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 1.0432576387995491e-05,
+      "loss": 0.4371,
+      "step": 1937000
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 1.0419121068338878e-05,
+      "loss": 0.4372,
+      "step": 1938000
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 1.0405877704106532e-05,
+      "loss": 0.4366,
+      "step": 1939000
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 1.0392859261103349e-05,
+      "loss": 0.4355,
+      "step": 1940000
+    },
+    {
+      "epoch": 0.07,
+      "eval_loss": 0.4190742075443268,
+      "eval_runtime": 80.8959,
+      "eval_samples_per_second": 79.114,
+      "eval_steps_per_second": 0.618,
+      "step": 1940000
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 1.0380039716043426e-05,
+      "loss": 0.4357,
+      "step": 1941000
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 1.0367432245456347e-05,
+      "loss": 0.4362,
+      "step": 1942000
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 1.0355049177141353e-05,
+      "loss": 0.4362,
+      "step": 1943000
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 1.0342865757898152e-05,
+      "loss": 0.437,
+      "step": 1944000
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 1.0330906391597708e-05,
+      "loss": 0.4357,
+      "step": 1945000
+    },
+    {
+      "epoch": 0.07,
+      "eval_loss": 0.4155297577381134,
+      "eval_runtime": 77.8064,
+      "eval_samples_per_second": 82.255,
+      "eval_steps_per_second": 0.643,
+      "step": 1945000
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 1.0319147172001108e-05,
+      "loss": 0.4367,
+      "step": 1946000
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 1.0307600212366596e-05,
+      "loss": 0.4355,
+      "step": 1947000
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 1.029627677647975e-05,
+      "loss": 0.4367,
+      "step": 1948000
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 1.0285154229298157e-05,
+      "loss": 0.4369,
+      "step": 1949000
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 1.0274244045627054e-05,
+      "loss": 0.4373,
+      "step": 1950000
+    },
+    {
+      "epoch": 0.07,
+      "eval_loss": 0.41703131794929504,
+      "eval_runtime": 77.2751,
+      "eval_samples_per_second": 82.821,
+      "eval_steps_per_second": 0.647,
+      "step": 1950000
     }
   ],
   "max_steps": 2000000,
   "num_train_epochs": 9223372036854775807,
-  "total_flos": 1.6648812204392448e+22,
+  "total_flos": 1.7086938841350144e+22,
   "trial_name": null,
   "trial_params": null
 }
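The hunk above appends the train/eval records for steps 1,901,000 through 1,950,000 to the state file's log history. As an illustrative sketch (the "log_history" key name is the standard Trainer field holding these records; the path follows this commit's layout), the curves can be pulled out like this:

import json

with open("last-checkpoint/trainer_state.json") as f:
    state = json.load(f)

# Split the log into training records (have "loss") and eval records (have "eval_loss").
train = [(r["step"], r["loss"]) for r in state["log_history"] if "loss" in r]
evals = [(r["step"], r["eval_loss"]) for r in state["log_history"] if "eval_loss" in r]

print(f'{state["global_step"]}/{state["max_steps"]} steps')
print("last train loss:", train[-1])  # (1950000, 0.4373) per the diff above
print("last eval loss :", evals[-1])  # (1950000, 0.41703131794929504)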
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:768b581a3df87951d3b920a69301f7e3d38ff0a3a3da9d558409072ba37b7784
+oid sha256:4158aaedff079b2378ceb72199c920ad399c00fbc03838dbc3a2204ee0d64219
 size 449471589
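Each changed entry above is a Git LFS pointer rather than the binary itself: it records only the blob's SHA-256 ("oid") and its byte "size". A small sketch for checking a downloaded file against its pointer; the path, hash, and size here are taken from the new pytorch_model.bin pointer in this commit:

import hashlib
import os

def matches_pointer(path: str, oid_sha256: str, size: int) -> bool:
    """Return True if the file's byte size and SHA-256 match the LFS pointer."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return os.path.getsize(path) == size and digest.hexdigest() == oid_sha256

print(matches_pointer(
    "pytorch_model.bin",
    "4158aaedff079b2378ceb72199c920ad399c00fbc03838dbc3a2204ee0d64219",
    449471589,
))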