Bingsu committed on
Commit
8313379
1 Parent(s): b97e6d1

Training in progress, step 130000

Browse files
.gitattributes CHANGED
File without changes
.gitignore CHANGED
File without changes
README.md CHANGED
File without changes
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:05e2d269f333c96c29da8075e36a6de506892a84e8ab7a1d79c6b5baf653edf5
3
  size 586828837
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbf6fc53cf1912d3e7691ef7613cd375aa4ebf7b7ad451c5645721500ea0ccf0
3
  size 586828837
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2cc943ae46672312ee4175b7b0df7b2bcb16bb1598452afd869122102f93e701
3
  size 146774203
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a97d7b725676a32c62a89d7830c299ebb9d3dfbfb1d9ac8f927a0fd779341bb2
3
  size 146774203
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bf3e3ff5ca04195d00ae182843134a34efdb2e565df68413f5842b7a4a84c37b
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0eb0b61bfcc70468942923a7fe3124d17f4bfdbc8fb34ab21c173f5aa5dd9170
3
  size 14503
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:17a6c740782a206d1a7821b1fbc9827af7a83dbc888bd997056c93056ef861be
3
  size 559
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c5b714fbb6e17634404af5447fa5ba38c8fc02f762871048ad92bc1ddf9e592
3
  size 559
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e2812a708855da00be5c7a2b5b6519990cb027a8d82f04f202c74834685819f6
3
  size 733555848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e15b10691a90bb623d65c87c7e8ed415b9ec774dd56c0ff077ab98b237c15c1
3
  size 733555848
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.51566991968441,
5
- "global_step": 120000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -3606,11 +3606,311 @@
3606
  "learning_rate": 0.003714012897878298,
3607
  "loss": 8.5978,
3608
  "step": 120000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3609
  }
3610
  ],
3611
  "max_steps": 1000000,
3612
  "num_train_epochs": 5,
3613
- "total_flos": 1.9125959786496e+17,
3614
  "trial_name": null,
3615
  "trial_params": null
3616
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.5586424129914441,
5
+ "global_step": 130000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
3606
  "learning_rate": 0.003714012897878298,
3607
  "loss": 8.5978,
3608
  "step": 120000
3609
+ },
3610
+ {
3611
+ "epoch": 0.52,
3612
+ "learning_rate": 0.0037235754018389664,
3613
+ "loss": 8.5986,
3614
+ "step": 120200
3615
+ },
3616
+ {
3617
+ "epoch": 0.52,
3618
+ "learning_rate": 0.0037331443821634266,
3619
+ "loss": 8.6062,
3620
+ "step": 120400
3621
+ },
3622
+ {
3623
+ "epoch": 0.52,
3624
+ "learning_rate": 0.0037427197968771695,
3625
+ "loss": 8.5854,
3626
+ "step": 120600
3627
+ },
3628
+ {
3629
+ "epoch": 0.52,
3630
+ "learning_rate": 0.0037523016039774605,
3631
+ "loss": 8.5959,
3632
+ "step": 120800
3633
+ },
3634
+ {
3635
+ "epoch": 0.52,
3636
+ "learning_rate": 0.003761841804919297,
3637
+ "loss": 8.6054,
3638
+ "step": 121000
3639
+ },
3640
+ {
3641
+ "epoch": 0.52,
3642
+ "learning_rate": 0.00377143623923569,
3643
+ "loss": 8.5871,
3644
+ "step": 121200
3645
+ },
3646
+ {
3647
+ "epoch": 0.52,
3648
+ "learning_rate": 0.0037810369399734253,
3649
+ "loss": 8.5885,
3650
+ "step": 121400
3651
+ },
3652
+ {
3653
+ "epoch": 0.52,
3654
+ "learning_rate": 0.0037906438650188523,
3655
+ "loss": 8.5805,
3656
+ "step": 121600
3657
+ },
3658
+ {
3659
+ "epoch": 0.52,
3660
+ "learning_rate": 0.0038002569722310163,
3661
+ "loss": 8.5889,
3662
+ "step": 121800
3663
+ },
3664
+ {
3665
+ "epoch": 0.52,
3666
+ "learning_rate": 0.0038098762194418477,
3667
+ "loss": 8.5866,
3668
+ "step": 122000
3669
+ },
3670
+ {
3671
+ "epoch": 0.53,
3672
+ "learning_rate": 0.0038195015644563388,
3673
+ "loss": 8.5782,
3674
+ "step": 122200
3675
+ },
3676
+ {
3677
+ "epoch": 0.53,
3678
+ "learning_rate": 0.0038291329650527338,
3679
+ "loss": 8.579,
3680
+ "step": 122400
3681
+ },
3682
+ {
3683
+ "epoch": 0.53,
3684
+ "learning_rate": 0.0038387703789827194,
3685
+ "loss": 8.5773,
3686
+ "step": 122600
3687
+ },
3688
+ {
3689
+ "epoch": 0.53,
3690
+ "learning_rate": 0.0038484137639716006,
3691
+ "loss": 8.582,
3692
+ "step": 122800
3693
+ },
3694
+ {
3695
+ "epoch": 0.53,
3696
+ "learning_rate": 0.0038580148164719733,
3697
+ "loss": 8.5778,
3698
+ "step": 123000
3699
+ },
3700
+ {
3701
+ "epoch": 0.53,
3702
+ "learning_rate": 0.0038676699873231536,
3703
+ "loss": 8.5789,
3704
+ "step": 123200
3705
+ },
3706
+ {
3707
+ "epoch": 0.53,
3708
+ "learning_rate": 0.0038773310024645593,
3709
+ "loss": 8.578,
3710
+ "step": 123400
3711
+ },
3712
+ {
3713
+ "epoch": 0.53,
3714
+ "learning_rate": 0.003886997819517974,
3715
+ "loss": 8.5609,
3716
+ "step": 123600
3717
+ },
3718
+ {
3719
+ "epoch": 0.53,
3720
+ "learning_rate": 0.003896670396079725,
3721
+ "loss": 8.5707,
3722
+ "step": 123800
3723
+ },
3724
+ {
3725
+ "epoch": 0.53,
3726
+ "learning_rate": 0.003906300284101649,
3727
+ "loss": 8.5732,
3728
+ "step": 124000
3729
+ },
3730
+ {
3731
+ "epoch": 0.53,
3732
+ "learning_rate": 0.003915984224100703,
3733
+ "loss": 8.5731,
3734
+ "step": 124200
3735
+ },
3736
+ {
3737
+ "epoch": 0.53,
3738
+ "learning_rate": 0.003925673796458692,
3739
+ "loss": 8.5496,
3740
+ "step": 124400
3741
+ },
3742
+ {
3743
+ "epoch": 0.54,
3744
+ "learning_rate": 0.0039353689586721285,
3745
+ "loss": 8.5692,
3746
+ "step": 124600
3747
+ },
3748
+ {
3749
+ "epoch": 0.54,
3750
+ "learning_rate": 0.0039450696682130065,
3751
+ "loss": 8.5704,
3752
+ "step": 124800
3753
+ },
3754
+ {
3755
+ "epoch": 0.54,
3756
+ "learning_rate": 0.003954775882528979,
3757
+ "loss": 8.5663,
3758
+ "step": 125000
3759
+ },
3760
+ {
3761
+ "epoch": 0.54,
3762
+ "learning_rate": 0.003964487559043562,
3763
+ "loss": 8.5697,
3764
+ "step": 125200
3765
+ },
3766
+ {
3767
+ "epoch": 0.54,
3768
+ "learning_rate": 0.003974204655156306,
3769
+ "loss": 8.5784,
3770
+ "step": 125400
3771
+ },
3772
+ {
3773
+ "epoch": 0.54,
3774
+ "learning_rate": 0.003983927128242989,
3775
+ "loss": 8.566,
3776
+ "step": 125600
3777
+ },
3778
+ {
3779
+ "epoch": 0.54,
3780
+ "learning_rate": 0.003993654935655802,
3781
+ "loss": 8.5501,
3782
+ "step": 125800
3783
+ },
3784
+ {
3785
+ "epoch": 0.54,
3786
+ "learning_rate": 0.004003388034723539,
3787
+ "loss": 8.5701,
3788
+ "step": 126000
3789
+ },
3790
+ {
3791
+ "epoch": 0.54,
3792
+ "learning_rate": 0.004013077678025505,
3793
+ "loss": 8.563,
3794
+ "step": 126200
3795
+ },
3796
+ {
3797
+ "epoch": 0.54,
3798
+ "learning_rate": 0.00402282120637189,
3799
+ "loss": 8.5541,
3800
+ "step": 126400
3801
+ },
3802
+ {
3803
+ "epoch": 0.54,
3804
+ "learning_rate": 0.004032569898434814,
3805
+ "loss": 8.5581,
3806
+ "step": 126600
3807
+ },
3808
+ {
3809
+ "epoch": 0.54,
3810
+ "learning_rate": 0.004042323711451458,
3811
+ "loss": 8.5513,
3812
+ "step": 126800
3813
+ },
3814
+ {
3815
+ "epoch": 0.55,
3816
+ "learning_rate": 0.004052082602636542,
3817
+ "loss": 8.5474,
3818
+ "step": 127000
3819
+ },
3820
+ {
3821
+ "epoch": 0.55,
3822
+ "learning_rate": 0.004061846529182508,
3823
+ "loss": 8.5427,
3824
+ "step": 127200
3825
+ },
3826
+ {
3827
+ "epoch": 0.55,
3828
+ "learning_rate": 0.004071615448259712,
3829
+ "loss": 8.5414,
3830
+ "step": 127400
3831
+ },
3832
+ {
3833
+ "epoch": 0.55,
3834
+ "learning_rate": 0.00408138931701661,
3835
+ "loss": 8.5516,
3836
+ "step": 127600
3837
+ },
3838
+ {
3839
+ "epoch": 0.55,
3840
+ "learning_rate": 0.004091168092579948,
3841
+ "loss": 8.5422,
3842
+ "step": 127800
3843
+ },
3844
+ {
3845
+ "epoch": 0.55,
3846
+ "learning_rate": 0.004100951732054943,
3847
+ "loss": 8.5457,
3848
+ "step": 128000
3849
+ },
3850
+ {
3851
+ "epoch": 0.55,
3852
+ "learning_rate": 0.004110740192525482,
3853
+ "loss": 8.5488,
3854
+ "step": 128200
3855
+ },
3856
+ {
3857
+ "epoch": 0.55,
3858
+ "learning_rate": 0.0041205334310543025,
3859
+ "loss": 8.5424,
3860
+ "step": 128400
3861
+ },
3862
+ {
3863
+ "epoch": 0.55,
3864
+ "learning_rate": 0.004130331404683179,
3865
+ "loss": 8.5408,
3866
+ "step": 128600
3867
+ },
3868
+ {
3869
+ "epoch": 0.55,
3870
+ "learning_rate": 0.004140134070433124,
3871
+ "loss": 8.5473,
3872
+ "step": 128800
3873
+ },
3874
+ {
3875
+ "epoch": 0.55,
3876
+ "learning_rate": 0.004149892337236666,
3877
+ "loss": 8.5442,
3878
+ "step": 129000
3879
+ },
3880
+ {
3881
+ "epoch": 0.56,
3882
+ "learning_rate": 0.004159704235286162,
3883
+ "loss": 8.5338,
3884
+ "step": 129200
3885
+ },
3886
+ {
3887
+ "epoch": 0.56,
3888
+ "learning_rate": 0.004169520696612262,
3889
+ "loss": 8.5434,
3890
+ "step": 129400
3891
+ },
3892
+ {
3893
+ "epoch": 0.56,
3894
+ "learning_rate": 0.004179341678154871,
3895
+ "loss": 8.5343,
3896
+ "step": 129600
3897
+ },
3898
+ {
3899
+ "epoch": 0.56,
3900
+ "learning_rate": 0.0041891671368340785,
3901
+ "loss": 8.5373,
3902
+ "step": 129800
3903
+ },
3904
+ {
3905
+ "epoch": 0.56,
3906
+ "learning_rate": 0.0041989970295503234,
3907
+ "loss": 8.5275,
3908
+ "step": 130000
3909
  }
3910
  ],
3911
  "max_steps": 1000000,
3912
  "num_train_epochs": 5,
3913
+ "total_flos": 2.0719789768704e+17,
3914
  "trial_name": null,
3915
  "trial_params": null
3916
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7c33b030231937c49711cbb55890f65b86f81a68638a6a5c30e4f67ed0b41b6f
3
  size 3375
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f266c72fbc4bc9e80a8144b40b5365bf1402777bd57f354fdb5cde802d5943e
3
  size 3375
merges.txt CHANGED
File without changes
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2cc943ae46672312ee4175b7b0df7b2bcb16bb1598452afd869122102f93e701
3
  size 146774203
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a97d7b725676a32c62a89d7830c299ebb9d3dfbfb1d9ac8f927a0fd779341bb2
3
  size 146774203
special_tokens_map.json CHANGED
File without changes
tokenizer.json CHANGED
File without changes
tokenizer_config.json CHANGED
File without changes
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7c33b030231937c49711cbb55890f65b86f81a68638a6a5c30e4f67ed0b41b6f
3
  size 3375
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f266c72fbc4bc9e80a8144b40b5365bf1402777bd57f354fdb5cde802d5943e
3
  size 3375
vocab.json CHANGED
File without changes