Joemgu commited on
Commit
ac13e60
1 Parent(s): 081d7ab

Training in progress, step 800

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:edc9882338455f81dd1e85ace29ccf7af7da229453c5afb0d6c4bc90350a8b18
3
  size 4736616809
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b93cb0f5af4a676996d113ae67c14903f845be14efc3d75962cc5c86990b4be
3
  size 4736616809
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b1ac718a5450b8f5bc3e788c6d41748c97c82f6bf2933bbce997b0073b1c8202
3
  size 2368281769
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b06061cdd59a61c03b74896e78f938e36d6d587093dddf2c3beb4c518798564
3
  size 2368281769
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:425eded187a10b59dd6706ea8bd8dddc527ddb901485b28745924b7945f3098d
3
  size 14575
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78177ec8a4e9181f496a71815f95534c1ccdd07dd8b38c39977074346212cd45
3
  size 14575
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb5f790e340963b8140823821a1411a83e471c6359b6fd89f0bb6a4aa0276e15
3
  size 627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc38c7f71de03e75da249f8cf736366cbbae8af7a495e0547a8a03a22691e8a0
3
  size 627
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "best_metric": 2.047600030899048,
3
- "best_model_checkpoint": "output/checkpoint-600",
4
- "epoch": 0.41021039306488055,
5
- "global_step": 600,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -3630,11 +3630,1219 @@
3630
  "eval_samples_per_second": 5.759,
3631
  "eval_steps_per_second": 5.759,
3632
  "step": 600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3633
  }
3634
  ],
3635
  "max_steps": 4386,
3636
  "num_train_epochs": 3,
3637
- "total_flos": 7.368734337271511e+17,
3638
  "trial_name": null,
3639
  "trial_params": null
3640
  }
 
1
  {
2
+ "best_metric": 2.0316452980041504,
3
+ "best_model_checkpoint": "output/checkpoint-800",
4
+ "epoch": 0.5469471907531741,
5
+ "global_step": 800,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
3630
  "eval_samples_per_second": 5.759,
3631
  "eval_steps_per_second": 5.759,
3632
  "step": 600
3633
+ },
3634
+ {
3635
+ "epoch": 0.41,
3636
+ "learning_rate": 0.0005425226946966077,
3637
+ "loss": 2.0728,
3638
+ "step": 601
3639
+ },
3640
+ {
3641
+ "epoch": 0.41,
3642
+ "learning_rate": 0.0005423793597706641,
3643
+ "loss": 2.051,
3644
+ "step": 602
3645
+ },
3646
+ {
3647
+ "epoch": 0.41,
3648
+ "learning_rate": 0.0005422360248447204,
3649
+ "loss": 2.1657,
3650
+ "step": 603
3651
+ },
3652
+ {
3653
+ "epoch": 0.41,
3654
+ "learning_rate": 0.0005420926899187768,
3655
+ "loss": 2.1084,
3656
+ "step": 604
3657
+ },
3658
+ {
3659
+ "epoch": 0.41,
3660
+ "learning_rate": 0.0005419493549928332,
3661
+ "loss": 2.1312,
3662
+ "step": 605
3663
+ },
3664
+ {
3665
+ "epoch": 0.41,
3666
+ "learning_rate": 0.0005418060200668896,
3667
+ "loss": 2.0724,
3668
+ "step": 606
3669
+ },
3670
+ {
3671
+ "epoch": 0.41,
3672
+ "learning_rate": 0.000541662685140946,
3673
+ "loss": 2.0999,
3674
+ "step": 607
3675
+ },
3676
+ {
3677
+ "epoch": 0.42,
3678
+ "learning_rate": 0.0005415193502150024,
3679
+ "loss": 2.1649,
3680
+ "step": 608
3681
+ },
3682
+ {
3683
+ "epoch": 0.42,
3684
+ "learning_rate": 0.0005413760152890587,
3685
+ "loss": 2.1458,
3686
+ "step": 609
3687
+ },
3688
+ {
3689
+ "epoch": 0.42,
3690
+ "learning_rate": 0.0005412326803631151,
3691
+ "loss": 2.071,
3692
+ "step": 610
3693
+ },
3694
+ {
3695
+ "epoch": 0.42,
3696
+ "learning_rate": 0.0005410893454371715,
3697
+ "loss": 2.1931,
3698
+ "step": 611
3699
+ },
3700
+ {
3701
+ "epoch": 0.42,
3702
+ "learning_rate": 0.0005409460105112279,
3703
+ "loss": 2.1968,
3704
+ "step": 612
3705
+ },
3706
+ {
3707
+ "epoch": 0.42,
3708
+ "learning_rate": 0.0005408026755852842,
3709
+ "loss": 2.105,
3710
+ "step": 613
3711
+ },
3712
+ {
3713
+ "epoch": 0.42,
3714
+ "learning_rate": 0.0005406593406593407,
3715
+ "loss": 2.1429,
3716
+ "step": 614
3717
+ },
3718
+ {
3719
+ "epoch": 0.42,
3720
+ "learning_rate": 0.0005405160057333969,
3721
+ "loss": 2.0812,
3722
+ "step": 615
3723
+ },
3724
+ {
3725
+ "epoch": 0.42,
3726
+ "learning_rate": 0.0005403726708074534,
3727
+ "loss": 2.1389,
3728
+ "step": 616
3729
+ },
3730
+ {
3731
+ "epoch": 0.42,
3732
+ "learning_rate": 0.0005402293358815097,
3733
+ "loss": 2.1685,
3734
+ "step": 617
3735
+ },
3736
+ {
3737
+ "epoch": 0.42,
3738
+ "learning_rate": 0.0005400860009555662,
3739
+ "loss": 2.2248,
3740
+ "step": 618
3741
+ },
3742
+ {
3743
+ "epoch": 0.42,
3744
+ "learning_rate": 0.0005399426660296225,
3745
+ "loss": 2.1771,
3746
+ "step": 619
3747
+ },
3748
+ {
3749
+ "epoch": 0.42,
3750
+ "learning_rate": 0.0005397993311036789,
3751
+ "loss": 2.1094,
3752
+ "step": 620
3753
+ },
3754
+ {
3755
+ "epoch": 0.42,
3756
+ "learning_rate": 0.0005396559961777352,
3757
+ "loss": 2.141,
3758
+ "step": 621
3759
+ },
3760
+ {
3761
+ "epoch": 0.43,
3762
+ "learning_rate": 0.0005395126612517916,
3763
+ "loss": 2.2442,
3764
+ "step": 622
3765
+ },
3766
+ {
3767
+ "epoch": 0.43,
3768
+ "learning_rate": 0.000539369326325848,
3769
+ "loss": 2.1703,
3770
+ "step": 623
3771
+ },
3772
+ {
3773
+ "epoch": 0.43,
3774
+ "learning_rate": 0.0005392259913999044,
3775
+ "loss": 2.1351,
3776
+ "step": 624
3777
+ },
3778
+ {
3779
+ "epoch": 0.43,
3780
+ "learning_rate": 0.0005390826564739608,
3781
+ "loss": 2.2008,
3782
+ "step": 625
3783
+ },
3784
+ {
3785
+ "epoch": 0.43,
3786
+ "learning_rate": 0.0005389393215480172,
3787
+ "loss": 2.2114,
3788
+ "step": 626
3789
+ },
3790
+ {
3791
+ "epoch": 0.43,
3792
+ "learning_rate": 0.0005387959866220735,
3793
+ "loss": 2.1368,
3794
+ "step": 627
3795
+ },
3796
+ {
3797
+ "epoch": 0.43,
3798
+ "learning_rate": 0.0005386526516961299,
3799
+ "loss": 2.2127,
3800
+ "step": 628
3801
+ },
3802
+ {
3803
+ "epoch": 0.43,
3804
+ "learning_rate": 0.0005385093167701863,
3805
+ "loss": 2.2554,
3806
+ "step": 629
3807
+ },
3808
+ {
3809
+ "epoch": 0.43,
3810
+ "learning_rate": 0.0005383659818442427,
3811
+ "loss": 2.143,
3812
+ "step": 630
3813
+ },
3814
+ {
3815
+ "epoch": 0.43,
3816
+ "learning_rate": 0.0005382226469182991,
3817
+ "loss": 2.1306,
3818
+ "step": 631
3819
+ },
3820
+ {
3821
+ "epoch": 0.43,
3822
+ "learning_rate": 0.0005380793119923555,
3823
+ "loss": 2.1257,
3824
+ "step": 632
3825
+ },
3826
+ {
3827
+ "epoch": 0.43,
3828
+ "learning_rate": 0.0005379359770664118,
3829
+ "loss": 2.2013,
3830
+ "step": 633
3831
+ },
3832
+ {
3833
+ "epoch": 0.43,
3834
+ "learning_rate": 0.0005377926421404681,
3835
+ "loss": 2.1856,
3836
+ "step": 634
3837
+ },
3838
+ {
3839
+ "epoch": 0.43,
3840
+ "learning_rate": 0.0005376493072145246,
3841
+ "loss": 2.1917,
3842
+ "step": 635
3843
+ },
3844
+ {
3845
+ "epoch": 0.43,
3846
+ "learning_rate": 0.0005375059722885809,
3847
+ "loss": 2.1044,
3848
+ "step": 636
3849
+ },
3850
+ {
3851
+ "epoch": 0.44,
3852
+ "learning_rate": 0.0005373626373626374,
3853
+ "loss": 2.1909,
3854
+ "step": 637
3855
+ },
3856
+ {
3857
+ "epoch": 0.44,
3858
+ "learning_rate": 0.0005372193024366937,
3859
+ "loss": 2.0706,
3860
+ "step": 638
3861
+ },
3862
+ {
3863
+ "epoch": 0.44,
3864
+ "learning_rate": 0.0005370759675107501,
3865
+ "loss": 2.142,
3866
+ "step": 639
3867
+ },
3868
+ {
3869
+ "epoch": 0.44,
3870
+ "learning_rate": 0.0005369326325848064,
3871
+ "loss": 2.1813,
3872
+ "step": 640
3873
+ },
3874
+ {
3875
+ "epoch": 0.44,
3876
+ "learning_rate": 0.0005367892976588628,
3877
+ "loss": 2.1802,
3878
+ "step": 641
3879
+ },
3880
+ {
3881
+ "epoch": 0.44,
3882
+ "learning_rate": 0.0005366459627329192,
3883
+ "loss": 2.1461,
3884
+ "step": 642
3885
+ },
3886
+ {
3887
+ "epoch": 0.44,
3888
+ "learning_rate": 0.0005365026278069756,
3889
+ "loss": 2.045,
3890
+ "step": 643
3891
+ },
3892
+ {
3893
+ "epoch": 0.44,
3894
+ "learning_rate": 0.000536359292881032,
3895
+ "loss": 2.1449,
3896
+ "step": 644
3897
+ },
3898
+ {
3899
+ "epoch": 0.44,
3900
+ "learning_rate": 0.0005362159579550883,
3901
+ "loss": 2.0893,
3902
+ "step": 645
3903
+ },
3904
+ {
3905
+ "epoch": 0.44,
3906
+ "learning_rate": 0.0005360726230291447,
3907
+ "loss": 2.1477,
3908
+ "step": 646
3909
+ },
3910
+ {
3911
+ "epoch": 0.44,
3912
+ "learning_rate": 0.0005359292881032011,
3913
+ "loss": 2.172,
3914
+ "step": 647
3915
+ },
3916
+ {
3917
+ "epoch": 0.44,
3918
+ "learning_rate": 0.0005357859531772575,
3919
+ "loss": 2.2154,
3920
+ "step": 648
3921
+ },
3922
+ {
3923
+ "epoch": 0.44,
3924
+ "learning_rate": 0.0005356426182513139,
3925
+ "loss": 2.0651,
3926
+ "step": 649
3927
+ },
3928
+ {
3929
+ "epoch": 0.44,
3930
+ "learning_rate": 0.0005354992833253703,
3931
+ "loss": 2.2091,
3932
+ "step": 650
3933
+ },
3934
+ {
3935
+ "epoch": 0.45,
3936
+ "learning_rate": 0.0005353559483994266,
3937
+ "loss": 2.1892,
3938
+ "step": 651
3939
+ },
3940
+ {
3941
+ "epoch": 0.45,
3942
+ "learning_rate": 0.000535212613473483,
3943
+ "loss": 2.1727,
3944
+ "step": 652
3945
+ },
3946
+ {
3947
+ "epoch": 0.45,
3948
+ "learning_rate": 0.0005350692785475393,
3949
+ "loss": 2.1532,
3950
+ "step": 653
3951
+ },
3952
+ {
3953
+ "epoch": 0.45,
3954
+ "learning_rate": 0.0005349259436215958,
3955
+ "loss": 2.0929,
3956
+ "step": 654
3957
+ },
3958
+ {
3959
+ "epoch": 0.45,
3960
+ "learning_rate": 0.0005347826086956521,
3961
+ "loss": 2.1364,
3962
+ "step": 655
3963
+ },
3964
+ {
3965
+ "epoch": 0.45,
3966
+ "learning_rate": 0.0005346392737697086,
3967
+ "loss": 2.1704,
3968
+ "step": 656
3969
+ },
3970
+ {
3971
+ "epoch": 0.45,
3972
+ "learning_rate": 0.0005344959388437648,
3973
+ "loss": 2.116,
3974
+ "step": 657
3975
+ },
3976
+ {
3977
+ "epoch": 0.45,
3978
+ "learning_rate": 0.0005343526039178213,
3979
+ "loss": 2.2498,
3980
+ "step": 658
3981
+ },
3982
+ {
3983
+ "epoch": 0.45,
3984
+ "learning_rate": 0.0005342092689918776,
3985
+ "loss": 2.2484,
3986
+ "step": 659
3987
+ },
3988
+ {
3989
+ "epoch": 0.45,
3990
+ "learning_rate": 0.000534065934065934,
3991
+ "loss": 2.0954,
3992
+ "step": 660
3993
+ },
3994
+ {
3995
+ "epoch": 0.45,
3996
+ "learning_rate": 0.0005339225991399904,
3997
+ "loss": 2.1747,
3998
+ "step": 661
3999
+ },
4000
+ {
4001
+ "epoch": 0.45,
4002
+ "learning_rate": 0.0005337792642140468,
4003
+ "loss": 2.1775,
4004
+ "step": 662
4005
+ },
4006
+ {
4007
+ "epoch": 0.45,
4008
+ "learning_rate": 0.0005336359292881031,
4009
+ "loss": 2.2085,
4010
+ "step": 663
4011
+ },
4012
+ {
4013
+ "epoch": 0.45,
4014
+ "learning_rate": 0.0005334925943621595,
4015
+ "loss": 2.1576,
4016
+ "step": 664
4017
+ },
4018
+ {
4019
+ "epoch": 0.45,
4020
+ "learning_rate": 0.0005333492594362159,
4021
+ "loss": 2.1618,
4022
+ "step": 665
4023
+ },
4024
+ {
4025
+ "epoch": 0.46,
4026
+ "learning_rate": 0.0005332059245102723,
4027
+ "loss": 2.1005,
4028
+ "step": 666
4029
+ },
4030
+ {
4031
+ "epoch": 0.46,
4032
+ "learning_rate": 0.0005330625895843287,
4033
+ "loss": 2.1803,
4034
+ "step": 667
4035
+ },
4036
+ {
4037
+ "epoch": 0.46,
4038
+ "learning_rate": 0.0005329192546583851,
4039
+ "loss": 2.036,
4040
+ "step": 668
4041
+ },
4042
+ {
4043
+ "epoch": 0.46,
4044
+ "learning_rate": 0.0005327759197324414,
4045
+ "loss": 2.1275,
4046
+ "step": 669
4047
+ },
4048
+ {
4049
+ "epoch": 0.46,
4050
+ "learning_rate": 0.0005326325848064978,
4051
+ "loss": 2.1677,
4052
+ "step": 670
4053
+ },
4054
+ {
4055
+ "epoch": 0.46,
4056
+ "learning_rate": 0.0005324892498805542,
4057
+ "loss": 2.1649,
4058
+ "step": 671
4059
+ },
4060
+ {
4061
+ "epoch": 0.46,
4062
+ "learning_rate": 0.0005323459149546106,
4063
+ "loss": 2.1187,
4064
+ "step": 672
4065
+ },
4066
+ {
4067
+ "epoch": 0.46,
4068
+ "learning_rate": 0.000532202580028667,
4069
+ "loss": 2.0904,
4070
+ "step": 673
4071
+ },
4072
+ {
4073
+ "epoch": 0.46,
4074
+ "learning_rate": 0.0005320592451027233,
4075
+ "loss": 2.1749,
4076
+ "step": 674
4077
+ },
4078
+ {
4079
+ "epoch": 0.46,
4080
+ "learning_rate": 0.0005319159101767797,
4081
+ "loss": 2.1062,
4082
+ "step": 675
4083
+ },
4084
+ {
4085
+ "epoch": 0.46,
4086
+ "learning_rate": 0.000531772575250836,
4087
+ "loss": 2.1854,
4088
+ "step": 676
4089
+ },
4090
+ {
4091
+ "epoch": 0.46,
4092
+ "learning_rate": 0.0005316292403248925,
4093
+ "loss": 2.1493,
4094
+ "step": 677
4095
+ },
4096
+ {
4097
+ "epoch": 0.46,
4098
+ "learning_rate": 0.0005314859053989488,
4099
+ "loss": 2.2119,
4100
+ "step": 678
4101
+ },
4102
+ {
4103
+ "epoch": 0.46,
4104
+ "learning_rate": 0.0005313425704730053,
4105
+ "loss": 2.104,
4106
+ "step": 679
4107
+ },
4108
+ {
4109
+ "epoch": 0.46,
4110
+ "learning_rate": 0.0005311992355470616,
4111
+ "loss": 2.2132,
4112
+ "step": 680
4113
+ },
4114
+ {
4115
+ "epoch": 0.47,
4116
+ "learning_rate": 0.0005310559006211179,
4117
+ "loss": 2.2275,
4118
+ "step": 681
4119
+ },
4120
+ {
4121
+ "epoch": 0.47,
4122
+ "learning_rate": 0.0005309125656951743,
4123
+ "loss": 2.146,
4124
+ "step": 682
4125
+ },
4126
+ {
4127
+ "epoch": 0.47,
4128
+ "learning_rate": 0.0005307692307692307,
4129
+ "loss": 2.2401,
4130
+ "step": 683
4131
+ },
4132
+ {
4133
+ "epoch": 0.47,
4134
+ "learning_rate": 0.0005306258958432871,
4135
+ "loss": 2.1453,
4136
+ "step": 684
4137
+ },
4138
+ {
4139
+ "epoch": 0.47,
4140
+ "learning_rate": 0.0005304825609173435,
4141
+ "loss": 2.1154,
4142
+ "step": 685
4143
+ },
4144
+ {
4145
+ "epoch": 0.47,
4146
+ "learning_rate": 0.0005303392259913999,
4147
+ "loss": 2.2073,
4148
+ "step": 686
4149
+ },
4150
+ {
4151
+ "epoch": 0.47,
4152
+ "learning_rate": 0.0005301958910654562,
4153
+ "loss": 2.3166,
4154
+ "step": 687
4155
+ },
4156
+ {
4157
+ "epoch": 0.47,
4158
+ "learning_rate": 0.0005300525561395126,
4159
+ "loss": 2.1223,
4160
+ "step": 688
4161
+ },
4162
+ {
4163
+ "epoch": 0.47,
4164
+ "learning_rate": 0.000529909221213569,
4165
+ "loss": 2.1857,
4166
+ "step": 689
4167
+ },
4168
+ {
4169
+ "epoch": 0.47,
4170
+ "learning_rate": 0.0005297658862876254,
4171
+ "loss": 2.1932,
4172
+ "step": 690
4173
+ },
4174
+ {
4175
+ "epoch": 0.47,
4176
+ "learning_rate": 0.0005296225513616818,
4177
+ "loss": 2.1361,
4178
+ "step": 691
4179
+ },
4180
+ {
4181
+ "epoch": 0.47,
4182
+ "learning_rate": 0.0005294792164357382,
4183
+ "loss": 2.1133,
4184
+ "step": 692
4185
+ },
4186
+ {
4187
+ "epoch": 0.47,
4188
+ "learning_rate": 0.0005293358815097945,
4189
+ "loss": 2.137,
4190
+ "step": 693
4191
+ },
4192
+ {
4193
+ "epoch": 0.47,
4194
+ "learning_rate": 0.0005291925465838509,
4195
+ "loss": 2.1077,
4196
+ "step": 694
4197
+ },
4198
+ {
4199
+ "epoch": 0.48,
4200
+ "learning_rate": 0.0005290492116579072,
4201
+ "loss": 2.2026,
4202
+ "step": 695
4203
+ },
4204
+ {
4205
+ "epoch": 0.48,
4206
+ "learning_rate": 0.0005289058767319637,
4207
+ "loss": 2.1443,
4208
+ "step": 696
4209
+ },
4210
+ {
4211
+ "epoch": 0.48,
4212
+ "learning_rate": 0.00052876254180602,
4213
+ "loss": 2.0933,
4214
+ "step": 697
4215
+ },
4216
+ {
4217
+ "epoch": 0.48,
4218
+ "learning_rate": 0.0005286192068800765,
4219
+ "loss": 2.1728,
4220
+ "step": 698
4221
+ },
4222
+ {
4223
+ "epoch": 0.48,
4224
+ "learning_rate": 0.0005284758719541327,
4225
+ "loss": 2.1026,
4226
+ "step": 699
4227
+ },
4228
+ {
4229
+ "epoch": 0.48,
4230
+ "learning_rate": 0.0005283325370281892,
4231
+ "loss": 2.1373,
4232
+ "step": 700
4233
+ },
4234
+ {
4235
+ "epoch": 0.48,
4236
+ "learning_rate": 0.0005281892021022455,
4237
+ "loss": 2.0439,
4238
+ "step": 701
4239
+ },
4240
+ {
4241
+ "epoch": 0.48,
4242
+ "learning_rate": 0.0005280458671763019,
4243
+ "loss": 2.1893,
4244
+ "step": 702
4245
+ },
4246
+ {
4247
+ "epoch": 0.48,
4248
+ "learning_rate": 0.0005279025322503583,
4249
+ "loss": 2.2672,
4250
+ "step": 703
4251
+ },
4252
+ {
4253
+ "epoch": 0.48,
4254
+ "learning_rate": 0.0005277591973244147,
4255
+ "loss": 2.176,
4256
+ "step": 704
4257
+ },
4258
+ {
4259
+ "epoch": 0.48,
4260
+ "learning_rate": 0.000527615862398471,
4261
+ "loss": 2.1351,
4262
+ "step": 705
4263
+ },
4264
+ {
4265
+ "epoch": 0.48,
4266
+ "learning_rate": 0.0005274725274725274,
4267
+ "loss": 2.0995,
4268
+ "step": 706
4269
+ },
4270
+ {
4271
+ "epoch": 0.48,
4272
+ "learning_rate": 0.0005273291925465838,
4273
+ "loss": 2.1155,
4274
+ "step": 707
4275
+ },
4276
+ {
4277
+ "epoch": 0.48,
4278
+ "learning_rate": 0.0005271858576206402,
4279
+ "loss": 2.1424,
4280
+ "step": 708
4281
+ },
4282
+ {
4283
+ "epoch": 0.48,
4284
+ "learning_rate": 0.0005270425226946966,
4285
+ "loss": 2.2112,
4286
+ "step": 709
4287
+ },
4288
+ {
4289
+ "epoch": 0.49,
4290
+ "learning_rate": 0.000526899187768753,
4291
+ "loss": 2.1792,
4292
+ "step": 710
4293
+ },
4294
+ {
4295
+ "epoch": 0.49,
4296
+ "learning_rate": 0.0005267558528428093,
4297
+ "loss": 2.133,
4298
+ "step": 711
4299
+ },
4300
+ {
4301
+ "epoch": 0.49,
4302
+ "learning_rate": 0.0005266125179168657,
4303
+ "loss": 2.1542,
4304
+ "step": 712
4305
+ },
4306
+ {
4307
+ "epoch": 0.49,
4308
+ "learning_rate": 0.0005264691829909221,
4309
+ "loss": 2.1184,
4310
+ "step": 713
4311
+ },
4312
+ {
4313
+ "epoch": 0.49,
4314
+ "learning_rate": 0.0005263258480649785,
4315
+ "loss": 2.0157,
4316
+ "step": 714
4317
+ },
4318
+ {
4319
+ "epoch": 0.49,
4320
+ "learning_rate": 0.0005261825131390349,
4321
+ "loss": 2.1661,
4322
+ "step": 715
4323
+ },
4324
+ {
4325
+ "epoch": 0.49,
4326
+ "learning_rate": 0.0005260391782130912,
4327
+ "loss": 2.2518,
4328
+ "step": 716
4329
+ },
4330
+ {
4331
+ "epoch": 0.49,
4332
+ "learning_rate": 0.0005258958432871477,
4333
+ "loss": 2.0832,
4334
+ "step": 717
4335
+ },
4336
+ {
4337
+ "epoch": 0.49,
4338
+ "learning_rate": 0.0005257525083612039,
4339
+ "loss": 2.2078,
4340
+ "step": 718
4341
+ },
4342
+ {
4343
+ "epoch": 0.49,
4344
+ "learning_rate": 0.0005256091734352604,
4345
+ "loss": 2.085,
4346
+ "step": 719
4347
+ },
4348
+ {
4349
+ "epoch": 0.49,
4350
+ "learning_rate": 0.0005254658385093167,
4351
+ "loss": 2.1,
4352
+ "step": 720
4353
+ },
4354
+ {
4355
+ "epoch": 0.49,
4356
+ "learning_rate": 0.0005253225035833732,
4357
+ "loss": 2.0984,
4358
+ "step": 721
4359
+ },
4360
+ {
4361
+ "epoch": 0.49,
4362
+ "learning_rate": 0.0005251791686574295,
4363
+ "loss": 2.1313,
4364
+ "step": 722
4365
+ },
4366
+ {
4367
+ "epoch": 0.49,
4368
+ "learning_rate": 0.0005250358337314858,
4369
+ "loss": 2.0344,
4370
+ "step": 723
4371
+ },
4372
+ {
4373
+ "epoch": 0.49,
4374
+ "learning_rate": 0.0005248924988055422,
4375
+ "loss": 2.2206,
4376
+ "step": 724
4377
+ },
4378
+ {
4379
+ "epoch": 0.5,
4380
+ "learning_rate": 0.0005247491638795986,
4381
+ "loss": 2.2121,
4382
+ "step": 725
4383
+ },
4384
+ {
4385
+ "epoch": 0.5,
4386
+ "learning_rate": 0.000524605828953655,
4387
+ "loss": 2.1267,
4388
+ "step": 726
4389
+ },
4390
+ {
4391
+ "epoch": 0.5,
4392
+ "learning_rate": 0.0005244624940277114,
4393
+ "loss": 2.0358,
4394
+ "step": 727
4395
+ },
4396
+ {
4397
+ "epoch": 0.5,
4398
+ "learning_rate": 0.0005243191591017678,
4399
+ "loss": 2.1301,
4400
+ "step": 728
4401
+ },
4402
+ {
4403
+ "epoch": 0.5,
4404
+ "learning_rate": 0.0005241758241758241,
4405
+ "loss": 2.1678,
4406
+ "step": 729
4407
+ },
4408
+ {
4409
+ "epoch": 0.5,
4410
+ "learning_rate": 0.0005240324892498805,
4411
+ "loss": 2.0717,
4412
+ "step": 730
4413
+ },
4414
+ {
4415
+ "epoch": 0.5,
4416
+ "learning_rate": 0.0005238891543239369,
4417
+ "loss": 2.1041,
4418
+ "step": 731
4419
+ },
4420
+ {
4421
+ "epoch": 0.5,
4422
+ "learning_rate": 0.0005237458193979933,
4423
+ "loss": 2.1661,
4424
+ "step": 732
4425
+ },
4426
+ {
4427
+ "epoch": 0.5,
4428
+ "learning_rate": 0.0005236024844720497,
4429
+ "loss": 2.2038,
4430
+ "step": 733
4431
+ },
4432
+ {
4433
+ "epoch": 0.5,
4434
+ "learning_rate": 0.0005234591495461061,
4435
+ "loss": 2.0797,
4436
+ "step": 734
4437
+ },
4438
+ {
4439
+ "epoch": 0.5,
4440
+ "learning_rate": 0.0005233158146201624,
4441
+ "loss": 1.9513,
4442
+ "step": 735
4443
+ },
4444
+ {
4445
+ "epoch": 0.5,
4446
+ "learning_rate": 0.0005231724796942188,
4447
+ "loss": 2.0952,
4448
+ "step": 736
4449
+ },
4450
+ {
4451
+ "epoch": 0.5,
4452
+ "learning_rate": 0.0005230291447682751,
4453
+ "loss": 2.0112,
4454
+ "step": 737
4455
+ },
4456
+ {
4457
+ "epoch": 0.5,
4458
+ "learning_rate": 0.0005228858098423316,
4459
+ "loss": 2.1552,
4460
+ "step": 738
4461
+ },
4462
+ {
4463
+ "epoch": 0.51,
4464
+ "learning_rate": 0.0005227424749163879,
4465
+ "loss": 2.1163,
4466
+ "step": 739
4467
+ },
4468
+ {
4469
+ "epoch": 0.51,
4470
+ "learning_rate": 0.0005225991399904444,
4471
+ "loss": 2.1879,
4472
+ "step": 740
4473
+ },
4474
+ {
4475
+ "epoch": 0.51,
4476
+ "learning_rate": 0.0005224558050645006,
4477
+ "loss": 2.1494,
4478
+ "step": 741
4479
+ },
4480
+ {
4481
+ "epoch": 0.51,
4482
+ "learning_rate": 0.000522312470138557,
4483
+ "loss": 2.1554,
4484
+ "step": 742
4485
+ },
4486
+ {
4487
+ "epoch": 0.51,
4488
+ "learning_rate": 0.0005221691352126134,
4489
+ "loss": 2.1672,
4490
+ "step": 743
4491
+ },
4492
+ {
4493
+ "epoch": 0.51,
4494
+ "learning_rate": 0.0005220258002866698,
4495
+ "loss": 2.1297,
4496
+ "step": 744
4497
+ },
4498
+ {
4499
+ "epoch": 0.51,
4500
+ "learning_rate": 0.0005218824653607262,
4501
+ "loss": 2.1553,
4502
+ "step": 745
4503
+ },
4504
+ {
4505
+ "epoch": 0.51,
4506
+ "learning_rate": 0.0005217391304347826,
4507
+ "loss": 2.1184,
4508
+ "step": 746
4509
+ },
4510
+ {
4511
+ "epoch": 0.51,
4512
+ "learning_rate": 0.000521595795508839,
4513
+ "loss": 2.1945,
4514
+ "step": 747
4515
+ },
4516
+ {
4517
+ "epoch": 0.51,
4518
+ "learning_rate": 0.0005214524605828953,
4519
+ "loss": 2.1973,
4520
+ "step": 748
4521
+ },
4522
+ {
4523
+ "epoch": 0.51,
4524
+ "learning_rate": 0.0005213091256569517,
4525
+ "loss": 2.076,
4526
+ "step": 749
4527
+ },
4528
+ {
4529
+ "epoch": 0.51,
4530
+ "learning_rate": 0.0005211657907310081,
4531
+ "loss": 2.1019,
4532
+ "step": 750
4533
+ },
4534
+ {
4535
+ "epoch": 0.51,
4536
+ "learning_rate": 0.0005210224558050645,
4537
+ "loss": 2.1234,
4538
+ "step": 751
4539
+ },
4540
+ {
4541
+ "epoch": 0.51,
4542
+ "learning_rate": 0.0005208791208791209,
4543
+ "loss": 2.221,
4544
+ "step": 752
4545
+ },
4546
+ {
4547
+ "epoch": 0.51,
4548
+ "learning_rate": 0.0005207357859531772,
4549
+ "loss": 2.0994,
4550
+ "step": 753
4551
+ },
4552
+ {
4553
+ "epoch": 0.52,
4554
+ "learning_rate": 0.0005205924510272336,
4555
+ "loss": 2.0699,
4556
+ "step": 754
4557
+ },
4558
+ {
4559
+ "epoch": 0.52,
4560
+ "learning_rate": 0.00052044911610129,
4561
+ "loss": 2.1554,
4562
+ "step": 755
4563
+ },
4564
+ {
4565
+ "epoch": 0.52,
4566
+ "learning_rate": 0.0005203057811753463,
4567
+ "loss": 2.1254,
4568
+ "step": 756
4569
+ },
4570
+ {
4571
+ "epoch": 0.52,
4572
+ "learning_rate": 0.0005201624462494028,
4573
+ "loss": 2.2119,
4574
+ "step": 757
4575
+ },
4576
+ {
4577
+ "epoch": 0.52,
4578
+ "learning_rate": 0.0005200191113234591,
4579
+ "loss": 2.0509,
4580
+ "step": 758
4581
+ },
4582
+ {
4583
+ "epoch": 0.52,
4584
+ "learning_rate": 0.0005198757763975154,
4585
+ "loss": 2.2616,
4586
+ "step": 759
4587
+ },
4588
+ {
4589
+ "epoch": 0.52,
4590
+ "learning_rate": 0.0005197324414715718,
4591
+ "loss": 2.0417,
4592
+ "step": 760
4593
+ },
4594
+ {
4595
+ "epoch": 0.52,
4596
+ "learning_rate": 0.0005195891065456282,
4597
+ "loss": 2.1356,
4598
+ "step": 761
4599
+ },
4600
+ {
4601
+ "epoch": 0.52,
4602
+ "learning_rate": 0.0005194457716196846,
4603
+ "loss": 2.0701,
4604
+ "step": 762
4605
+ },
4606
+ {
4607
+ "epoch": 0.52,
4608
+ "learning_rate": 0.000519302436693741,
4609
+ "loss": 2.1677,
4610
+ "step": 763
4611
+ },
4612
+ {
4613
+ "epoch": 0.52,
4614
+ "learning_rate": 0.0005191591017677974,
4615
+ "loss": 2.0766,
4616
+ "step": 764
4617
+ },
4618
+ {
4619
+ "epoch": 0.52,
4620
+ "learning_rate": 0.0005190157668418537,
4621
+ "loss": 2.1733,
4622
+ "step": 765
4623
+ },
4624
+ {
4625
+ "epoch": 0.52,
4626
+ "learning_rate": 0.0005188724319159101,
4627
+ "loss": 2.1307,
4628
+ "step": 766
4629
+ },
4630
+ {
4631
+ "epoch": 0.52,
4632
+ "learning_rate": 0.0005187290969899665,
4633
+ "loss": 2.0885,
4634
+ "step": 767
4635
+ },
4636
+ {
4637
+ "epoch": 0.53,
4638
+ "learning_rate": 0.0005185857620640229,
4639
+ "loss": 2.1836,
4640
+ "step": 768
4641
+ },
4642
+ {
4643
+ "epoch": 0.53,
4644
+ "learning_rate": 0.0005184424271380793,
4645
+ "loss": 2.1477,
4646
+ "step": 769
4647
+ },
4648
+ {
4649
+ "epoch": 0.53,
4650
+ "learning_rate": 0.0005182990922121357,
4651
+ "loss": 2.1539,
4652
+ "step": 770
4653
+ },
4654
+ {
4655
+ "epoch": 0.53,
4656
+ "learning_rate": 0.000518155757286192,
4657
+ "loss": 2.1302,
4658
+ "step": 771
4659
+ },
4660
+ {
4661
+ "epoch": 0.53,
4662
+ "learning_rate": 0.0005180124223602484,
4663
+ "loss": 2.1686,
4664
+ "step": 772
4665
+ },
4666
+ {
4667
+ "epoch": 0.53,
4668
+ "learning_rate": 0.0005178690874343048,
4669
+ "loss": 2.1822,
4670
+ "step": 773
4671
+ },
4672
+ {
4673
+ "epoch": 0.53,
4674
+ "learning_rate": 0.0005177257525083612,
4675
+ "loss": 2.1349,
4676
+ "step": 774
4677
+ },
4678
+ {
4679
+ "epoch": 0.53,
4680
+ "learning_rate": 0.0005175824175824176,
4681
+ "loss": 2.1254,
4682
+ "step": 775
4683
+ },
4684
+ {
4685
+ "epoch": 0.53,
4686
+ "learning_rate": 0.0005174390826564739,
4687
+ "loss": 2.1617,
4688
+ "step": 776
4689
+ },
4690
+ {
4691
+ "epoch": 0.53,
4692
+ "learning_rate": 0.0005172957477305302,
4693
+ "loss": 2.0366,
4694
+ "step": 777
4695
+ },
4696
+ {
4697
+ "epoch": 0.53,
4698
+ "learning_rate": 0.0005171524128045866,
4699
+ "loss": 2.1776,
4700
+ "step": 778
4701
+ },
4702
+ {
4703
+ "epoch": 0.53,
4704
+ "learning_rate": 0.000517009077878643,
4705
+ "loss": 2.0636,
4706
+ "step": 779
4707
+ },
4708
+ {
4709
+ "epoch": 0.53,
4710
+ "learning_rate": 0.0005168657429526994,
4711
+ "loss": 2.0966,
4712
+ "step": 780
4713
+ },
4714
+ {
4715
+ "epoch": 0.53,
4716
+ "learning_rate": 0.0005167224080267558,
4717
+ "loss": 2.0935,
4718
+ "step": 781
4719
+ },
4720
+ {
4721
+ "epoch": 0.53,
4722
+ "learning_rate": 0.0005165790731008122,
4723
+ "loss": 2.0194,
4724
+ "step": 782
4725
+ },
4726
+ {
4727
+ "epoch": 0.54,
4728
+ "learning_rate": 0.0005164357381748685,
4729
+ "loss": 2.1995,
4730
+ "step": 783
4731
+ },
4732
+ {
4733
+ "epoch": 0.54,
4734
+ "learning_rate": 0.0005162924032489249,
4735
+ "loss": 2.0758,
4736
+ "step": 784
4737
+ },
4738
+ {
4739
+ "epoch": 0.54,
4740
+ "learning_rate": 0.0005161490683229813,
4741
+ "loss": 2.134,
4742
+ "step": 785
4743
+ },
4744
+ {
4745
+ "epoch": 0.54,
4746
+ "learning_rate": 0.0005160057333970377,
4747
+ "loss": 2.1708,
4748
+ "step": 786
4749
+ },
4750
+ {
4751
+ "epoch": 0.54,
4752
+ "learning_rate": 0.0005158623984710941,
4753
+ "loss": 2.155,
4754
+ "step": 787
4755
+ },
4756
+ {
4757
+ "epoch": 0.54,
4758
+ "learning_rate": 0.0005157190635451505,
4759
+ "loss": 2.2275,
4760
+ "step": 788
4761
+ },
4762
+ {
4763
+ "epoch": 0.54,
4764
+ "learning_rate": 0.0005155757286192068,
4765
+ "loss": 2.151,
4766
+ "step": 789
4767
+ },
4768
+ {
4769
+ "epoch": 0.54,
4770
+ "learning_rate": 0.0005154323936932632,
4771
+ "loss": 2.1853,
4772
+ "step": 790
4773
+ },
4774
+ {
4775
+ "epoch": 0.54,
4776
+ "learning_rate": 0.0005152890587673196,
4777
+ "loss": 2.1455,
4778
+ "step": 791
4779
+ },
4780
+ {
4781
+ "epoch": 0.54,
4782
+ "learning_rate": 0.000515145723841376,
4783
+ "loss": 2.1934,
4784
+ "step": 792
4785
+ },
4786
+ {
4787
+ "epoch": 0.54,
4788
+ "learning_rate": 0.0005150023889154323,
4789
+ "loss": 2.0501,
4790
+ "step": 793
4791
+ },
4792
+ {
4793
+ "epoch": 0.54,
4794
+ "learning_rate": 0.0005148590539894888,
4795
+ "loss": 2.197,
4796
+ "step": 794
4797
+ },
4798
+ {
4799
+ "epoch": 0.54,
4800
+ "learning_rate": 0.000514715719063545,
4801
+ "loss": 2.0713,
4802
+ "step": 795
4803
+ },
4804
+ {
4805
+ "epoch": 0.54,
4806
+ "learning_rate": 0.0005145723841376015,
4807
+ "loss": 2.1562,
4808
+ "step": 796
4809
+ },
4810
+ {
4811
+ "epoch": 0.54,
4812
+ "learning_rate": 0.0005144290492116578,
4813
+ "loss": 2.158,
4814
+ "step": 797
4815
+ },
4816
+ {
4817
+ "epoch": 0.55,
4818
+ "learning_rate": 0.0005142857142857142,
4819
+ "loss": 2.0456,
4820
+ "step": 798
4821
+ },
4822
+ {
4823
+ "epoch": 0.55,
4824
+ "learning_rate": 0.0005141423793597706,
4825
+ "loss": 2.157,
4826
+ "step": 799
4827
+ },
4828
+ {
4829
+ "epoch": 0.55,
4830
+ "learning_rate": 0.000513999044433827,
4831
+ "loss": 2.0934,
4832
+ "step": 800
4833
+ },
4834
+ {
4835
+ "epoch": 0.55,
4836
+ "eval_loss": 2.0316452980041504,
4837
+ "eval_runtime": 1689.8048,
4838
+ "eval_samples_per_second": 5.918,
4839
+ "eval_steps_per_second": 5.918,
4840
+ "step": 800
4841
  }
4842
  ],
4843
  "max_steps": 4386,
4844
  "num_train_epochs": 3,
4845
+ "total_flos": 9.813169375148851e+17,
4846
  "trial_name": null,
4847
  "trial_params": null
4848
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b1ac718a5450b8f5bc3e788c6d41748c97c82f6bf2933bbce997b0073b1c8202
3
  size 2368281769
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b06061cdd59a61c03b74896e78f938e36d6d587093dddf2c3beb4c518798564
3
  size 2368281769