Bingsu committed
Commit 6130161
1 parent: 3eece0a

Training in progress, step 100000

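This commit updates the step-100000 checkpoint files listed below. To pin a download to exactly this revision, the commit hash can be passed to huggingface_hub; a minimal sketch, assuming a hypothetical repo id "Bingsu/<model-name>" (the actual repo id is not part of this diff):

```python
from huggingface_hub import hf_hub_download

# Minimal sketch: fetch one checkpoint file at exactly this commit.
# "Bingsu/<model-name>" is a placeholder; substitute the real repo id.
path = hf_hub_download(
    repo_id="Bingsu/<model-name>",                      # hypothetical repo id
    filename="last-checkpoint/trainer_state.json",
    revision="6130161",                                  # the (abbreviated) commit hash shown above
)
print(path)
```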
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2bfd1649a8e5d50f87eac5a90bc2a42a9752d57108ef7a99b6144b52498eb940
+oid sha256:0978bc6ba8af6107f37b1ce0de2823f6ffed0e6f3357ff497f2e879a2e834ef3
 size 586828837
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:46d445756ee98f80cf0e1304f296d090935c2b25cf534d2e63f2d43ea00b4692
+oid sha256:dbd29e5163387c30aab8a8890d9ee62efc996b56b102107768b0eabf5e23817e
 size 146774203
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5252389f795734ddbef71d5136d373788bd8e3e087e463a80c7908b475ee5a4e
+oid sha256:6c1dca87fa841c98848654f4170210b8999092daa8068e42b3812f3ab2a9ca99
 size 14503
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d1244415d9126f95bf4be9e42d3ed475037eeb14b80d36ceaa79ab4bc1c4e659
+oid sha256:43c91aaff4049dd76fbb2e0bcf40a0522c406dbf03765ebea50f6fb1be9645c2
 size 559
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:13e05de43b6a246107bb811a862a5a339039221c691d4bd76ad5704b725f4612
+oid sha256:9fc0cfe80fceb3705f7126b83f3fe0e36d87dafa6df093df20b056316ba4fd28
 size 733555848
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.38675243976330753,
-  "global_step": 90000,
+  "epoch": 0.42972493307034165,
+  "global_step": 100000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -2706,11 +2706,311 @@
       "learning_rate": 0.0023768946429473976,
       "loss": 8.4846,
       "step": 90000
+    },
+    {
+      "epoch": 0.39,
+      "learning_rate": 0.002385031338735963,
+      "loss": 8.4866,
+      "step": 90200
+    },
+    {
+      "epoch": 0.39,
+      "learning_rate": 0.0023931803824365962,
+      "loss": 8.4847,
+      "step": 90400
+    },
+    {
+      "epoch": 0.39,
+      "learning_rate": 0.002401300900956714,
+      "loss": 8.4934,
+      "step": 90600
+    },
+    {
+      "epoch": 0.39,
+      "learning_rate": 0.002409474471896992,
+      "loss": 8.4872,
+      "step": 90800
+    },
+    {
+      "epoch": 0.39,
+      "learning_rate": 0.0024176602835290807,
+      "loss": 8.4977,
+      "step": 91000
+    },
+    {
+      "epoch": 0.39,
+      "learning_rate": 0.0024258582999457665,
+      "loss": 8.4967,
+      "step": 91200
+    },
+    {
+      "epoch": 0.39,
+      "learning_rate": 0.0024340684851863,
+      "loss": 8.505,
+      "step": 91400
+    },
+    {
+      "epoch": 0.39,
+      "learning_rate": 0.002442290803236551,
+      "loss": 8.5126,
+      "step": 91600
+    },
+    {
+      "epoch": 0.39,
+      "learning_rate": 0.0024505252180291688,
+      "loss": 8.5033,
+      "step": 91800
+    },
+    {
+      "epoch": 0.4,
+      "learning_rate": 0.0024587304311256865,
+      "loss": 8.513,
+      "step": 92000
+    },
+    {
+      "epoch": 0.4,
+      "learning_rate": 0.0024669888709567232,
+      "loss": 8.5082,
+      "step": 92200
+    },
+    {
+      "epoch": 0.4,
+      "learning_rate": 0.0024752592991915973,
+      "loss": 8.517,
+      "step": 92400
+    },
+    {
+      "epoch": 0.4,
+      "learning_rate": 0.0024835416795519205,
+      "loss": 8.5293,
+      "step": 92600
+    },
+    {
+      "epoch": 0.4,
+      "learning_rate": 0.002491835975706881,
+      "loss": 8.5094,
+      "step": 92800
+    },
+    {
+      "epoch": 0.4,
+      "learning_rate": 0.0025001421512733943,
+      "loss": 8.5139,
+      "step": 93000
+    },
+    {
+      "epoch": 0.4,
+      "learning_rate": 0.0025084601698162666,
+      "loss": 8.5099,
+      "step": 93200
+    },
+    {
+      "epoch": 0.4,
+      "learning_rate": 0.0025167899948483575,
+      "loss": 8.5185,
+      "step": 93400
+    },
+    {
+      "epoch": 0.4,
+      "learning_rate": 0.0025251315898307336,
+      "loss": 8.5143,
+      "step": 93600
+    },
+    {
+      "epoch": 0.4,
+      "learning_rate": 0.002533484918172837,
+      "loss": 8.5277,
+      "step": 93800
+    },
+    {
+      "epoch": 0.4,
+      "learning_rate": 0.0025418499432326358,
+      "loss": 8.5231,
+      "step": 94000
+    },
+    {
+      "epoch": 0.4,
+      "learning_rate": 0.002550184715947826,
+      "loss": 8.5436,
+      "step": 94200
+    },
+    {
+      "epoch": 0.41,
+      "learning_rate": 0.0025585729662869474,
+      "loss": 8.5373,
+      "step": 94400
+    },
+    {
+      "epoch": 0.41,
+      "learning_rate": 0.002566972803294579,
+      "loss": 8.5347,
+      "step": 94600
+    },
+    {
+      "epoch": 0.41,
+      "learning_rate": 0.00257538419012468,
+      "loss": 8.5544,
+      "step": 94800
+    },
+    {
+      "epoch": 0.41,
+      "learning_rate": 0.0025838070898805453,
+      "loss": 8.5339,
+      "step": 95000
+    },
+    {
+      "epoch": 0.41,
+      "learning_rate": 0.002592241465614974,
+      "loss": 8.5405,
+      "step": 95200
+    },
+    {
+      "epoch": 0.41,
+      "learning_rate": 0.002600687280330416,
+      "loss": 8.5501,
+      "step": 95400
+    },
+    {
+      "epoch": 0.41,
+      "learning_rate": 0.0026091444969791513,
+      "loss": 8.5344,
+      "step": 95600
+    },
+    {
+      "epoch": 0.41,
+      "learning_rate": 0.002617613078463441,
+      "loss": 8.5477,
+      "step": 95800
+    },
+    {
+      "epoch": 0.41,
+      "learning_rate": 0.002626092987635699,
+      "loss": 8.5443,
+      "step": 96000
+    },
+    {
+      "epoch": 0.41,
+      "learning_rate": 0.002634541703276827,
+      "loss": 8.5398,
+      "step": 96200
+    },
+    {
+      "epoch": 0.41,
+      "learning_rate": 0.002643044100010169,
+      "loss": 8.5523,
+      "step": 96400
+    },
+    {
+      "epoch": 0.42,
+      "learning_rate": 0.002651557712877833,
+      "loss": 8.5562,
+      "step": 96600
+    },
+    {
+      "epoch": 0.42,
+      "learning_rate": 0.0026600825045346955,
+      "loss": 8.5525,
+      "step": 96800
+    },
+    {
+      "epoch": 0.42,
+      "learning_rate": 0.0026686184375866043,
+      "loss": 8.5728,
+      "step": 97000
+    },
+    {
+      "epoch": 0.42,
+      "learning_rate": 0.002677165474590528,
+      "loss": 8.5631,
+      "step": 97200
+    },
+    {
+      "epoch": 0.42,
+      "learning_rate": 0.002685723578054729,
+      "loss": 8.5658,
+      "step": 97400
+    },
+    {
+      "epoch": 0.42,
+      "learning_rate": 0.0026942927104389334,
+      "loss": 8.566,
+      "step": 97600
+    },
+    {
+      "epoch": 0.42,
+      "learning_rate": 0.002702872834154482,
+      "loss": 8.5716,
+      "step": 97800
+    },
+    {
+      "epoch": 0.42,
+      "learning_rate": 0.0027114639115645017,
+      "loss": 8.5697,
+      "step": 98000
+    },
+    {
+      "epoch": 0.42,
+      "learning_rate": 0.002720022867925799,
+      "loss": 8.5726,
+      "step": 98200
+    },
+    {
+      "epoch": 0.42,
+      "learning_rate": 0.0027286356853246747,
+      "loss": 8.5718,
+      "step": 98400
+    },
+    {
+      "epoch": 0.42,
+      "learning_rate": 0.0027372593434088002,
+      "loss": 8.5716,
+      "step": 98600
+    },
+    {
+      "epoch": 0.42,
+      "learning_rate": 0.002745893804350339,
+      "loss": 8.5767,
+      "step": 98800
+    },
+    {
+      "epoch": 0.43,
+      "learning_rate": 0.00275453903027407,
+      "loss": 8.5957,
+      "step": 99000
+    },
+    {
+      "epoch": 0.43,
+      "learning_rate": 0.0027631949832575475,
+      "loss": 8.5881,
+      "step": 99200
+    },
+    {
+      "epoch": 0.43,
+      "learning_rate": 0.002771861625331276,
+      "loss": 8.5835,
+      "step": 99400
+    },
+    {
+      "epoch": 0.43,
+      "learning_rate": 0.002780495505581529,
+      "loss": 8.5905,
+      "step": 99600
+    },
+    {
+      "epoch": 0.43,
+      "learning_rate": 0.002789183358769584,
+      "loss": 8.5938,
+      "step": 99800
+    },
+    {
+      "epoch": 0.43,
+      "learning_rate": 0.0027978817870494,
+      "loss": 8.5906,
+      "step": 100000
     }
   ],
   "max_steps": 1000000,
   "num_train_epochs": 5,
-  "total_flos": 1.4344469839872e+17,
+  "total_flos": 1.593829982208e+17,
   "trial_name": null,
   "trial_params": null
 }
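The trainer_state.json changes above follow the standard 🤗 Transformers Trainer state layout: the top-level epoch and global_step advance from step 90000 to 100000, and each added entry records the loss and learning rate logged every 200 steps. A minimal sketch for inspecting the file locally, assuming a checked-out copy of last-checkpoint/trainer_state.json (the log entries live under the Trainer's log_history key):

```python
import json

# Minimal sketch: read the checkpoint's Trainer state and show the latest logged metrics.
# Assumes a local checkout containing last-checkpoint/trainer_state.json.
with open("last-checkpoint/trainer_state.json") as f:
    state = json.load(f)

print("epoch:      ", state["epoch"])        # 0.4297... at this commit
print("global_step:", state["global_step"])  # 100000 at this commit

# Each log_history entry looks like
# {"epoch": 0.43, "learning_rate": 0.0027978817870494, "loss": 8.5906, "step": 100000}
latest = state["log_history"][-1]
print("latest log entry:", latest)
```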
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:46d445756ee98f80cf0e1304f296d090935c2b25cf534d2e63f2d43ea00b4692
+oid sha256:dbd29e5163387c30aab8a8890d9ee62efc996b56b102107768b0eabf5e23817e
 size 146774203
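Every binary above is tracked with Git LFS, so the diff only touches three-line pointer files (version, oid sha256:&lt;hash&gt;, size &lt;bytes&gt;); the sizes stay the same and only the hashes change. A minimal sketch for checking a downloaded blob against one of these pointers; both paths are hypothetical placeholders for wherever the pointer text and the real weights live locally:

```python
import hashlib
from pathlib import Path

# Minimal sketch: verify a downloaded blob against a Git LFS pointer like the ones in this diff.
# Pointer format (as shown above): "version ...", "oid sha256:<hex>", "size <bytes>".
def parse_pointer(text: str) -> dict:
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    return {"oid": fields["oid"].removeprefix("sha256:"), "size": int(fields["size"])}

def matches_pointer(pointer_path: str, blob_path: str) -> bool:
    ptr = parse_pointer(Path(pointer_path).read_text())
    blob = Path(blob_path).read_bytes()  # reads the whole file; fine for a sketch
    return hashlib.sha256(blob).hexdigest() == ptr["oid"] and len(blob) == ptr["size"]

# Example with hypothetical local paths:
# print(matches_pointer("pytorch_model.bin.pointer", "pytorch_model.bin"))
```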