jdannem6 commited on
Commit
70488ba
1 Parent(s): 50672be

Uploaded checkpoint-20000

Browse files
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:55542cf68a7a56641df7a59e91b124f6d16eb72304a4aab6742a0f93a5b3d6a9
3
  size 4986380064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52ae8fd2899cae9fbf1de2288673a19564009a6254b0b592383efcc519ad621a
3
  size 4986380064
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a5ecbf27a334befe14f1464c73a6a77128f6598de400961bb7d5097ecfb48f69
3
  size 399532808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fef9985c248615a2bb23a47f171d94be90d8c93afd5aa990b1587b71e71171c6
3
  size 399532808
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:10a07aad0a46264c45185ced9dd0645d835455a11a08613ac0cc316256bf2101
3
  size 2699039674
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6c0438b7ed3e4c55f24fa3150fa982da8011360f12c7b707448e8db4ad3f902
3
  size 2699039674
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3fc99115bf5f04a1f69339b55b87574e78f76c0017fb7fbc54425e463c53fe09
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97b0f27722247239511c0d33808326948f3078297be0bebdd0214846f3609f1e
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d75cd0c4e544f7391f9754fd838738017fc0e36a7e8de482816ca502f9dc5c07
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29c7a79b53a589de48d3b7a21df9c0d024be4dea79f68869f72fdc01ae3b212a
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.4375,
5
  "eval_steps": 100,
6
- "global_step": 17500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2632,6 +2632,381 @@
2632
  "eval_samples_per_second": 10.168,
2633
  "eval_steps_per_second": 10.168,
2634
  "step": 17500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2635
  }
2636
  ],
2637
  "logging_steps": 100,
@@ -2639,7 +3014,7 @@
2639
  "num_input_tokens_seen": 0,
2640
  "num_train_epochs": 1,
2641
  "save_steps": 2500,
2642
- "total_flos": 2.7533972078592e+17,
2643
  "train_batch_size": 1,
2644
  "trial_name": null,
2645
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.5,
5
  "eval_steps": 100,
6
+ "global_step": 20000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2632
  "eval_samples_per_second": 10.168,
2633
  "eval_steps_per_second": 10.168,
2634
  "step": 17500
2635
+ },
2636
+ {
2637
+ "epoch": 0.44,
2638
+ "grad_norm": 4.079812526702881,
2639
+ "learning_rate": 2.461538461538462e-06,
2640
+ "loss": 0.6369,
2641
+ "step": 17600
2642
+ },
2643
+ {
2644
+ "epoch": 0.44,
2645
+ "eval_loss": 0.6634325385093689,
2646
+ "eval_runtime": 98.3802,
2647
+ "eval_samples_per_second": 10.165,
2648
+ "eval_steps_per_second": 10.165,
2649
+ "step": 17600
2650
+ },
2651
+ {
2652
+ "epoch": 0.44,
2653
+ "grad_norm": 3.0387985706329346,
2654
+ "learning_rate": 2.358974358974359e-06,
2655
+ "loss": 0.6244,
2656
+ "step": 17700
2657
+ },
2658
+ {
2659
+ "epoch": 0.44,
2660
+ "eval_loss": 0.658197283744812,
2661
+ "eval_runtime": 98.3504,
2662
+ "eval_samples_per_second": 10.168,
2663
+ "eval_steps_per_second": 10.168,
2664
+ "step": 17700
2665
+ },
2666
+ {
2667
+ "epoch": 0.45,
2668
+ "grad_norm": 4.429220199584961,
2669
+ "learning_rate": 2.2564102564102566e-06,
2670
+ "loss": 0.663,
2671
+ "step": 17800
2672
+ },
2673
+ {
2674
+ "epoch": 0.45,
2675
+ "eval_loss": 0.663162350654602,
2676
+ "eval_runtime": 98.3612,
2677
+ "eval_samples_per_second": 10.167,
2678
+ "eval_steps_per_second": 10.167,
2679
+ "step": 17800
2680
+ },
2681
+ {
2682
+ "epoch": 0.45,
2683
+ "grad_norm": 7.286205291748047,
2684
+ "learning_rate": 2.153846153846154e-06,
2685
+ "loss": 0.6692,
2686
+ "step": 17900
2687
+ },
2688
+ {
2689
+ "epoch": 0.45,
2690
+ "eval_loss": 0.6493304371833801,
2691
+ "eval_runtime": 98.3416,
2692
+ "eval_samples_per_second": 10.169,
2693
+ "eval_steps_per_second": 10.169,
2694
+ "step": 17900
2695
+ },
2696
+ {
2697
+ "epoch": 0.45,
2698
+ "grad_norm": 3.1025142669677734,
2699
+ "learning_rate": 2.0512820512820513e-06,
2700
+ "loss": 0.6515,
2701
+ "step": 18000
2702
+ },
2703
+ {
2704
+ "epoch": 0.45,
2705
+ "eval_loss": 0.6505727767944336,
2706
+ "eval_runtime": 98.3601,
2707
+ "eval_samples_per_second": 10.167,
2708
+ "eval_steps_per_second": 10.167,
2709
+ "step": 18000
2710
+ },
2711
+ {
2712
+ "epoch": 0.45,
2713
+ "grad_norm": 5.170887470245361,
2714
+ "learning_rate": 1.948717948717949e-06,
2715
+ "loss": 0.6554,
2716
+ "step": 18100
2717
+ },
2718
+ {
2719
+ "epoch": 0.45,
2720
+ "eval_loss": 0.640792191028595,
2721
+ "eval_runtime": 98.3695,
2722
+ "eval_samples_per_second": 10.166,
2723
+ "eval_steps_per_second": 10.166,
2724
+ "step": 18100
2725
+ },
2726
+ {
2727
+ "epoch": 0.46,
2728
+ "grad_norm": 8.582660675048828,
2729
+ "learning_rate": 1.8461538461538465e-06,
2730
+ "loss": 0.6481,
2731
+ "step": 18200
2732
+ },
2733
+ {
2734
+ "epoch": 0.46,
2735
+ "eval_loss": 0.6477890014648438,
2736
+ "eval_runtime": 98.3746,
2737
+ "eval_samples_per_second": 10.165,
2738
+ "eval_steps_per_second": 10.165,
2739
+ "step": 18200
2740
+ },
2741
+ {
2742
+ "epoch": 0.46,
2743
+ "grad_norm": 7.4102301597595215,
2744
+ "learning_rate": 1.7435897435897436e-06,
2745
+ "loss": 0.6284,
2746
+ "step": 18300
2747
+ },
2748
+ {
2749
+ "epoch": 0.46,
2750
+ "eval_loss": 0.6732329726219177,
2751
+ "eval_runtime": 98.3003,
2752
+ "eval_samples_per_second": 10.173,
2753
+ "eval_steps_per_second": 10.173,
2754
+ "step": 18300
2755
+ },
2756
+ {
2757
+ "epoch": 0.46,
2758
+ "grad_norm": 3.6458799839019775,
2759
+ "learning_rate": 1.6410256410256412e-06,
2760
+ "loss": 0.6546,
2761
+ "step": 18400
2762
+ },
2763
+ {
2764
+ "epoch": 0.46,
2765
+ "eval_loss": 0.6304293274879456,
2766
+ "eval_runtime": 98.3207,
2767
+ "eval_samples_per_second": 10.171,
2768
+ "eval_steps_per_second": 10.171,
2769
+ "step": 18400
2770
+ },
2771
+ {
2772
+ "epoch": 0.46,
2773
+ "grad_norm": 3.8108537197113037,
2774
+ "learning_rate": 1.5384615384615387e-06,
2775
+ "loss": 0.6665,
2776
+ "step": 18500
2777
+ },
2778
+ {
2779
+ "epoch": 0.46,
2780
+ "eval_loss": 0.634360134601593,
2781
+ "eval_runtime": 98.3511,
2782
+ "eval_samples_per_second": 10.168,
2783
+ "eval_steps_per_second": 10.168,
2784
+ "step": 18500
2785
+ },
2786
+ {
2787
+ "epoch": 0.47,
2788
+ "grad_norm": 9.306204795837402,
2789
+ "learning_rate": 1.4358974358974359e-06,
2790
+ "loss": 0.6559,
2791
+ "step": 18600
2792
+ },
2793
+ {
2794
+ "epoch": 0.47,
2795
+ "eval_loss": 0.6353975534439087,
2796
+ "eval_runtime": 98.3306,
2797
+ "eval_samples_per_second": 10.17,
2798
+ "eval_steps_per_second": 10.17,
2799
+ "step": 18600
2800
+ },
2801
+ {
2802
+ "epoch": 0.47,
2803
+ "grad_norm": 4.001993656158447,
2804
+ "learning_rate": 1.3333333333333334e-06,
2805
+ "loss": 0.6386,
2806
+ "step": 18700
2807
+ },
2808
+ {
2809
+ "epoch": 0.47,
2810
+ "eval_loss": 0.6281168460845947,
2811
+ "eval_runtime": 98.2978,
2812
+ "eval_samples_per_second": 10.173,
2813
+ "eval_steps_per_second": 10.173,
2814
+ "step": 18700
2815
+ },
2816
+ {
2817
+ "epoch": 0.47,
2818
+ "grad_norm": 4.9915924072265625,
2819
+ "learning_rate": 1.230769230769231e-06,
2820
+ "loss": 0.6509,
2821
+ "step": 18800
2822
+ },
2823
+ {
2824
+ "epoch": 0.47,
2825
+ "eval_loss": 0.6473774313926697,
2826
+ "eval_runtime": 98.3123,
2827
+ "eval_samples_per_second": 10.172,
2828
+ "eval_steps_per_second": 10.172,
2829
+ "step": 18800
2830
+ },
2831
+ {
2832
+ "epoch": 0.47,
2833
+ "grad_norm": 3.8955130577087402,
2834
+ "learning_rate": 1.1282051282051283e-06,
2835
+ "loss": 0.6272,
2836
+ "step": 18900
2837
+ },
2838
+ {
2839
+ "epoch": 0.47,
2840
+ "eval_loss": 0.6473217606544495,
2841
+ "eval_runtime": 98.3521,
2842
+ "eval_samples_per_second": 10.168,
2843
+ "eval_steps_per_second": 10.168,
2844
+ "step": 18900
2845
+ },
2846
+ {
2847
+ "epoch": 0.47,
2848
+ "grad_norm": 9.715432167053223,
2849
+ "learning_rate": 1.0256410256410257e-06,
2850
+ "loss": 0.6086,
2851
+ "step": 19000
2852
+ },
2853
+ {
2854
+ "epoch": 0.47,
2855
+ "eval_loss": 0.620388925075531,
2856
+ "eval_runtime": 98.3358,
2857
+ "eval_samples_per_second": 10.169,
2858
+ "eval_steps_per_second": 10.169,
2859
+ "step": 19000
2860
+ },
2861
+ {
2862
+ "epoch": 0.48,
2863
+ "grad_norm": 4.22080659866333,
2864
+ "learning_rate": 9.230769230769232e-07,
2865
+ "loss": 0.6553,
2866
+ "step": 19100
2867
+ },
2868
+ {
2869
+ "epoch": 0.48,
2870
+ "eval_loss": 0.6517868041992188,
2871
+ "eval_runtime": 98.326,
2872
+ "eval_samples_per_second": 10.17,
2873
+ "eval_steps_per_second": 10.17,
2874
+ "step": 19100
2875
+ },
2876
+ {
2877
+ "epoch": 0.48,
2878
+ "grad_norm": 3.7308225631713867,
2879
+ "learning_rate": 8.205128205128206e-07,
2880
+ "loss": 0.6119,
2881
+ "step": 19200
2882
+ },
2883
+ {
2884
+ "epoch": 0.48,
2885
+ "eval_loss": 0.639901340007782,
2886
+ "eval_runtime": 98.3,
2887
+ "eval_samples_per_second": 10.173,
2888
+ "eval_steps_per_second": 10.173,
2889
+ "step": 19200
2890
+ },
2891
+ {
2892
+ "epoch": 0.48,
2893
+ "grad_norm": 4.890966892242432,
2894
+ "learning_rate": 7.179487179487179e-07,
2895
+ "loss": 0.6363,
2896
+ "step": 19300
2897
+ },
2898
+ {
2899
+ "epoch": 0.48,
2900
+ "eval_loss": 0.6639401912689209,
2901
+ "eval_runtime": 98.3404,
2902
+ "eval_samples_per_second": 10.169,
2903
+ "eval_steps_per_second": 10.169,
2904
+ "step": 19300
2905
+ },
2906
+ {
2907
+ "epoch": 0.48,
2908
+ "grad_norm": 6.855196952819824,
2909
+ "learning_rate": 6.153846153846155e-07,
2910
+ "loss": 0.6261,
2911
+ "step": 19400
2912
+ },
2913
+ {
2914
+ "epoch": 0.48,
2915
+ "eval_loss": 0.6473622918128967,
2916
+ "eval_runtime": 98.3312,
2917
+ "eval_samples_per_second": 10.17,
2918
+ "eval_steps_per_second": 10.17,
2919
+ "step": 19400
2920
+ },
2921
+ {
2922
+ "epoch": 0.49,
2923
+ "grad_norm": 2.988701581954956,
2924
+ "learning_rate": 5.128205128205128e-07,
2925
+ "loss": 0.6192,
2926
+ "step": 19500
2927
+ },
2928
+ {
2929
+ "epoch": 0.49,
2930
+ "eval_loss": 0.6319825649261475,
2931
+ "eval_runtime": 98.277,
2932
+ "eval_samples_per_second": 10.175,
2933
+ "eval_steps_per_second": 10.175,
2934
+ "step": 19500
2935
+ },
2936
+ {
2937
+ "epoch": 0.49,
2938
+ "grad_norm": 3.922107696533203,
2939
+ "learning_rate": 4.102564102564103e-07,
2940
+ "loss": 0.6123,
2941
+ "step": 19600
2942
+ },
2943
+ {
2944
+ "epoch": 0.49,
2945
+ "eval_loss": 0.6252996325492859,
2946
+ "eval_runtime": 98.3425,
2947
+ "eval_samples_per_second": 10.169,
2948
+ "eval_steps_per_second": 10.169,
2949
+ "step": 19600
2950
+ },
2951
+ {
2952
+ "epoch": 0.49,
2953
+ "grad_norm": 13.531473159790039,
2954
+ "learning_rate": 3.0769230769230774e-07,
2955
+ "loss": 0.6128,
2956
+ "step": 19700
2957
+ },
2958
+ {
2959
+ "epoch": 0.49,
2960
+ "eval_loss": 0.6362013220787048,
2961
+ "eval_runtime": 98.2906,
2962
+ "eval_samples_per_second": 10.174,
2963
+ "eval_steps_per_second": 10.174,
2964
+ "step": 19700
2965
+ },
2966
+ {
2967
+ "epoch": 0.49,
2968
+ "grad_norm": 13.287760734558105,
2969
+ "learning_rate": 2.0512820512820514e-07,
2970
+ "loss": 0.6321,
2971
+ "step": 19800
2972
+ },
2973
+ {
2974
+ "epoch": 0.49,
2975
+ "eval_loss": 0.6311822533607483,
2976
+ "eval_runtime": 98.3206,
2977
+ "eval_samples_per_second": 10.171,
2978
+ "eval_steps_per_second": 10.171,
2979
+ "step": 19800
2980
+ },
2981
+ {
2982
+ "epoch": 0.5,
2983
+ "grad_norm": 10.524887084960938,
2984
+ "learning_rate": 1.0256410256410257e-07,
2985
+ "loss": 0.6026,
2986
+ "step": 19900
2987
+ },
2988
+ {
2989
+ "epoch": 0.5,
2990
+ "eval_loss": 0.6232057809829712,
2991
+ "eval_runtime": 98.3191,
2992
+ "eval_samples_per_second": 10.171,
2993
+ "eval_steps_per_second": 10.171,
2994
+ "step": 19900
2995
+ },
2996
+ {
2997
+ "epoch": 0.5,
2998
+ "grad_norm": 4.0852131843566895,
2999
+ "learning_rate": 0.0,
3000
+ "loss": 0.6322,
3001
+ "step": 20000
3002
+ },
3003
+ {
3004
+ "epoch": 0.5,
3005
+ "eval_loss": 0.632331907749176,
3006
+ "eval_runtime": 98.3118,
3007
+ "eval_samples_per_second": 10.172,
3008
+ "eval_steps_per_second": 10.172,
3009
+ "step": 20000
3010
  }
3011
  ],
3012
  "logging_steps": 100,
 
3014
  "num_input_tokens_seen": 0,
3015
  "num_train_epochs": 1,
3016
  "save_steps": 2500,
3017
+ "total_flos": 3.1467396661248e+17,
3018
  "train_batch_size": 1,
3019
  "trial_name": null,
3020
  "trial_params": null