Trofish committed on
Commit bf928f0
1 Parent(s): 94679f8

Upload 10 files

Files changed (5)
  1. model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +365 -5
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:839ae9db2ef12cb6c28652cf78e1ace0b5b2f613a977c1320527320e6f07713e
+oid sha256:6d0a1a2ebc0f1f1e6bae76f519dbcc21bb42eeb93020fab49ae955c26480b74e
 size 20061432
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:79edf6307ec6399546f9a8ac635261711014b1bd4fd5cbef6a7d900714059c58
+oid sha256:5a5ea25c6472ccaf3b15d44dfb9cfe3f95c2d03ee19001eee198898ac6253a32
 size 40205626
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:66fb3a85294ea76c6df4c9ebd3efd6802b5ffdd578d627b2aab67d064e04d612
+oid sha256:c83fbbf7f760f16043b0c1585d6a2a676ee048a2decb272da8f2e2127ffb79b3
 size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6d4b6bb09f465063fd299be83fa63c3c3ba3f85f39eabdef7061b52b4e97bc68
+oid sha256:d78a469aae0653b14dcccaea45eb52458c95afae1649e5d894a88bcf0a974d36
 size 1064
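
The four binary files above are stored through Git LFS, so the commit only rewrites their three-line pointers (spec version, sha256 oid, byte size), not the blobs themselves. As a minimal sketch, assuming the new scheduler.pt has been downloaded to the local path used below, a file could be checked against the oid and size from its pointer like this:

import hashlib
import os

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Hash the file in chunks so large checkpoints never sit fully in memory."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

# oid/size copied from the scheduler.pt pointer above; the local path is an assumption.
expected_oid = "d78a469aae0653b14dcccaea45eb52458c95afae1649e5d894a88bcf0a974d36"
expected_size = 1064
local_path = "scheduler.pt"

assert os.path.getsize(local_path) == expected_size, "size mismatch"
assert sha256_of(local_path) == expected_oid, "sha256 mismatch"
print("pointer matches downloaded file")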
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
-  "best_metric": 1.610386610031128,
-  "best_model_checkpoint": "/home/nlplab5/Desktop/roberta-pretrain/ckpt/roberta/pretrain/medium/checkpoint-11880",
-  "epoch": 8.72136323160366,
+  "best_metric": 1.6068978309631348,
+  "best_model_checkpoint": "/home/nlplab5/Desktop/roberta-pretrain/ckpt/roberta/pretrain/medium/checkpoint-12420",
+  "epoch": 9.049218813243746,
   "eval_steps": 90,
-  "global_step": 11970,
+  "global_step": 12420,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -9583,6 +9583,366 @@
       "eval_samples_per_second": 457.867,
       "eval_steps_per_second": 2.045,
       "step": 11970
+    },
+    {
+      "epoch": 8.73,
+      "grad_norm": 0.289413720369339,
+      "learning_rate": 5.1485148514851485e-05,
+      "loss": 1.758,
+      "step": 11980
+    },
+    {
+      "epoch": 8.74,
+      "grad_norm": 0.2739205062389374,
+      "learning_rate": 5.0495049504950497e-05,
+      "loss": 1.7579,
+      "step": 11990
+    },
+    {
+      "epoch": 8.74,
+      "grad_norm": 0.26597511768341064,
+      "learning_rate": 4.950495049504951e-05,
+      "loss": 1.7568,
+      "step": 12000
+    },
+    {
+      "epoch": 8.75,
+      "grad_norm": 0.24635004997253418,
+      "learning_rate": 4.851485148514852e-05,
+      "loss": 1.7584,
+      "step": 12010
+    },
+    {
+      "epoch": 8.76,
+      "grad_norm": 0.2534136474132538,
+      "learning_rate": 4.7524752475247525e-05,
+      "loss": 1.7602,
+      "step": 12020
+    },
+    {
+      "epoch": 8.77,
+      "grad_norm": 0.26007363200187683,
+      "learning_rate": 4.653465346534654e-05,
+      "loss": 1.7567,
+      "step": 12030
+    },
+    {
+      "epoch": 8.77,
+      "grad_norm": 0.2807808816432953,
+      "learning_rate": 4.554455445544554e-05,
+      "loss": 1.7566,
+      "step": 12040
+    },
+    {
+      "epoch": 8.78,
+      "grad_norm": 0.2677513360977173,
+      "learning_rate": 4.455445544554455e-05,
+      "loss": 1.7567,
+      "step": 12050
+    },
+    {
+      "epoch": 8.79,
+      "grad_norm": 0.2691977620124817,
+      "learning_rate": 4.3564356435643565e-05,
+      "loss": 1.757,
+      "step": 12060
+    },
+    {
+      "epoch": 8.79,
+      "eval_accuracy": 0.6521601327172856,
+      "eval_loss": 1.60829758644104,
+      "eval_runtime": 1089.928,
+      "eval_samples_per_second": 458.177,
+      "eval_steps_per_second": 2.046,
+      "step": 12060
+    },
+    {
+      "epoch": 8.79,
+      "grad_norm": 0.2577356696128845,
+      "learning_rate": 4.257425742574258e-05,
+      "loss": 1.7584,
+      "step": 12070
+    },
+    {
+      "epoch": 8.8,
+      "grad_norm": 0.2654874324798584,
+      "learning_rate": 4.158415841584159e-05,
+      "loss": 1.7571,
+      "step": 12080
+    },
+    {
+      "epoch": 8.81,
+      "grad_norm": 0.25344353914260864,
+      "learning_rate": 4.0594059405940594e-05,
+      "loss": 1.7581,
+      "step": 12090
+    },
+    {
+      "epoch": 8.82,
+      "grad_norm": 0.25865158438682556,
+      "learning_rate": 3.9603960396039605e-05,
+      "loss": 1.7552,
+      "step": 12100
+    },
+    {
+      "epoch": 8.82,
+      "grad_norm": 0.28875982761383057,
+      "learning_rate": 3.861386138613862e-05,
+      "loss": 1.757,
+      "step": 12110
+    },
+    {
+      "epoch": 8.83,
+      "grad_norm": 0.2697414755821228,
+      "learning_rate": 3.762376237623762e-05,
+      "loss": 1.7579,
+      "step": 12120
+    },
+    {
+      "epoch": 8.84,
+      "grad_norm": 0.2786589562892914,
+      "learning_rate": 3.6633663366336634e-05,
+      "loss": 1.7583,
+      "step": 12130
+    },
+    {
+      "epoch": 8.85,
+      "grad_norm": 0.258486270904541,
+      "learning_rate": 3.564356435643564e-05,
+      "loss": 1.7581,
+      "step": 12140
+    },
+    {
+      "epoch": 8.85,
+      "grad_norm": 0.2595365345478058,
+      "learning_rate": 3.465346534653466e-05,
+      "loss": 1.757,
+      "step": 12150
+    },
+    {
+      "epoch": 8.85,
+      "eval_accuracy": 0.652040482066107,
+      "eval_loss": 1.6086018085479736,
+      "eval_runtime": 1089.5635,
+      "eval_samples_per_second": 458.33,
+      "eval_steps_per_second": 2.047,
+      "step": 12150
+    },
+    {
+      "epoch": 8.86,
+      "grad_norm": 0.25674012303352356,
+      "learning_rate": 3.366336633663367e-05,
+      "loss": 1.7595,
+      "step": 12160
+    },
+    {
+      "epoch": 8.87,
+      "grad_norm": 0.23194921016693115,
+      "learning_rate": 3.2673267326732674e-05,
+      "loss": 1.7574,
+      "step": 12170
+    },
+    {
+      "epoch": 8.87,
+      "grad_norm": 0.2626875936985016,
+      "learning_rate": 3.1683168316831686e-05,
+      "loss": 1.7571,
+      "step": 12180
+    },
+    {
+      "epoch": 8.88,
+      "grad_norm": 0.2361476868391037,
+      "learning_rate": 3.069306930693069e-05,
+      "loss": 1.7573,
+      "step": 12190
+    },
+    {
+      "epoch": 8.89,
+      "grad_norm": 0.2606755793094635,
+      "learning_rate": 2.9702970297029702e-05,
+      "loss": 1.7567,
+      "step": 12200
+    },
+    {
+      "epoch": 8.9,
+      "grad_norm": 0.27499887347221375,
+      "learning_rate": 2.8712871287128714e-05,
+      "loss": 1.7579,
+      "step": 12210
+    },
+    {
+      "epoch": 8.9,
+      "grad_norm": 0.24832656979560852,
+      "learning_rate": 2.7722772277227722e-05,
+      "loss": 1.7566,
+      "step": 12220
+    },
+    {
+      "epoch": 8.91,
+      "grad_norm": 0.24898388981819153,
+      "learning_rate": 2.6732673267326734e-05,
+      "loss": 1.7544,
+      "step": 12230
+    },
+    {
+      "epoch": 8.92,
+      "grad_norm": 0.24266423285007477,
+      "learning_rate": 2.5742574257425742e-05,
+      "loss": 1.7559,
+      "step": 12240
+    },
+    {
+      "epoch": 8.92,
+      "eval_accuracy": 0.6522573824099933,
+      "eval_loss": 1.6079708337783813,
+      "eval_runtime": 1089.9176,
+      "eval_samples_per_second": 458.181,
+      "eval_steps_per_second": 2.046,
+      "step": 12240
+    },
+    {
+      "epoch": 8.93,
+      "grad_norm": 0.2438860386610031,
+      "learning_rate": 2.4752475247524754e-05,
+      "loss": 1.7554,
+      "step": 12250
+    },
+    {
+      "epoch": 8.93,
+      "grad_norm": 0.22911418974399567,
+      "learning_rate": 2.3762376237623762e-05,
+      "loss": 1.7547,
+      "step": 12260
+    },
+    {
+      "epoch": 8.94,
+      "grad_norm": 0.2550877034664154,
+      "learning_rate": 2.277227722772277e-05,
+      "loss": 1.7567,
+      "step": 12270
+    },
+    {
+      "epoch": 8.95,
+      "grad_norm": 0.2409505546092987,
+      "learning_rate": 2.1782178217821783e-05,
+      "loss": 1.7556,
+      "step": 12280
+    },
+    {
+      "epoch": 8.95,
+      "grad_norm": 0.23632997274398804,
+      "learning_rate": 2.0792079207920794e-05,
+      "loss": 1.7573,
+      "step": 12290
+    },
+    {
+      "epoch": 8.96,
+      "grad_norm": 0.22292740643024445,
+      "learning_rate": 1.9801980198019803e-05,
+      "loss": 1.757,
+      "step": 12300
+    },
+    {
+      "epoch": 8.97,
+      "grad_norm": 0.2350420504808426,
+      "learning_rate": 1.881188118811881e-05,
+      "loss": 1.756,
+      "step": 12310
+    },
+    {
+      "epoch": 8.98,
+      "grad_norm": 0.22938278317451477,
+      "learning_rate": 1.782178217821782e-05,
+      "loss": 1.7562,
+      "step": 12320
+    },
+    {
+      "epoch": 8.98,
+      "grad_norm": 0.2246268391609192,
+      "learning_rate": 1.6831683168316834e-05,
+      "loss": 1.7556,
+      "step": 12330
+    },
+    {
+      "epoch": 8.98,
+      "eval_accuracy": 0.652376308176148,
+      "eval_loss": 1.6073620319366455,
+      "eval_runtime": 1088.9818,
+      "eval_samples_per_second": 458.575,
+      "eval_steps_per_second": 2.048,
+      "step": 12330
+    },
+    {
+      "epoch": 8.99,
+      "grad_norm": 0.22820483148097992,
+      "learning_rate": 1.5841584158415843e-05,
+      "loss": 1.7564,
+      "step": 12340
+    },
+    {
+      "epoch": 9.0,
+      "grad_norm": 0.2315167486667633,
+      "learning_rate": 1.4851485148514851e-05,
+      "loss": 1.7558,
+      "step": 12350
+    },
+    {
+      "epoch": 9.01,
+      "grad_norm": 0.21513350307941437,
+      "learning_rate": 1.3861386138613861e-05,
+      "loss": 1.757,
+      "step": 12360
+    },
+    {
+      "epoch": 9.01,
+      "grad_norm": 0.21538245677947998,
+      "learning_rate": 1.2871287128712871e-05,
+      "loss": 1.7527,
+      "step": 12370
+    },
+    {
+      "epoch": 9.02,
+      "grad_norm": 0.22796376049518585,
+      "learning_rate": 1.1881188118811881e-05,
+      "loss": 1.7549,
+      "step": 12380
+    },
+    {
+      "epoch": 9.03,
+      "grad_norm": 0.21846508979797363,
+      "learning_rate": 1.0891089108910891e-05,
+      "loss": 1.7527,
+      "step": 12390
+    },
+    {
+      "epoch": 9.03,
+      "grad_norm": 0.2252340316772461,
+      "learning_rate": 9.900990099009901e-06,
+      "loss": 1.757,
+      "step": 12400
+    },
+    {
+      "epoch": 9.04,
+      "grad_norm": 0.22679966688156128,
+      "learning_rate": 8.91089108910891e-06,
+      "loss": 1.7547,
+      "step": 12410
+    },
+    {
+      "epoch": 9.05,
+      "grad_norm": 0.21749068796634674,
+      "learning_rate": 7.920792079207921e-06,
+      "loss": 1.755,
+      "step": 12420
+    },
+    {
+      "epoch": 9.05,
+      "eval_accuracy": 0.6525192559694988,
+      "eval_loss": 1.6068978309631348,
+      "eval_runtime": 1087.147,
+      "eval_samples_per_second": 459.349,
+      "eval_steps_per_second": 2.051,
+      "step": 12420
     }
   ],
   "logging_steps": 10,
@@ -9590,7 +9950,7 @@
   "num_input_tokens_seen": 0,
   "num_train_epochs": 10,
   "save_steps": 90,
-  "total_flos": 1.2438771143111148e+18,
+  "total_flos": 1.28938481325833e+18,
   "train_batch_size": 192,
   "trial_name": null,
   "trial_params": null
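
The added entries above are the tail of the trainer's logged history for steps 11980 through 12420. A minimal sketch of reading them back, assuming the updated trainer_state.json has been downloaded locally and that the list shown in the diff is the usual "log_history" field:

import json

# Load the state file saved alongside the checkpoint (local copy assumed).
with open("trainer_state.json") as f:
    state = json.load(f)

print("best_metric:", state["best_metric"])   # 1.6068978309631348 in this commit
print("global_step:", state["global_step"])   # 12420 in this commit

# Training steps carry "loss"; evaluation runs carry "eval_loss".
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print("last train loss:", train_logs[-1]["step"], train_logs[-1]["loss"])
print("last eval loss:", eval_logs[-1]["step"], eval_logs[-1]["eval_loss"])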