Wilsonwin commited on
Commit
8dbb3f2
·
verified ·
1 Parent(s): f186bbb

Training in progress, step 10000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cb42262712eb0446298aefaf9502d1bce878381fa4256b4c412cb875cf7676dd
3
  size 328277848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9da6829b1edfacc61441699b4ac6d5dc6abb737be9152be8f29e5862abecd54
3
  size 328277848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:37b61fef6d8dab3892dcb676937372c6938b18c4b8be84f3a00936c78dd241b6
3
  size 318646859
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d9c41bcb1f7e3d0ff7cf1e9246c52eba5532bd32a5af7bbe5d88c8501561fc3
3
  size 318646859
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:482021b320968c1aef3bb227f66c018b401e7317860a8a4bae46f36ed2c71427
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0f02b717c272316648da49ca6391d63601d6d8e37a3b73ce0655aa44e0b1efd
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4784f3b1ac308d4093c525f58ebfb1ed5c4e7ca17828bd58e2e6a8e2baed20b5
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53471871a37f3cc35b4a656a6f0cfda18046c304a91d9bf8b29b14eea2ccc156
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.6050008447372868,
6
  "eval_steps": 500,
7
- "global_step": 9500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -6817,6 +6817,364 @@
6817
  "eval_samples_per_second": 129.654,
6818
  "eval_steps_per_second": 2.723,
6819
  "step": 9500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6820
  }
6821
  ],
6822
  "logging_steps": 10,
@@ -6836,7 +7194,7 @@
6836
  "attributes": {}
6837
  }
6838
  },
6839
- "total_flos": 3.177318894608056e+17,
6840
  "train_batch_size": 48,
6841
  "trial_name": null,
6842
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.6894745734076704,
6
  "eval_steps": 500,
7
+ "global_step": 10000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
6817
  "eval_samples_per_second": 129.654,
6818
  "eval_steps_per_second": 2.723,
6819
  "step": 9500
6820
+ },
6821
+ {
6822
+ "epoch": 1.6066903193106943,
6823
+ "grad_norm": 0.4606820344924927,
6824
+ "learning_rate": 3.960727047894527e-05,
6825
+ "loss": 4.359199523925781,
6826
+ "step": 9510
6827
+ },
6828
+ {
6829
+ "epoch": 1.608379793884102,
6830
+ "grad_norm": 0.48804476857185364,
6831
+ "learning_rate": 3.928353538569023e-05,
6832
+ "loss": 4.32340087890625,
6833
+ "step": 9520
6834
+ },
6835
+ {
6836
+ "epoch": 1.6100692684575098,
6837
+ "grad_norm": 0.4648666977882385,
6838
+ "learning_rate": 3.8960929302853074e-05,
6839
+ "loss": 4.31898078918457,
6840
+ "step": 9530
6841
+ },
6842
+ {
6843
+ "epoch": 1.6117587430309173,
6844
+ "grad_norm": 0.48212724924087524,
6845
+ "learning_rate": 3.863945552014892e-05,
6846
+ "loss": 4.320017242431641,
6847
+ "step": 9540
6848
+ },
6849
+ {
6850
+ "epoch": 1.6134482176043252,
6851
+ "grad_norm": 0.46979817748069763,
6852
+ "learning_rate": 3.831911731574648e-05,
6853
+ "loss": 4.365304946899414,
6854
+ "step": 9550
6855
+ },
6856
+ {
6857
+ "epoch": 1.6151376921777327,
6858
+ "grad_norm": 0.47188496589660645,
6859
+ "learning_rate": 3.799991795623471e-05,
6860
+ "loss": 4.329359817504883,
6861
+ "step": 9560
6862
+ },
6863
+ {
6864
+ "epoch": 1.6168271667511402,
6865
+ "grad_norm": 0.47442197799682617,
6866
+ "learning_rate": 3.7681860696589216e-05,
6867
+ "loss": 4.333200836181641,
6868
+ "step": 9570
6869
+ },
6870
+ {
6871
+ "epoch": 1.6185166413245482,
6872
+ "grad_norm": 0.46460849046707153,
6873
+ "learning_rate": 3.7364948780139344e-05,
6874
+ "loss": 4.2955772399902346,
6875
+ "step": 9580
6876
+ },
6877
+ {
6878
+ "epoch": 1.6202061158979557,
6879
+ "grad_norm": 0.4687038064002991,
6880
+ "learning_rate": 3.70491854385351e-05,
6881
+ "loss": 4.287596893310547,
6882
+ "step": 9590
6883
+ },
6884
+ {
6885
+ "epoch": 1.6218955904713634,
6886
+ "grad_norm": 0.4717998802661896,
6887
+ "learning_rate": 3.673457389171401e-05,
6888
+ "loss": 4.3026374816894535,
6889
+ "step": 9600
6890
+ },
6891
+ {
6892
+ "epoch": 1.6235850650447712,
6893
+ "grad_norm": 0.47237226366996765,
6894
+ "learning_rate": 3.642111734786833e-05,
6895
+ "loss": 4.3385662078857425,
6896
+ "step": 9610
6897
+ },
6898
+ {
6899
+ "epoch": 1.6252745396181787,
6900
+ "grad_norm": 0.48337623476982117,
6901
+ "learning_rate": 3.610881900341261e-05,
6902
+ "loss": 4.29266357421875,
6903
+ "step": 9620
6904
+ },
6905
+ {
6906
+ "epoch": 1.6269640141915864,
6907
+ "grad_norm": 0.46639102697372437,
6908
+ "learning_rate": 3.579768204295063e-05,
6909
+ "loss": 4.3327476501464846,
6910
+ "step": 9630
6911
+ },
6912
+ {
6913
+ "epoch": 1.6286534887649942,
6914
+ "grad_norm": 0.4697898030281067,
6915
+ "learning_rate": 3.54877096392434e-05,
6916
+ "loss": 4.336753463745117,
6917
+ "step": 9640
6918
+ },
6919
+ {
6920
+ "epoch": 1.6303429633384017,
6921
+ "grad_norm": 0.46316251158714294,
6922
+ "learning_rate": 3.5178904953176354e-05,
6923
+ "loss": 4.306925964355469,
6924
+ "step": 9650
6925
+ },
6926
+ {
6927
+ "epoch": 1.6320324379118094,
6928
+ "grad_norm": 0.4708452820777893,
6929
+ "learning_rate": 3.487127113372755e-05,
6930
+ "loss": 4.326674270629883,
6931
+ "step": 9660
6932
+ },
6933
+ {
6934
+ "epoch": 1.6337219124852171,
6935
+ "grad_norm": 0.4727766811847687,
6936
+ "learning_rate": 3.4564811317935235e-05,
6937
+ "loss": 4.304772186279297,
6938
+ "step": 9670
6939
+ },
6940
+ {
6941
+ "epoch": 1.6354113870586247,
6942
+ "grad_norm": 0.47584787011146545,
6943
+ "learning_rate": 3.4259528630865995e-05,
6944
+ "loss": 4.3285400390625,
6945
+ "step": 9680
6946
+ },
6947
+ {
6948
+ "epoch": 1.6371008616320324,
6949
+ "grad_norm": 0.4718579947948456,
6950
+ "learning_rate": 3.3955426185582826e-05,
6951
+ "loss": 4.310879135131836,
6952
+ "step": 9690
6953
+ },
6954
+ {
6955
+ "epoch": 1.6387903362054401,
6956
+ "grad_norm": 0.466880738735199,
6957
+ "learning_rate": 3.365250708311352e-05,
6958
+ "loss": 4.325877380371094,
6959
+ "step": 9700
6960
+ },
6961
+ {
6962
+ "epoch": 1.6404798107788476,
6963
+ "grad_norm": 0.46377378702163696,
6964
+ "learning_rate": 3.335077441241895e-05,
6965
+ "loss": 4.307848358154297,
6966
+ "step": 9710
6967
+ },
6968
+ {
6969
+ "epoch": 1.6421692853522556,
6970
+ "grad_norm": 0.718170166015625,
6971
+ "learning_rate": 3.305023125036148e-05,
6972
+ "loss": 4.313734436035157,
6973
+ "step": 9720
6974
+ },
6975
+ {
6976
+ "epoch": 1.643858759925663,
6977
+ "grad_norm": 0.463375985622406,
6978
+ "learning_rate": 3.275088066167369e-05,
6979
+ "loss": 4.3089752197265625,
6980
+ "step": 9730
6981
+ },
6982
+ {
6983
+ "epoch": 1.6455482344990708,
6984
+ "grad_norm": 0.47580841183662415,
6985
+ "learning_rate": 3.245272569892727e-05,
6986
+ "loss": 4.3522186279296875,
6987
+ "step": 9740
6988
+ },
6989
+ {
6990
+ "epoch": 1.6472377090724786,
6991
+ "grad_norm": 0.46081092953681946,
6992
+ "learning_rate": 3.215576940250155e-05,
6993
+ "loss": 4.3113548278808596,
6994
+ "step": 9750
6995
+ },
6996
+ {
6997
+ "epoch": 1.648927183645886,
6998
+ "grad_norm": 0.47329118847846985,
6999
+ "learning_rate": 3.1860014800552734e-05,
7000
+ "loss": 4.3111930847167965,
7001
+ "step": 9760
7002
+ },
7003
+ {
7004
+ "epoch": 1.6506166582192938,
7005
+ "grad_norm": 0.4813630282878876,
7006
+ "learning_rate": 3.15654649089831e-05,
7007
+ "loss": 4.312236404418945,
7008
+ "step": 9770
7009
+ },
7010
+ {
7011
+ "epoch": 1.6523061327927016,
7012
+ "grad_norm": 0.5134222507476807,
7013
+ "learning_rate": 3.1272122731409916e-05,
7014
+ "loss": 4.3267356872558596,
7015
+ "step": 9780
7016
+ },
7017
+ {
7018
+ "epoch": 1.653995607366109,
7019
+ "grad_norm": 0.4687715768814087,
7020
+ "learning_rate": 3.097999125913518e-05,
7021
+ "loss": 4.311066055297852,
7022
+ "step": 9790
7023
+ },
7024
+ {
7025
+ "epoch": 1.6556850819395168,
7026
+ "grad_norm": 0.4736403524875641,
7027
+ "learning_rate": 3.068907347111485e-05,
7028
+ "loss": 4.3107654571533205,
7029
+ "step": 9800
7030
+ },
7031
+ {
7032
+ "epoch": 1.6573745565129245,
7033
+ "grad_norm": 0.4813496172428131,
7034
+ "learning_rate": 3.0399372333928644e-05,
7035
+ "loss": 4.314376449584961,
7036
+ "step": 9810
7037
+ },
7038
+ {
7039
+ "epoch": 1.659064031086332,
7040
+ "grad_norm": 0.49036741256713867,
7041
+ "learning_rate": 3.0110890801749627e-05,
7042
+ "loss": 4.307826995849609,
7043
+ "step": 9820
7044
+ },
7045
+ {
7046
+ "epoch": 1.6607535056597398,
7047
+ "grad_norm": 0.4669703543186188,
7048
+ "learning_rate": 2.982363181631418e-05,
7049
+ "loss": 4.303530883789063,
7050
+ "step": 9830
7051
+ },
7052
+ {
7053
+ "epoch": 1.6624429802331475,
7054
+ "grad_norm": 0.4788713753223419,
7055
+ "learning_rate": 2.9537598306892103e-05,
7056
+ "loss": 4.308844375610351,
7057
+ "step": 9840
7058
+ },
7059
+ {
7060
+ "epoch": 1.664132454806555,
7061
+ "grad_norm": 0.5307414531707764,
7062
+ "learning_rate": 2.9252793190256447e-05,
7063
+ "loss": 4.285565567016602,
7064
+ "step": 9850
7065
+ },
7066
+ {
7067
+ "epoch": 1.665821929379963,
7068
+ "grad_norm": 0.4659578502178192,
7069
+ "learning_rate": 2.896921937065419e-05,
7070
+ "loss": 4.313910675048828,
7071
+ "step": 9860
7072
+ },
7073
+ {
7074
+ "epoch": 1.6675114039533705,
7075
+ "grad_norm": 0.46300381422042847,
7076
+ "learning_rate": 2.8686879739776137e-05,
7077
+ "loss": 4.31811408996582,
7078
+ "step": 9870
7079
+ },
7080
+ {
7081
+ "epoch": 1.669200878526778,
7082
+ "grad_norm": 0.4717971086502075,
7083
+ "learning_rate": 2.8405777176727924e-05,
7084
+ "loss": 4.318044662475586,
7085
+ "step": 9880
7086
+ },
7087
+ {
7088
+ "epoch": 1.670890353100186,
7089
+ "grad_norm": 0.45347994565963745,
7090
+ "learning_rate": 2.8125914548000243e-05,
7091
+ "loss": 4.295824432373047,
7092
+ "step": 9890
7093
+ },
7094
+ {
7095
+ "epoch": 1.6725798276735935,
7096
+ "grad_norm": 0.4703952670097351,
7097
+ "learning_rate": 2.7847294707439828e-05,
7098
+ "loss": 4.28874626159668,
7099
+ "step": 9900
7100
+ },
7101
+ {
7102
+ "epoch": 1.6742693022470012,
7103
+ "grad_norm": 0.4726548194885254,
7104
+ "learning_rate": 2.7569920496220398e-05,
7105
+ "loss": 4.304931259155273,
7106
+ "step": 9910
7107
+ },
7108
+ {
7109
+ "epoch": 1.675958776820409,
7110
+ "grad_norm": 0.47394225001335144,
7111
+ "learning_rate": 2.729379474281352e-05,
7112
+ "loss": 4.3050182342529295,
7113
+ "step": 9920
7114
+ },
7115
+ {
7116
+ "epoch": 1.6776482513938165,
7117
+ "grad_norm": 0.49833500385284424,
7118
+ "learning_rate": 2.701892026295979e-05,
7119
+ "loss": 4.331858062744141,
7120
+ "step": 9930
7121
+ },
7122
+ {
7123
+ "epoch": 1.6793377259672242,
7124
+ "grad_norm": 0.4709710478782654,
7125
+ "learning_rate": 2.6745299859640318e-05,
7126
+ "loss": 4.332807159423828,
7127
+ "step": 9940
7128
+ },
7129
+ {
7130
+ "epoch": 1.681027200540632,
7131
+ "grad_norm": 0.48379939794540405,
7132
+ "learning_rate": 2.6472936323047972e-05,
7133
+ "loss": 4.311476516723633,
7134
+ "step": 9950
7135
+ },
7136
+ {
7137
+ "epoch": 1.6827166751140394,
7138
+ "grad_norm": 0.475941926240921,
7139
+ "learning_rate": 2.6201832430558866e-05,
7140
+ "loss": 4.314311599731445,
7141
+ "step": 9960
7142
+ },
7143
+ {
7144
+ "epoch": 1.6844061496874472,
7145
+ "grad_norm": 0.4633561372756958,
7146
+ "learning_rate": 2.5931990946704206e-05,
7147
+ "loss": 4.312783050537109,
7148
+ "step": 9970
7149
+ },
7150
+ {
7151
+ "epoch": 1.686095624260855,
7152
+ "grad_norm": 0.4624374806880951,
7153
+ "learning_rate": 2.5663414623141943e-05,
7154
+ "loss": 4.315936279296875,
7155
+ "step": 9980
7156
+ },
7157
+ {
7158
+ "epoch": 1.6877850988342624,
7159
+ "grad_norm": 0.46104687452316284,
7160
+ "learning_rate": 2.5396106198628947e-05,
7161
+ "loss": 4.317576217651367,
7162
+ "step": 9990
7163
+ },
7164
+ {
7165
+ "epoch": 1.6894745734076704,
7166
+ "grad_norm": 0.46486878395080566,
7167
+ "learning_rate": 2.5130068398992716e-05,
7168
+ "loss": 4.3148681640625,
7169
+ "step": 10000
7170
+ },
7171
+ {
7172
+ "epoch": 1.6894745734076704,
7173
+ "eval_loss": 4.282918930053711,
7174
+ "eval_runtime": 3.8826,
7175
+ "eval_samples_per_second": 257.563,
7176
+ "eval_steps_per_second": 5.409,
7177
+ "step": 10000
7178
  }
7179
  ],
7180
  "logging_steps": 10,
 
7194
  "attributes": {}
7195
  }
7196
  },
7197
+ "total_flos": 3.344547305037496e+17,
7198
  "train_batch_size": 48,
7199
  "trial_name": null,
7200
  "trial_params": null