Training in progress, step 62000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0c05cde8285dd52085342b46430f4e5412103d775ef2ecb3ff92fe973f05563a
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:13bd3612e3785a0d69245374e9d503a45ff63d121c602ad8a1a69ce58b21ee6f
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b604bf86b8b70beb6e4043604c61f8577f1fbe75a9d1e20249b5622ec5aa2654
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3b86dd42ce2bfa419ab9d950fa2e032bc9074c23516cf132dad718a38dfd9a2d
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -10866,11 +10866,189 @@
|
|
| 10866 |
"eval_steps_per_second": 23.343,
|
| 10867 |
"num_input_tokens_seen": 15990779456,
|
| 10868 |
"step": 61000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10869 |
}
|
| 10870 |
],
|
| 10871 |
"logging_steps": 50,
|
| 10872 |
"max_steps": 70000,
|
| 10873 |
-
"num_input_tokens_seen":
|
| 10874 |
"num_train_epochs": 1,
|
| 10875 |
"save_steps": 1000,
|
| 10876 |
"stateful_callbacks": {
|
|
@@ -10885,7 +11063,7 @@
|
|
| 10885 |
"attributes": {}
|
| 10886 |
}
|
| 10887 |
},
|
| 10888 |
-
"total_flos": 4.
|
| 10889 |
"train_batch_size": 64,
|
| 10890 |
"trial_name": null,
|
| 10891 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.2957415600367292,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 62000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 10866 |
"eval_steps_per_second": 23.343,
|
| 10867 |
"num_input_tokens_seen": 15990779456,
|
| 10868 |
"step": 61000
|
| 10869 |
+
},
|
| 10870 |
+
{
|
| 10871 |
+
"epoch": 0.2912100361329406,
|
| 10872 |
+
"grad_norm": 0.19767510890960693,
|
| 10873 |
+
"learning_rate": 0.0007118738970516943,
|
| 10874 |
+
"loss": 2.5963,
|
| 10875 |
+
"num_input_tokens_seen": 16003886656,
|
| 10876 |
+
"step": 61050
|
| 10877 |
+
},
|
| 10878 |
+
{
|
| 10879 |
+
"epoch": 0.2914485373910347,
|
| 10880 |
+
"grad_norm": 0.21463529765605927,
|
| 10881 |
+
"learning_rate": 0.0007067792524832604,
|
| 10882 |
+
"loss": 2.5825,
|
| 10883 |
+
"num_input_tokens_seen": 16016993856,
|
| 10884 |
+
"step": 61100
|
| 10885 |
+
},
|
| 10886 |
+
{
|
| 10887 |
+
"epoch": 0.29168703864912887,
|
| 10888 |
+
"grad_norm": 0.2011532485485077,
|
| 10889 |
+
"learning_rate": 0.0007016585772004026,
|
| 10890 |
+
"loss": 2.5783,
|
| 10891 |
+
"num_input_tokens_seen": 16030101056,
|
| 10892 |
+
"step": 61150
|
| 10893 |
+
},
|
| 10894 |
+
{
|
| 10895 |
+
"epoch": 0.29192553990722303,
|
| 10896 |
+
"grad_norm": 0.19351401925086975,
|
| 10897 |
+
"learning_rate": 0.0006965125158269618,
|
| 10898 |
+
"loss": 2.5619,
|
| 10899 |
+
"num_input_tokens_seen": 16043208256,
|
| 10900 |
+
"step": 61200
|
| 10901 |
+
},
|
| 10902 |
+
{
|
| 10903 |
+
"epoch": 0.29216404116531713,
|
| 10904 |
+
"grad_norm": 0.1988568902015686,
|
| 10905 |
+
"learning_rate": 0.000691341716182545,
|
| 10906 |
+
"loss": 2.6007,
|
| 10907 |
+
"num_input_tokens_seen": 16056315456,
|
| 10908 |
+
"step": 61250
|
| 10909 |
+
},
|
| 10910 |
+
{
|
| 10911 |
+
"epoch": 0.2924025424234113,
|
| 10912 |
+
"grad_norm": 0.20459413528442383,
|
| 10913 |
+
"learning_rate": 0.0006861468292009726,
|
| 10914 |
+
"loss": 2.5762,
|
| 10915 |
+
"num_input_tokens_seen": 16069422656,
|
| 10916 |
+
"step": 61300
|
| 10917 |
+
},
|
| 10918 |
+
{
|
| 10919 |
+
"epoch": 0.2926410436815054,
|
| 10920 |
+
"grad_norm": 0.1914205551147461,
|
| 10921 |
+
"learning_rate": 0.0006809285088483361,
|
| 10922 |
+
"loss": 2.5734,
|
| 10923 |
+
"num_input_tokens_seen": 16082529856,
|
| 10924 |
+
"step": 61350
|
| 10925 |
+
},
|
| 10926 |
+
{
|
| 10927 |
+
"epoch": 0.29287954493959956,
|
| 10928 |
+
"grad_norm": 0.194325253367424,
|
| 10929 |
+
"learning_rate": 0.0006756874120406714,
|
| 10930 |
+
"loss": 2.5874,
|
| 10931 |
+
"num_input_tokens_seen": 16095637056,
|
| 10932 |
+
"step": 61400
|
| 10933 |
+
},
|
| 10934 |
+
{
|
| 10935 |
+
"epoch": 0.2931180461976937,
|
| 10936 |
+
"grad_norm": 0.20854853093624115,
|
| 10937 |
+
"learning_rate": 0.0006704241985612625,
|
| 10938 |
+
"loss": 2.5865,
|
| 10939 |
+
"num_input_tokens_seen": 16108744256,
|
| 10940 |
+
"step": 61450
|
| 10941 |
+
},
|
| 10942 |
+
{
|
| 10943 |
+
"epoch": 0.2933565474557878,
|
| 10944 |
+
"grad_norm": 0.190395787358284,
|
| 10945 |
+
"learning_rate": 0.0006651395309775837,
|
| 10946 |
+
"loss": 2.5716,
|
| 10947 |
+
"num_input_tokens_seen": 16121851456,
|
| 10948 |
+
"step": 61500
|
| 10949 |
+
},
|
| 10950 |
+
{
|
| 10951 |
+
"epoch": 0.2933565474557878,
|
| 10952 |
+
"eval_loss": 2.4551966190338135,
|
| 10953 |
+
"eval_runtime": 53.3343,
|
| 10954 |
+
"eval_samples_per_second": 93.748,
|
| 10955 |
+
"eval_steps_per_second": 23.437,
|
| 10956 |
+
"num_input_tokens_seen": 16121851456,
|
| 10957 |
+
"step": 61500
|
| 10958 |
+
},
|
| 10959 |
+
{
|
| 10960 |
+
"epoch": 0.293595048713882,
|
| 10961 |
+
"grad_norm": 0.20652073621749878,
|
| 10962 |
+
"learning_rate": 0.0006598340745578908,
|
| 10963 |
+
"loss": 2.5765,
|
| 10964 |
+
"num_input_tokens_seen": 16134958656,
|
| 10965 |
+
"step": 61550
|
| 10966 |
+
},
|
| 10967 |
+
{
|
| 10968 |
+
"epoch": 0.2938335499719761,
|
| 10969 |
+
"grad_norm": 0.20701836049556732,
|
| 10970 |
+
"learning_rate": 0.0006545084971874737,
|
| 10971 |
+
"loss": 2.5653,
|
| 10972 |
+
"num_input_tokens_seen": 16148065856,
|
| 10973 |
+
"step": 61600
|
| 10974 |
+
},
|
| 10975 |
+
{
|
| 10976 |
+
"epoch": 0.29407205123007024,
|
| 10977 |
+
"grad_norm": 0.1792392134666443,
|
| 10978 |
+
"learning_rate": 0.000649163469284578,
|
| 10979 |
+
"loss": 2.577,
|
| 10980 |
+
"num_input_tokens_seen": 16161173056,
|
| 10981 |
+
"step": 61650
|
| 10982 |
+
},
|
| 10983 |
+
{
|
| 10984 |
+
"epoch": 0.2943105524881644,
|
| 10985 |
+
"grad_norm": 0.21742790937423706,
|
| 10986 |
+
"learning_rate": 0.0006437996637160086,
|
| 10987 |
+
"loss": 2.574,
|
| 10988 |
+
"num_input_tokens_seen": 16174280256,
|
| 10989 |
+
"step": 61700
|
| 10990 |
+
},
|
| 10991 |
+
{
|
| 10992 |
+
"epoch": 0.2945490537462585,
|
| 10993 |
+
"grad_norm": 0.20747682452201843,
|
| 10994 |
+
"learning_rate": 0.0006384177557124247,
|
| 10995 |
+
"loss": 2.564,
|
| 10996 |
+
"num_input_tokens_seen": 16187387456,
|
| 10997 |
+
"step": 61750
|
| 10998 |
+
},
|
| 10999 |
+
{
|
| 11000 |
+
"epoch": 0.29478755500435266,
|
| 11001 |
+
"grad_norm": 0.19990311563014984,
|
| 11002 |
+
"learning_rate": 0.0006330184227833376,
|
| 11003 |
+
"loss": 2.5866,
|
| 11004 |
+
"num_input_tokens_seen": 16200494656,
|
| 11005 |
+
"step": 61800
|
| 11006 |
+
},
|
| 11007 |
+
{
|
| 11008 |
+
"epoch": 0.29502605626244677,
|
| 11009 |
+
"grad_norm": 0.20410317182540894,
|
| 11010 |
+
"learning_rate": 0.0006276023446318213,
|
| 11011 |
+
"loss": 2.5559,
|
| 11012 |
+
"num_input_tokens_seen": 16213601856,
|
| 11013 |
+
"step": 61850
|
| 11014 |
+
},
|
| 11015 |
+
{
|
| 11016 |
+
"epoch": 0.2952645575205409,
|
| 11017 |
+
"grad_norm": 0.19365034997463226,
|
| 11018 |
+
"learning_rate": 0.000622170203068947,
|
| 11019 |
+
"loss": 2.5705,
|
| 11020 |
+
"num_input_tokens_seen": 16226709056,
|
| 11021 |
+
"step": 61900
|
| 11022 |
+
},
|
| 11023 |
+
{
|
| 11024 |
+
"epoch": 0.29550305877863503,
|
| 11025 |
+
"grad_norm": 0.2115161269903183,
|
| 11026 |
+
"learning_rate": 0.0006167226819279528,
|
| 11027 |
+
"loss": 2.5621,
|
| 11028 |
+
"num_input_tokens_seen": 16239816256,
|
| 11029 |
+
"step": 61950
|
| 11030 |
+
},
|
| 11031 |
+
{
|
| 11032 |
+
"epoch": 0.2957415600367292,
|
| 11033 |
+
"grad_norm": 0.22992485761642456,
|
| 11034 |
+
"learning_rate": 0.0006112604669781572,
|
| 11035 |
+
"loss": 2.5587,
|
| 11036 |
+
"num_input_tokens_seen": 16252923456,
|
| 11037 |
+
"step": 62000
|
| 11038 |
+
},
|
| 11039 |
+
{
|
| 11040 |
+
"epoch": 0.2957415600367292,
|
| 11041 |
+
"eval_loss": 2.452096462249756,
|
| 11042 |
+
"eval_runtime": 53.6354,
|
| 11043 |
+
"eval_samples_per_second": 93.222,
|
| 11044 |
+
"eval_steps_per_second": 23.306,
|
| 11045 |
+
"num_input_tokens_seen": 16252923456,
|
| 11046 |
+
"step": 62000
|
| 11047 |
}
|
| 11048 |
],
|
| 11049 |
"logging_steps": 50,
|
| 11050 |
"max_steps": 70000,
|
| 11051 |
+
"num_input_tokens_seen": 16252923456,
|
| 11052 |
"num_train_epochs": 1,
|
| 11053 |
"save_steps": 1000,
|
| 11054 |
"stateful_callbacks": {
|
|
|
|
| 11063 |
"attributes": {}
|
| 11064 |
}
|
| 11065 |
},
|
| 11066 |
+
"total_flos": 4.3478156530129306e+18,
|
| 11067 |
"train_batch_size": 64,
|
| 11068 |
"trial_name": null,
|
| 11069 |
"trial_params": null
|