Training checkpoint at step 33500
Browse files- trainer_state.json +185 -5
trainer_state.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
-
"best_global_step":
|
| 3 |
-
"best_metric": 2.
|
| 4 |
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/google/gemma-3-1b-it/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_34/checkpoint-33000",
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 100,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -11888,6 +11888,186 @@
|
|
| 11888 |
"eval_samples_per_second": 2.471,
|
| 11889 |
"eval_steps_per_second": 1.235,
|
| 11890 |
"step": 33000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11891 |
}
|
| 11892 |
],
|
| 11893 |
"logging_steps": 25,
|
|
@@ -11907,7 +12087,7 @@
|
|
| 11907 |
"attributes": {}
|
| 11908 |
}
|
| 11909 |
},
|
| 11910 |
-
"total_flos": 7.
|
| 11911 |
"train_batch_size": 1,
|
| 11912 |
"trial_name": null,
|
| 11913 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
+
"best_global_step": 33400,
|
| 3 |
+
"best_metric": 2.48046875,
|
| 4 |
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/google/gemma-3-1b-it/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_34/checkpoint-33000",
|
| 5 |
+
"epoch": 0.67,
|
| 6 |
"eval_steps": 100,
|
| 7 |
+
"global_step": 33500,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 11888 |
"eval_samples_per_second": 2.471,
|
| 11889 |
"eval_steps_per_second": 1.235,
|
| 11890 |
"step": 33000
|
| 11891 |
+
},
|
| 11892 |
+
{
|
| 11893 |
+
"epoch": 0.6605,
|
| 11894 |
+
"grad_norm": 1.4210561637451704,
|
| 11895 |
+
"learning_rate": 3.7724444444444445e-06,
|
| 11896 |
+
"loss": 2.4701,
|
| 11897 |
+
"step": 33025
|
| 11898 |
+
},
|
| 11899 |
+
{
|
| 11900 |
+
"epoch": 0.661,
|
| 11901 |
+
"grad_norm": 2.1617711578734915,
|
| 11902 |
+
"learning_rate": 3.7668888888888893e-06,
|
| 11903 |
+
"loss": 2.4788,
|
| 11904 |
+
"step": 33050
|
| 11905 |
+
},
|
| 11906 |
+
{
|
| 11907 |
+
"epoch": 0.6615,
|
| 11908 |
+
"grad_norm": 1.5320070011690634,
|
| 11909 |
+
"learning_rate": 3.7613333333333333e-06,
|
| 11910 |
+
"loss": 2.47,
|
| 11911 |
+
"step": 33075
|
| 11912 |
+
},
|
| 11913 |
+
{
|
| 11914 |
+
"epoch": 0.662,
|
| 11915 |
+
"grad_norm": 1.3400391346818974,
|
| 11916 |
+
"learning_rate": 3.755777777777778e-06,
|
| 11917 |
+
"loss": 2.4792,
|
| 11918 |
+
"step": 33100
|
| 11919 |
+
},
|
| 11920 |
+
{
|
| 11921 |
+
"epoch": 0.662,
|
| 11922 |
+
"eval_loss": 2.481220006942749,
|
| 11923 |
+
"eval_runtime": 41.9988,
|
| 11924 |
+
"eval_samples_per_second": 2.476,
|
| 11925 |
+
"eval_steps_per_second": 1.238,
|
| 11926 |
+
"step": 33100
|
| 11927 |
+
},
|
| 11928 |
+
{
|
| 11929 |
+
"epoch": 0.6625,
|
| 11930 |
+
"grad_norm": 1.769214182110644,
|
| 11931 |
+
"learning_rate": 3.7502222222222225e-06,
|
| 11932 |
+
"loss": 2.4669,
|
| 11933 |
+
"step": 33125
|
| 11934 |
+
},
|
| 11935 |
+
{
|
| 11936 |
+
"epoch": 0.663,
|
| 11937 |
+
"grad_norm": 1.4499877924560598,
|
| 11938 |
+
"learning_rate": 3.744666666666667e-06,
|
| 11939 |
+
"loss": 2.47,
|
| 11940 |
+
"step": 33150
|
| 11941 |
+
},
|
| 11942 |
+
{
|
| 11943 |
+
"epoch": 0.6635,
|
| 11944 |
+
"grad_norm": 1.2272721031062317,
|
| 11945 |
+
"learning_rate": 3.7391111111111118e-06,
|
| 11946 |
+
"loss": 2.4808,
|
| 11947 |
+
"step": 33175
|
| 11948 |
+
},
|
| 11949 |
+
{
|
| 11950 |
+
"epoch": 0.664,
|
| 11951 |
+
"grad_norm": 1.9524804604619508,
|
| 11952 |
+
"learning_rate": 3.7335555555555557e-06,
|
| 11953 |
+
"loss": 2.4855,
|
| 11954 |
+
"step": 33200
|
| 11955 |
+
},
|
| 11956 |
+
{
|
| 11957 |
+
"epoch": 0.664,
|
| 11958 |
+
"eval_loss": 2.481595516204834,
|
| 11959 |
+
"eval_runtime": 42.0663,
|
| 11960 |
+
"eval_samples_per_second": 2.472,
|
| 11961 |
+
"eval_steps_per_second": 1.236,
|
| 11962 |
+
"step": 33200
|
| 11963 |
+
},
|
| 11964 |
+
{
|
| 11965 |
+
"epoch": 0.6645,
|
| 11966 |
+
"grad_norm": 1.5855882612813827,
|
| 11967 |
+
"learning_rate": 3.7280000000000006e-06,
|
| 11968 |
+
"loss": 2.4839,
|
| 11969 |
+
"step": 33225
|
| 11970 |
+
},
|
| 11971 |
+
{
|
| 11972 |
+
"epoch": 0.665,
|
| 11973 |
+
"grad_norm": 1.7981131055660284,
|
| 11974 |
+
"learning_rate": 3.7224444444444445e-06,
|
| 11975 |
+
"loss": 2.4831,
|
| 11976 |
+
"step": 33250
|
| 11977 |
+
},
|
| 11978 |
+
{
|
| 11979 |
+
"epoch": 0.6655,
|
| 11980 |
+
"grad_norm": 1.8893217376664102,
|
| 11981 |
+
"learning_rate": 3.7168888888888894e-06,
|
| 11982 |
+
"loss": 2.4689,
|
| 11983 |
+
"step": 33275
|
| 11984 |
+
},
|
| 11985 |
+
{
|
| 11986 |
+
"epoch": 0.666,
|
| 11987 |
+
"grad_norm": 1.5504407193892469,
|
| 11988 |
+
"learning_rate": 3.7113333333333333e-06,
|
| 11989 |
+
"loss": 2.4748,
|
| 11990 |
+
"step": 33300
|
| 11991 |
+
},
|
| 11992 |
+
{
|
| 11993 |
+
"epoch": 0.666,
|
| 11994 |
+
"eval_loss": 2.480543851852417,
|
| 11995 |
+
"eval_runtime": 42.0898,
|
| 11996 |
+
"eval_samples_per_second": 2.471,
|
| 11997 |
+
"eval_steps_per_second": 1.235,
|
| 11998 |
+
"step": 33300
|
| 11999 |
+
},
|
| 12000 |
+
{
|
| 12001 |
+
"epoch": 0.6665,
|
| 12002 |
+
"grad_norm": 1.7468260304964456,
|
| 12003 |
+
"learning_rate": 3.705777777777778e-06,
|
| 12004 |
+
"loss": 2.478,
|
| 12005 |
+
"step": 33325
|
| 12006 |
+
},
|
| 12007 |
+
{
|
| 12008 |
+
"epoch": 0.667,
|
| 12009 |
+
"grad_norm": 1.527950475107732,
|
| 12010 |
+
"learning_rate": 3.700222222222222e-06,
|
| 12011 |
+
"loss": 2.476,
|
| 12012 |
+
"step": 33350
|
| 12013 |
+
},
|
| 12014 |
+
{
|
| 12015 |
+
"epoch": 0.6675,
|
| 12016 |
+
"grad_norm": 1.6583388548480227,
|
| 12017 |
+
"learning_rate": 3.694666666666667e-06,
|
| 12018 |
+
"loss": 2.4675,
|
| 12019 |
+
"step": 33375
|
| 12020 |
+
},
|
| 12021 |
+
{
|
| 12022 |
+
"epoch": 0.668,
|
| 12023 |
+
"grad_norm": 1.5937276979972617,
|
| 12024 |
+
"learning_rate": 3.689111111111112e-06,
|
| 12025 |
+
"loss": 2.471,
|
| 12026 |
+
"step": 33400
|
| 12027 |
+
},
|
| 12028 |
+
{
|
| 12029 |
+
"epoch": 0.668,
|
| 12030 |
+
"eval_loss": 2.48046875,
|
| 12031 |
+
"eval_runtime": 41.9774,
|
| 12032 |
+
"eval_samples_per_second": 2.478,
|
| 12033 |
+
"eval_steps_per_second": 1.239,
|
| 12034 |
+
"step": 33400
|
| 12035 |
+
},
|
| 12036 |
+
{
|
| 12037 |
+
"epoch": 0.6685,
|
| 12038 |
+
"grad_norm": 1.4618192489732302,
|
| 12039 |
+
"learning_rate": 3.6835555555555558e-06,
|
| 12040 |
+
"loss": 2.4687,
|
| 12041 |
+
"step": 33425
|
| 12042 |
+
},
|
| 12043 |
+
{
|
| 12044 |
+
"epoch": 0.669,
|
| 12045 |
+
"grad_norm": 1.686028538338107,
|
| 12046 |
+
"learning_rate": 3.6780000000000006e-06,
|
| 12047 |
+
"loss": 2.4859,
|
| 12048 |
+
"step": 33450
|
| 12049 |
+
},
|
| 12050 |
+
{
|
| 12051 |
+
"epoch": 0.6695,
|
| 12052 |
+
"grad_norm": 1.352022736772511,
|
| 12053 |
+
"learning_rate": 3.6724444444444446e-06,
|
| 12054 |
+
"loss": 2.4611,
|
| 12055 |
+
"step": 33475
|
| 12056 |
+
},
|
| 12057 |
+
{
|
| 12058 |
+
"epoch": 0.67,
|
| 12059 |
+
"grad_norm": 1.9063367987545683,
|
| 12060 |
+
"learning_rate": 3.6668888888888894e-06,
|
| 12061 |
+
"loss": 2.4724,
|
| 12062 |
+
"step": 33500
|
| 12063 |
+
},
|
| 12064 |
+
{
|
| 12065 |
+
"epoch": 0.67,
|
| 12066 |
+
"eval_loss": 2.480919361114502,
|
| 12067 |
+
"eval_runtime": 42.1121,
|
| 12068 |
+
"eval_samples_per_second": 2.47,
|
| 12069 |
+
"eval_steps_per_second": 1.235,
|
| 12070 |
+
"step": 33500
|
| 12071 |
}
|
| 12072 |
],
|
| 12073 |
"logging_steps": 25,
|
|
|
|
| 12087 |
"attributes": {}
|
| 12088 |
}
|
| 12089 |
},
|
| 12090 |
+
"total_flos": 7.518134864593918e+19,
|
| 12091 |
"train_batch_size": 1,
|
| 12092 |
"trial_name": null,
|
| 12093 |
"trial_params": null
|