irodkin committed on
Commit
9c5e2f2
·
verified ·
1 Parent(s): ae5e782

Training checkpoint at step 33500

Browse files
Files changed (1) hide show
  1. trainer_state.json +185 -5
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 33000,
3
- "best_metric": 2.480618953704834,
4
  "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/google/gemma-3-1b-it/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_34/checkpoint-33000",
5
- "epoch": 0.66,
6
  "eval_steps": 100,
7
- "global_step": 33000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -11888,6 +11888,186 @@
11888
  "eval_samples_per_second": 2.471,
11889
  "eval_steps_per_second": 1.235,
11890
  "step": 33000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11891
  }
11892
  ],
11893
  "logging_steps": 25,
@@ -11907,7 +12087,7 @@
11907
  "attributes": {}
11908
  }
11909
  },
11910
- "total_flos": 7.405923896183593e+19,
11911
  "train_batch_size": 1,
11912
  "trial_name": null,
11913
  "trial_params": null
 
1
  {
2
+ "best_global_step": 33400,
3
+ "best_metric": 2.48046875,
4
  "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/google/gemma-3-1b-it/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_34/checkpoint-33000",
5
+ "epoch": 0.67,
6
  "eval_steps": 100,
7
+ "global_step": 33500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
11888
  "eval_samples_per_second": 2.471,
11889
  "eval_steps_per_second": 1.235,
11890
  "step": 33000
11891
+ },
11892
+ {
11893
+ "epoch": 0.6605,
11894
+ "grad_norm": 1.4210561637451704,
11895
+ "learning_rate": 3.7724444444444445e-06,
11896
+ "loss": 2.4701,
11897
+ "step": 33025
11898
+ },
11899
+ {
11900
+ "epoch": 0.661,
11901
+ "grad_norm": 2.1617711578734915,
11902
+ "learning_rate": 3.7668888888888893e-06,
11903
+ "loss": 2.4788,
11904
+ "step": 33050
11905
+ },
11906
+ {
11907
+ "epoch": 0.6615,
11908
+ "grad_norm": 1.5320070011690634,
11909
+ "learning_rate": 3.7613333333333333e-06,
11910
+ "loss": 2.47,
11911
+ "step": 33075
11912
+ },
11913
+ {
11914
+ "epoch": 0.662,
11915
+ "grad_norm": 1.3400391346818974,
11916
+ "learning_rate": 3.755777777777778e-06,
11917
+ "loss": 2.4792,
11918
+ "step": 33100
11919
+ },
11920
+ {
11921
+ "epoch": 0.662,
11922
+ "eval_loss": 2.481220006942749,
11923
+ "eval_runtime": 41.9988,
11924
+ "eval_samples_per_second": 2.476,
11925
+ "eval_steps_per_second": 1.238,
11926
+ "step": 33100
11927
+ },
11928
+ {
11929
+ "epoch": 0.6625,
11930
+ "grad_norm": 1.769214182110644,
11931
+ "learning_rate": 3.7502222222222225e-06,
11932
+ "loss": 2.4669,
11933
+ "step": 33125
11934
+ },
11935
+ {
11936
+ "epoch": 0.663,
11937
+ "grad_norm": 1.4499877924560598,
11938
+ "learning_rate": 3.744666666666667e-06,
11939
+ "loss": 2.47,
11940
+ "step": 33150
11941
+ },
11942
+ {
11943
+ "epoch": 0.6635,
11944
+ "grad_norm": 1.2272721031062317,
11945
+ "learning_rate": 3.7391111111111118e-06,
11946
+ "loss": 2.4808,
11947
+ "step": 33175
11948
+ },
11949
+ {
11950
+ "epoch": 0.664,
11951
+ "grad_norm": 1.9524804604619508,
11952
+ "learning_rate": 3.7335555555555557e-06,
11953
+ "loss": 2.4855,
11954
+ "step": 33200
11955
+ },
11956
+ {
11957
+ "epoch": 0.664,
11958
+ "eval_loss": 2.481595516204834,
11959
+ "eval_runtime": 42.0663,
11960
+ "eval_samples_per_second": 2.472,
11961
+ "eval_steps_per_second": 1.236,
11962
+ "step": 33200
11963
+ },
11964
+ {
11965
+ "epoch": 0.6645,
11966
+ "grad_norm": 1.5855882612813827,
11967
+ "learning_rate": 3.7280000000000006e-06,
11968
+ "loss": 2.4839,
11969
+ "step": 33225
11970
+ },
11971
+ {
11972
+ "epoch": 0.665,
11973
+ "grad_norm": 1.7981131055660284,
11974
+ "learning_rate": 3.7224444444444445e-06,
11975
+ "loss": 2.4831,
11976
+ "step": 33250
11977
+ },
11978
+ {
11979
+ "epoch": 0.6655,
11980
+ "grad_norm": 1.8893217376664102,
11981
+ "learning_rate": 3.7168888888888894e-06,
11982
+ "loss": 2.4689,
11983
+ "step": 33275
11984
+ },
11985
+ {
11986
+ "epoch": 0.666,
11987
+ "grad_norm": 1.5504407193892469,
11988
+ "learning_rate": 3.7113333333333333e-06,
11989
+ "loss": 2.4748,
11990
+ "step": 33300
11991
+ },
11992
+ {
11993
+ "epoch": 0.666,
11994
+ "eval_loss": 2.480543851852417,
11995
+ "eval_runtime": 42.0898,
11996
+ "eval_samples_per_second": 2.471,
11997
+ "eval_steps_per_second": 1.235,
11998
+ "step": 33300
11999
+ },
12000
+ {
12001
+ "epoch": 0.6665,
12002
+ "grad_norm": 1.7468260304964456,
12003
+ "learning_rate": 3.705777777777778e-06,
12004
+ "loss": 2.478,
12005
+ "step": 33325
12006
+ },
12007
+ {
12008
+ "epoch": 0.667,
12009
+ "grad_norm": 1.527950475107732,
12010
+ "learning_rate": 3.700222222222222e-06,
12011
+ "loss": 2.476,
12012
+ "step": 33350
12013
+ },
12014
+ {
12015
+ "epoch": 0.6675,
12016
+ "grad_norm": 1.6583388548480227,
12017
+ "learning_rate": 3.694666666666667e-06,
12018
+ "loss": 2.4675,
12019
+ "step": 33375
12020
+ },
12021
+ {
12022
+ "epoch": 0.668,
12023
+ "grad_norm": 1.5937276979972617,
12024
+ "learning_rate": 3.689111111111112e-06,
12025
+ "loss": 2.471,
12026
+ "step": 33400
12027
+ },
12028
+ {
12029
+ "epoch": 0.668,
12030
+ "eval_loss": 2.48046875,
12031
+ "eval_runtime": 41.9774,
12032
+ "eval_samples_per_second": 2.478,
12033
+ "eval_steps_per_second": 1.239,
12034
+ "step": 33400
12035
+ },
12036
+ {
12037
+ "epoch": 0.6685,
12038
+ "grad_norm": 1.4618192489732302,
12039
+ "learning_rate": 3.6835555555555558e-06,
12040
+ "loss": 2.4687,
12041
+ "step": 33425
12042
+ },
12043
+ {
12044
+ "epoch": 0.669,
12045
+ "grad_norm": 1.686028538338107,
12046
+ "learning_rate": 3.6780000000000006e-06,
12047
+ "loss": 2.4859,
12048
+ "step": 33450
12049
+ },
12050
+ {
12051
+ "epoch": 0.6695,
12052
+ "grad_norm": 1.352022736772511,
12053
+ "learning_rate": 3.6724444444444446e-06,
12054
+ "loss": 2.4611,
12055
+ "step": 33475
12056
+ },
12057
+ {
12058
+ "epoch": 0.67,
12059
+ "grad_norm": 1.9063367987545683,
12060
+ "learning_rate": 3.6668888888888894e-06,
12061
+ "loss": 2.4724,
12062
+ "step": 33500
12063
+ },
12064
+ {
12065
+ "epoch": 0.67,
12066
+ "eval_loss": 2.480919361114502,
12067
+ "eval_runtime": 42.1121,
12068
+ "eval_samples_per_second": 2.47,
12069
+ "eval_steps_per_second": 1.235,
12070
+ "step": 33500
12071
  }
12072
  ],
12073
  "logging_steps": 25,
 
12087
  "attributes": {}
12088
  }
12089
  },
12090
+ "total_flos": 7.518134864593918e+19,
12091
  "train_batch_size": 1,
12092
  "trial_name": null,
12093
  "trial_params": null