Uploaded checkpoint-4000

Browse files

Files changed (5) hide show

model.safetensors +1 -1
optimizer.pt +1 -1
rng_state.pth +1 -1
scheduler.pt +1 -1
trainer_state.json +711 -3

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e80a771a902651e246b193fcf37986946f6a5ab021798cf9a01c65b71035adaa
 size 2692969128

 version https://git-lfs.github.com/spec/v1
+oid sha256:11b12a5db03b21601ef2fbf830ece39543cf496c2b384f79ea8cd1e13f05c681
 size 2692969128

optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d19439958f10e6b8bd2d75773a92fac0b59e303294313bf5a29059f4d0a3be3d
 size 5386075202

 version https://git-lfs.github.com/spec/v1
+oid sha256:d0bc7be545c71698aed05f3fc1be12a4ed5c0a5ce82ef2cdbed93d83ab3fd6e8
 size 5386075202

rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ee3528bf0ace792176d57cac1ea8e325db1e81a8856e3e8a6e53688b51f9516e
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:1c92a95a97d689d636b085d406167a1d143dce26fb83ee64d21cf4b37a120302
 size 14244

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:770db92ac44ccb712216aece2abb8a41e68fd6d952c7ae7884e9032fb3cc3f81
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:2f80b0441e18382140898e5947e4bf00161c8985bfd13094069daa8dad861cc8
 size 1064

trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "best_metric": 0.01529085636138916,
   "best_model_checkpoint": "runs/deepseek_CMU-AIR2/math-deepseek-FULL-ArithHard-30k_20240424-195522/checkpoint-3000",
-  "epoch": 0.9399232396020991,
   "eval_steps": 1000,
-  "global_step": 3000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -2131,6 +2131,714 @@
       "eval_samples_per_second": 18.675,
       "eval_steps_per_second": 18.675,
       "step": 3000
     }
   ],
   "logging_steps": 10,
@@ -2138,7 +2846,7 @@
   "num_input_tokens_seen": 0,
   "num_train_epochs": 2,
   "save_steps": 1000,
-  "total_flos": 4.7201094991872e+16,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": 0.01529085636138916,
   "best_model_checkpoint": "runs/deepseek_CMU-AIR2/math-deepseek-FULL-ArithHard-30k_20240424-195522/checkpoint-3000",
+  "epoch": 1.2532309861361322,
   "eval_steps": 1000,
+  "global_step": 4000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 18.675,
       "eval_steps_per_second": 18.675,
       "step": 3000
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 9.875,
+      "learning_rate": 8.844444444444445e-06,
+      "loss": 0.1394,
+      "step": 3010
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.0013885498046875,
+      "learning_rate": 8.8e-06,
+      "loss": 0.0398,
+      "step": 3020
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.0028228759765625,
+      "learning_rate": 8.755555555555556e-06,
+      "loss": 0.0174,
+      "step": 3030
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 14.6875,
+      "learning_rate": 8.711111111111111e-06,
+      "loss": 0.0902,
+      "step": 3040
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.005157470703125,
+      "learning_rate": 8.666666666666668e-06,
+      "loss": 0.0367,
+      "step": 3050
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.005157470703125,
+      "learning_rate": 8.622222222222223e-06,
+      "loss": 0.0494,
+      "step": 3060
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 14.5,
+      "learning_rate": 8.577777777777778e-06,
+      "loss": 0.0721,
+      "step": 3070
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.0014190673828125,
+      "learning_rate": 8.533333333333335e-06,
+      "loss": 0.0439,
+      "step": 3080
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 15.1875,
+      "learning_rate": 8.48888888888889e-06,
+      "loss": 0.0115,
+      "step": 3090
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.00135040283203125,
+      "learning_rate": 8.444444444444446e-06,
+      "loss": 0.0594,
+      "step": 3100
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.0052490234375,
+      "learning_rate": 8.400000000000001e-06,
+      "loss": 0.1152,
+      "step": 3110
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.033935546875,
+      "learning_rate": 8.355555555555556e-06,
+      "loss": 0.0279,
+      "step": 3120
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.0023345947265625,
+      "learning_rate": 8.311111111111111e-06,
+      "loss": 0.0406,
+      "step": 3130
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.00182342529296875,
+      "learning_rate": 8.266666666666667e-06,
+      "loss": 0.0773,
+      "step": 3140
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 4.8125,
+      "learning_rate": 8.222222222222222e-06,
+      "loss": 0.014,
+      "step": 3150
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.004913330078125,
+      "learning_rate": 8.177777777777779e-06,
+      "loss": 0.0279,
+      "step": 3160
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.00148773193359375,
+      "learning_rate": 8.133333333333334e-06,
+      "loss": 0.0097,
+      "step": 3170
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.00142669677734375,
+      "learning_rate": 8.08888888888889e-06,
+      "loss": 0.0592,
+      "step": 3180
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.0015411376953125,
+      "learning_rate": 8.044444444444444e-06,
+      "loss": 0.0246,
+      "step": 3190
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 3.9375,
+      "learning_rate": 8.000000000000001e-06,
+      "loss": 0.007,
+      "step": 3200
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 6.84375,
+      "learning_rate": 7.955555555555557e-06,
+      "loss": 0.0145,
+      "step": 3210
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 13.875,
+      "learning_rate": 7.911111111111112e-06,
+      "loss": 0.0614,
+      "step": 3220
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 0.0013427734375,
+      "learning_rate": 7.866666666666667e-06,
+      "loss": 0.0126,
+      "step": 3230
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 2.703125,
+      "learning_rate": 7.822222222222224e-06,
+      "loss": 0.0077,
+      "step": 3240
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.005218505859375,
+      "learning_rate": 7.77777777777778e-06,
+      "loss": 0.0156,
+      "step": 3250
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 7.733333333333334e-06,
+      "loss": 0.0201,
+      "step": 3260
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 6.5625,
+      "learning_rate": 7.68888888888889e-06,
+      "loss": 0.0444,
+      "step": 3270
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 0.0016326904296875,
+      "learning_rate": 7.644444444444445e-06,
+      "loss": 0.0145,
+      "step": 3280
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 0.0021514892578125,
+      "learning_rate": 7.600000000000001e-06,
+      "loss": 0.0001,
+      "step": 3290
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 0.0027923583984375,
+      "learning_rate": 7.555555555555556e-06,
+      "loss": 0.0075,
+      "step": 3300
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 7.5625,
+      "learning_rate": 7.511111111111111e-06,
+      "loss": 0.023,
+      "step": 3310
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.0013885498046875,
+      "learning_rate": 7.4666666666666675e-06,
+      "loss": 0.0159,
+      "step": 3320
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 1.09375,
+      "learning_rate": 7.422222222222223e-06,
+      "loss": 0.1023,
+      "step": 3330
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 14.1875,
+      "learning_rate": 7.377777777777778e-06,
+      "loss": 0.0436,
+      "step": 3340
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 0.004974365234375,
+      "learning_rate": 7.333333333333333e-06,
+      "loss": 0.0197,
+      "step": 3350
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 0.00147247314453125,
+      "learning_rate": 7.28888888888889e-06,
+      "loss": 0.0183,
+      "step": 3360
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.00150299072265625,
+      "learning_rate": 7.244444444444445e-06,
+      "loss": 0.0529,
+      "step": 3370
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.00518798828125,
+      "learning_rate": 7.2000000000000005e-06,
+      "loss": 0.0733,
+      "step": 3380
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.88671875,
+      "learning_rate": 7.155555555555556e-06,
+      "loss": 0.0076,
+      "step": 3390
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 0.00157928466796875,
+      "learning_rate": 7.111111111111112e-06,
+      "loss": 0.0099,
+      "step": 3400
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 5.625,
+      "learning_rate": 7.066666666666667e-06,
+      "loss": 0.0174,
+      "step": 3410
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 0.00112152099609375,
+      "learning_rate": 7.022222222222222e-06,
+      "loss": 0.0021,
+      "step": 3420
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 0.0013885498046875,
+      "learning_rate": 6.977777777777779e-06,
+      "loss": 0.0449,
+      "step": 3430
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.0024871826171875,
+      "learning_rate": 6.9333333333333344e-06,
+      "loss": 0.0369,
+      "step": 3440
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.004974365234375,
+      "learning_rate": 6.88888888888889e-06,
+      "loss": 0.0058,
+      "step": 3450
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.0011749267578125,
+      "learning_rate": 6.844444444444445e-06,
+      "loss": 0.0166,
+      "step": 3460
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 0.474609375,
+      "learning_rate": 6.800000000000001e-06,
+      "loss": 0.0177,
+      "step": 3470
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 7.28125,
+      "learning_rate": 6.755555555555556e-06,
+      "loss": 0.0477,
+      "step": 3480
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 7.84375,
+      "learning_rate": 6.711111111111111e-06,
+      "loss": 0.0404,
+      "step": 3490
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.00115203857421875,
+      "learning_rate": 6.666666666666667e-06,
+      "loss": 0.0348,
+      "step": 3500
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.00537109375,
+      "learning_rate": 6.6222222222222236e-06,
+      "loss": 0.0541,
+      "step": 3510
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.00139617919921875,
+      "learning_rate": 6.577777777777779e-06,
+      "loss": 0.0136,
+      "step": 3520
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 2.453125,
+      "learning_rate": 6.533333333333334e-06,
+      "loss": 0.0404,
+      "step": 3530
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 0.005218505859375,
+      "learning_rate": 6.488888888888889e-06,
+      "loss": 0.0226,
+      "step": 3540
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 0.00135040283203125,
+      "learning_rate": 6.444444444444445e-06,
+      "loss": 0.0049,
+      "step": 3550
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.001251220703125,
+      "learning_rate": 6.4000000000000006e-06,
+      "loss": 0.0419,
+      "step": 3560
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.0012054443359375,
+      "learning_rate": 6.355555555555556e-06,
+      "loss": 0.0223,
+      "step": 3570
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.0106201171875,
+      "learning_rate": 6.311111111111111e-06,
+      "loss": 0.0229,
+      "step": 3580
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.0024261474609375,
+      "learning_rate": 6.266666666666668e-06,
+      "loss": 0.0099,
+      "step": 3590
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 7.96875,
+      "learning_rate": 6.222222222222223e-06,
+      "loss": 0.0291,
+      "step": 3600
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 0.00225830078125,
+      "learning_rate": 6.177777777777778e-06,
+      "loss": 0.0,
+      "step": 3610
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 0.00124359130859375,
+      "learning_rate": 6.133333333333334e-06,
+      "loss": 0.02,
+      "step": 3620
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 11.875,
+      "learning_rate": 6.08888888888889e-06,
+      "loss": 0.0115,
+      "step": 3630
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 0.0291748046875,
+      "learning_rate": 6.044444444444445e-06,
+      "loss": 0.0363,
+      "step": 3640
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 15.4375,
+      "learning_rate": 6e-06,
+      "loss": 0.034,
+      "step": 3650
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 9.3125,
+      "learning_rate": 5.955555555555555e-06,
+      "loss": 0.0423,
+      "step": 3660
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 1.9609375,
+      "learning_rate": 5.911111111111112e-06,
+      "loss": 0.0256,
+      "step": 3670
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 0.00102996826171875,
+      "learning_rate": 5.8666666666666675e-06,
+      "loss": 0.0369,
+      "step": 3680
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 4.5625,
+      "learning_rate": 5.822222222222223e-06,
+      "loss": 0.0705,
+      "step": 3690
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 5.125,
+      "learning_rate": 5.777777777777778e-06,
+      "loss": 0.0195,
+      "step": 3700
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 0.0022430419921875,
+      "learning_rate": 5.733333333333334e-06,
+      "loss": 0.032,
+      "step": 3710
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 0.0732421875,
+      "learning_rate": 5.688888888888889e-06,
+      "loss": 0.0796,
+      "step": 3720
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 0.000965118408203125,
+      "learning_rate": 5.6444444444444445e-06,
+      "loss": 0.0185,
+      "step": 3730
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 0.00099945068359375,
+      "learning_rate": 5.600000000000001e-06,
+      "loss": 0.0536,
+      "step": 3740
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 0.033203125,
+      "learning_rate": 5.555555555555557e-06,
+      "loss": 0.0247,
+      "step": 3750
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.00131988525390625,
+      "learning_rate": 5.511111111111112e-06,
+      "loss": 0.0206,
+      "step": 3760
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.0021514892578125,
+      "learning_rate": 5.466666666666667e-06,
+      "loss": 0.062,
+      "step": 3770
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.004913330078125,
+      "learning_rate": 5.422222222222223e-06,
+      "loss": 0.0049,
+      "step": 3780
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 0.06689453125,
+      "learning_rate": 5.3777777777777784e-06,
+      "loss": 0.0189,
+      "step": 3790
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 1.1796875,
+      "learning_rate": 5.333333333333334e-06,
+      "loss": 0.015,
+      "step": 3800
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 0.0015411376953125,
+      "learning_rate": 5.288888888888889e-06,
+      "loss": 0.0128,
+      "step": 3810
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.00103759765625,
+      "learning_rate": 5.244444444444445e-06,
+      "loss": 0.0301,
+      "step": 3820
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.0240478515625,
+      "learning_rate": 5.2e-06,
+      "loss": 0.0142,
+      "step": 3830
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.004852294921875,
+      "learning_rate": 5.155555555555556e-06,
+      "loss": 0.0221,
+      "step": 3840
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 0.001983642578125,
+      "learning_rate": 5.1111111111111115e-06,
+      "loss": 0.0001,
+      "step": 3850
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 2.625,
+      "learning_rate": 5.0666666666666676e-06,
+      "loss": 0.0297,
+      "step": 3860
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 0.004974365234375,
+      "learning_rate": 5.022222222222223e-06,
+      "loss": 0.0274,
+      "step": 3870
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 0.00121307373046875,
+      "learning_rate": 4.977777777777778e-06,
+      "loss": 0.0252,
+      "step": 3880
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 5.375,
+      "learning_rate": 4.933333333333334e-06,
+      "loss": 0.0097,
+      "step": 3890
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 3.140625,
+      "learning_rate": 4.888888888888889e-06,
+      "loss": 0.0514,
+      "step": 3900
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 0.00106048583984375,
+      "learning_rate": 4.8444444444444446e-06,
+      "loss": 0.0011,
+      "step": 3910
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 0.01080322265625,
+      "learning_rate": 4.800000000000001e-06,
+      "loss": 0.0516,
+      "step": 3920
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 5.28125,
+      "learning_rate": 4.755555555555556e-06,
+      "loss": 0.008,
+      "step": 3930
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 0.00179290771484375,
+      "learning_rate": 4.711111111111111e-06,
+      "loss": 0.0,
+      "step": 3940
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.0020599365234375,
+      "learning_rate": 4.666666666666667e-06,
+      "loss": 0.0178,
+      "step": 3950
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.00738525390625,
+      "learning_rate": 4.622222222222222e-06,
+      "loss": 0.0338,
+      "step": 3960
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.00139617919921875,
+      "learning_rate": 4.5777777777777785e-06,
+      "loss": 0.0596,
+      "step": 3970
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.00102996826171875,
+      "learning_rate": 4.533333333333334e-06,
+      "loss": 0.0197,
+      "step": 3980
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 2.515625,
+      "learning_rate": 4.488888888888889e-06,
+      "loss": 0.0032,
+      "step": 3990
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.0625,
+      "learning_rate": 4.444444444444444e-06,
+      "loss": 0.0176,
+      "step": 4000
+    },
+    {
+      "epoch": 1.25,
+      "eval_loss": 0.018360020592808723,
+      "eval_runtime": 53.5697,
+      "eval_samples_per_second": 18.667,
+      "eval_steps_per_second": 18.667,
+      "step": 4000
     }
   ],
   "logging_steps": 10,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 2,
   "save_steps": 1000,
+  "total_flos": 6.2934793322496e+16,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null