Uploaded checkpoint-4000

Browse files

Files changed (5) hide show

model.safetensors +1 -1
optimizer.pt +1 -1
rng_state.pth +1 -1
scheduler.pt +1 -1
trainer_state.json +711 -3

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7b7cb2f47cd38e6383f75f3414a8a23b1ecd4968b1bb9dfd8b1057ae5f8584b2
 size 2692969128

 version https://git-lfs.github.com/spec/v1
+oid sha256:3201017b6d299d8c2cb5eff8dfb87a841857ac5b48a904337a63577b57e72464
 size 2692969128

optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7a37508368a64e93757105266e012ac04351eb16612969f0665a75aeb13c0d8d
 size 5386075202

 version https://git-lfs.github.com/spec/v1
+oid sha256:0bc91b5189b6a58b603e0148037300250d28c3b0c30d72a7e486062f8ab49769
 size 5386075202

rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ee3528bf0ace792176d57cac1ea8e325db1e81a8856e3e8a6e53688b51f9516e
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:1c92a95a97d689d636b085d406167a1d143dce26fb83ee64d21cf4b37a120302
 size 14244

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:770db92ac44ccb712216aece2abb8a41e68fd6d952c7ae7884e9032fb3cc3f81
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:2f80b0441e18382140898e5947e4bf00161c8985bfd13094069daa8dad861cc8
 size 1064

trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "best_metric": 0.01655612699687481,
   "best_model_checkpoint": "runs/deepseek_20240423-162824/checkpoint-3000",
-  "epoch": 0.9399232396020991,
   "eval_steps": 1000,
-  "global_step": 3000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -2131,6 +2131,714 @@
       "eval_samples_per_second": 18.745,
       "eval_steps_per_second": 18.745,
       "step": 3000
     }
   ],
   "logging_steps": 10,
@@ -2138,7 +2846,7 @@
   "num_input_tokens_seen": 0,
   "num_train_epochs": 2,
   "save_steps": 1000,
-  "total_flos": 4.7201094991872e+16,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": 0.01655612699687481,
   "best_model_checkpoint": "runs/deepseek_20240423-162824/checkpoint-3000",
+  "epoch": 1.2532309861361322,
   "eval_steps": 1000,
+  "global_step": 4000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 18.745,
       "eval_steps_per_second": 18.745,
       "step": 3000
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 10.6875,
+      "learning_rate": 8.844444444444445e-06,
+      "loss": 0.137,
+      "step": 3010
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.00127410888671875,
+      "learning_rate": 8.8e-06,
+      "loss": 0.0456,
+      "step": 3020
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.003204345703125,
+      "learning_rate": 8.755555555555556e-06,
+      "loss": 0.0133,
+      "step": 3030
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 8.625,
+      "learning_rate": 8.711111111111111e-06,
+      "loss": 0.1079,
+      "step": 3040
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.004791259765625,
+      "learning_rate": 8.666666666666668e-06,
+      "loss": 0.0327,
+      "step": 3050
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.004150390625,
+      "learning_rate": 8.622222222222223e-06,
+      "loss": 0.0525,
+      "step": 3060
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 8.9375,
+      "learning_rate": 8.577777777777778e-06,
+      "loss": 0.0584,
+      "step": 3070
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.00131988525390625,
+      "learning_rate": 8.533333333333335e-06,
+      "loss": 0.0406,
+      "step": 3080
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 14.0,
+      "learning_rate": 8.48888888888889e-06,
+      "loss": 0.0195,
+      "step": 3090
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.001373291015625,
+      "learning_rate": 8.444444444444446e-06,
+      "loss": 0.0571,
+      "step": 3100
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.00469970703125,
+      "learning_rate": 8.400000000000001e-06,
+      "loss": 0.1144,
+      "step": 3110
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.03759765625,
+      "learning_rate": 8.355555555555556e-06,
+      "loss": 0.0206,
+      "step": 3120
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.0027618408203125,
+      "learning_rate": 8.311111111111111e-06,
+      "loss": 0.0463,
+      "step": 3130
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.00156402587890625,
+      "learning_rate": 8.266666666666667e-06,
+      "loss": 0.079,
+      "step": 3140
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 12.875,
+      "learning_rate": 8.222222222222222e-06,
+      "loss": 0.0177,
+      "step": 3150
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.004547119140625,
+      "learning_rate": 8.177777777777779e-06,
+      "loss": 0.0287,
+      "step": 3160
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.00136566162109375,
+      "learning_rate": 8.133333333333334e-06,
+      "loss": 0.0063,
+      "step": 3170
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.00140380859375,
+      "learning_rate": 8.08888888888889e-06,
+      "loss": 0.0575,
+      "step": 3180
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.00157928466796875,
+      "learning_rate": 8.044444444444444e-06,
+      "loss": 0.0281,
+      "step": 3190
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 4.34375,
+      "learning_rate": 8.000000000000001e-06,
+      "loss": 0.0165,
+      "step": 3200
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 1.2109375,
+      "learning_rate": 7.955555555555557e-06,
+      "loss": 0.0136,
+      "step": 3210
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 28.875,
+      "learning_rate": 7.911111111111112e-06,
+      "loss": 0.0679,
+      "step": 3220
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 0.001251220703125,
+      "learning_rate": 7.866666666666667e-06,
+      "loss": 0.0122,
+      "step": 3230
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 3.703125,
+      "learning_rate": 7.822222222222224e-06,
+      "loss": 0.0085,
+      "step": 3240
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.00482177734375,
+      "learning_rate": 7.77777777777778e-06,
+      "loss": 0.0195,
+      "step": 3250
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.06396484375,
+      "learning_rate": 7.733333333333334e-06,
+      "loss": 0.0234,
+      "step": 3260
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 4.71875,
+      "learning_rate": 7.68888888888889e-06,
+      "loss": 0.0451,
+      "step": 3270
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 0.00148773193359375,
+      "learning_rate": 7.644444444444445e-06,
+      "loss": 0.0098,
+      "step": 3280
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 0.002532958984375,
+      "learning_rate": 7.600000000000001e-06,
+      "loss": 0.0034,
+      "step": 3290
+    },
+    {
+      "epoch": 1.03,
+      "grad_norm": 0.0027923583984375,
+      "learning_rate": 7.555555555555556e-06,
+      "loss": 0.01,
+      "step": 3300
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 3.5,
+      "learning_rate": 7.511111111111111e-06,
+      "loss": 0.0269,
+      "step": 3310
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.00124359130859375,
+      "learning_rate": 7.4666666666666675e-06,
+      "loss": 0.018,
+      "step": 3320
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 8.75,
+      "learning_rate": 7.422222222222223e-06,
+      "loss": 0.1035,
+      "step": 3330
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 13.875,
+      "learning_rate": 7.377777777777778e-06,
+      "loss": 0.0368,
+      "step": 3340
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 0.004608154296875,
+      "learning_rate": 7.333333333333333e-06,
+      "loss": 0.0243,
+      "step": 3350
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 0.00146484375,
+      "learning_rate": 7.28888888888889e-06,
+      "loss": 0.0177,
+      "step": 3360
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.0014190673828125,
+      "learning_rate": 7.244444444444445e-06,
+      "loss": 0.0426,
+      "step": 3370
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 0.004913330078125,
+      "learning_rate": 7.2000000000000005e-06,
+      "loss": 0.0506,
+      "step": 3380
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 9.125,
+      "learning_rate": 7.155555555555556e-06,
+      "loss": 0.0113,
+      "step": 3390
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 0.0015716552734375,
+      "learning_rate": 7.111111111111112e-06,
+      "loss": 0.0075,
+      "step": 3400
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 9.9375,
+      "learning_rate": 7.066666666666667e-06,
+      "loss": 0.0188,
+      "step": 3410
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 0.001312255859375,
+      "learning_rate": 7.022222222222222e-06,
+      "loss": 0.0017,
+      "step": 3420
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 0.001251220703125,
+      "learning_rate": 6.977777777777779e-06,
+      "loss": 0.0339,
+      "step": 3430
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.00262451171875,
+      "learning_rate": 6.9333333333333344e-06,
+      "loss": 0.0326,
+      "step": 3440
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.004547119140625,
+      "learning_rate": 6.88888888888889e-06,
+      "loss": 0.0145,
+      "step": 3450
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.00107574462890625,
+      "learning_rate": 6.844444444444445e-06,
+      "loss": 0.0126,
+      "step": 3460
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 0.875,
+      "learning_rate": 6.800000000000001e-06,
+      "loss": 0.0141,
+      "step": 3470
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 7.59375,
+      "learning_rate": 6.755555555555556e-06,
+      "loss": 0.0301,
+      "step": 3480
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 7.75,
+      "learning_rate": 6.711111111111111e-06,
+      "loss": 0.0432,
+      "step": 3490
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.0010528564453125,
+      "learning_rate": 6.666666666666667e-06,
+      "loss": 0.0335,
+      "step": 3500
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.0048828125,
+      "learning_rate": 6.6222222222222236e-06,
+      "loss": 0.0616,
+      "step": 3510
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.001312255859375,
+      "learning_rate": 6.577777777777779e-06,
+      "loss": 0.0144,
+      "step": 3520
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 2.5,
+      "learning_rate": 6.533333333333334e-06,
+      "loss": 0.0415,
+      "step": 3530
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 0.004730224609375,
+      "learning_rate": 6.488888888888889e-06,
+      "loss": 0.0315,
+      "step": 3540
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 0.00128173828125,
+      "learning_rate": 6.444444444444445e-06,
+      "loss": 0.0051,
+      "step": 3550
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.00115966796875,
+      "learning_rate": 6.4000000000000006e-06,
+      "loss": 0.0411,
+      "step": 3560
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.0010528564453125,
+      "learning_rate": 6.355555555555556e-06,
+      "loss": 0.0271,
+      "step": 3570
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.00482177734375,
+      "learning_rate": 6.311111111111111e-06,
+      "loss": 0.0172,
+      "step": 3580
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.0027008056640625,
+      "learning_rate": 6.266666666666668e-06,
+      "loss": 0.0062,
+      "step": 3590
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 6.84375,
+      "learning_rate": 6.222222222222223e-06,
+      "loss": 0.0234,
+      "step": 3600
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 0.0023345947265625,
+      "learning_rate": 6.177777777777778e-06,
+      "loss": 0.0,
+      "step": 3610
+    },
+    {
+      "epoch": 1.13,
+      "grad_norm": 0.00115966796875,
+      "learning_rate": 6.133333333333334e-06,
+      "loss": 0.0194,
+      "step": 3620
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 6.08888888888889e-06,
+      "loss": 0.0088,
+      "step": 3630
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 0.203125,
+      "learning_rate": 6.044444444444445e-06,
+      "loss": 0.0339,
+      "step": 3640
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 15.25,
+      "learning_rate": 6e-06,
+      "loss": 0.0362,
+      "step": 3650
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 4.375,
+      "learning_rate": 5.955555555555555e-06,
+      "loss": 0.0335,
+      "step": 3660
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 3.5,
+      "learning_rate": 5.911111111111112e-06,
+      "loss": 0.0254,
+      "step": 3670
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 0.000949859619140625,
+      "learning_rate": 5.8666666666666675e-06,
+      "loss": 0.0173,
+      "step": 3680
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 5.125,
+      "learning_rate": 5.822222222222223e-06,
+      "loss": 0.0677,
+      "step": 3690
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 9.75,
+      "learning_rate": 5.777777777777778e-06,
+      "loss": 0.0162,
+      "step": 3700
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 0.002288818359375,
+      "learning_rate": 5.733333333333334e-06,
+      "loss": 0.0334,
+      "step": 3710
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 0.0230712890625,
+      "learning_rate": 5.688888888888889e-06,
+      "loss": 0.0673,
+      "step": 3720
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 0.0010986328125,
+      "learning_rate": 5.6444444444444445e-06,
+      "loss": 0.0189,
+      "step": 3730
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 0.00095367431640625,
+      "learning_rate": 5.600000000000001e-06,
+      "loss": 0.0493,
+      "step": 3740
+    },
+    {
+      "epoch": 1.17,
+      "grad_norm": 0.0093994140625,
+      "learning_rate": 5.555555555555557e-06,
+      "loss": 0.032,
+      "step": 3750
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.00119781494140625,
+      "learning_rate": 5.511111111111112e-06,
+      "loss": 0.0201,
+      "step": 3760
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.00225830078125,
+      "learning_rate": 5.466666666666667e-06,
+      "loss": 0.0928,
+      "step": 3770
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.0045166015625,
+      "learning_rate": 5.422222222222223e-06,
+      "loss": 0.0048,
+      "step": 3780
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 5.3777777777777784e-06,
+      "loss": 0.023,
+      "step": 3790
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 2.875,
+      "learning_rate": 5.333333333333334e-06,
+      "loss": 0.0134,
+      "step": 3800
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 0.00146484375,
+      "learning_rate": 5.288888888888889e-06,
+      "loss": 0.0063,
+      "step": 3810
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.001007080078125,
+      "learning_rate": 5.244444444444445e-06,
+      "loss": 0.0241,
+      "step": 3820
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.01483154296875,
+      "learning_rate": 5.2e-06,
+      "loss": 0.0124,
+      "step": 3830
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.00433349609375,
+      "learning_rate": 5.155555555555556e-06,
+      "loss": 0.0231,
+      "step": 3840
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 0.00201416015625,
+      "learning_rate": 5.1111111111111115e-06,
+      "loss": 0.0001,
+      "step": 3850
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 4.375,
+      "learning_rate": 5.0666666666666676e-06,
+      "loss": 0.0294,
+      "step": 3860
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 0.004547119140625,
+      "learning_rate": 5.022222222222223e-06,
+      "loss": 0.0379,
+      "step": 3870
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 0.0010223388671875,
+      "learning_rate": 4.977777777777778e-06,
+      "loss": 0.0284,
+      "step": 3880
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 4.8125,
+      "learning_rate": 4.933333333333334e-06,
+      "loss": 0.0131,
+      "step": 3890
+    },
+    {
+      "epoch": 1.22,
+      "grad_norm": 3.9375,
+      "learning_rate": 4.888888888888889e-06,
+      "loss": 0.0527,
+      "step": 3900
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 0.0010986328125,
+      "learning_rate": 4.8444444444444446e-06,
+      "loss": 0.0002,
+      "step": 3910
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 0.0035247802734375,
+      "learning_rate": 4.800000000000001e-06,
+      "loss": 0.0455,
+      "step": 3920
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 8.125,
+      "learning_rate": 4.755555555555556e-06,
+      "loss": 0.0104,
+      "step": 3930
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 0.00156402587890625,
+      "learning_rate": 4.711111111111111e-06,
+      "loss": 0.0,
+      "step": 3940
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.0026397705078125,
+      "learning_rate": 4.666666666666667e-06,
+      "loss": 0.0191,
+      "step": 3950
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.0037689208984375,
+      "learning_rate": 4.622222222222222e-06,
+      "loss": 0.0288,
+      "step": 3960
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.00135040283203125,
+      "learning_rate": 4.5777777777777785e-06,
+      "loss": 0.0519,
+      "step": 3970
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.00106048583984375,
+      "learning_rate": 4.533333333333334e-06,
+      "loss": 0.0175,
+      "step": 3980
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 7.0,
+      "learning_rate": 4.488888888888889e-06,
+      "loss": 0.0066,
+      "step": 3990
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.0308837890625,
+      "learning_rate": 4.444444444444444e-06,
+      "loss": 0.0253,
+      "step": 4000
+    },
+    {
+      "epoch": 1.25,
+      "eval_loss": 0.019900379702448845,
+      "eval_runtime": 53.4572,
+      "eval_samples_per_second": 18.707,
+      "eval_steps_per_second": 18.707,
+      "step": 4000
     }
   ],
   "logging_steps": 10,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 2,
   "save_steps": 1000,
+  "total_flos": 6.2934793322496e+16,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null