Training in progress, step 500, checkpoint

Browse files

Files changed (6) hide show

last-checkpoint/model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +37 -661
last-checkpoint/training_args.bin +1 -1

last-checkpoint/model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e0b208bca8c38b9a486fdd9848f4c1f128bf3a9e0dd7c14795016400d7b156f9
 size 577789320

 version https://git-lfs.github.com/spec/v1
+oid sha256:eed2c86bc2c40a1659071d1ab121cc1614d73dba8e8d412c17f25aa20274aa3b
 size 577789320

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cac6a52c28a4ed955005016fab85cda9f346342c5c6edaf668270626ccece527
 size 1155772233

 version https://git-lfs.github.com/spec/v1
+oid sha256:42e92a45af5353c492dbd93e2c4132cb07d4381b943c66b0c7fc1620231e6a8f
 size 1155772233

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5effaff3153159d29c8fd9780ecce06ed2dc4f38caf2bd0d61af35ad7d99b03a
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:22229bc8272370caff8e1f2fa838a639867ecbcd2cf7b0c1722f97dc0bc4d3f7
 size 14244

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9becdbdb0c73597ccfbdbc6b4341353e796af8282a1a374bd120ee6357f11761
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:2a1a187666ea0e44f9d015f844e1601f5b4c6844588e1b362a3c9b6a7527a74f
 size 1064

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,714 +1,90 @@
 {
-  "best_metric": 0.4142945408821106,
-  "best_model_checkpoint": "mikhail_panzo/fil_b64_le4_s8000/checkpoint-2500",
-  "epoch": 200.0,
   "eval_steps": 500,
-  "global_step": 4500,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 2.2222222222222223,
-      "grad_norm": 3.1291983127593994,
       "learning_rate": 2.5e-06,
-      "loss": 0.7845,
       "step": 50
     },
     {
       "epoch": 4.444444444444445,
-      "grad_norm": 1.4106616973876953,
       "learning_rate": 5e-06,
-      "loss": 0.7033,
       "step": 100
     },
     {
       "epoch": 6.666666666666667,
-      "grad_norm": 3.4416260719299316,
       "learning_rate": 7.5e-06,
-      "loss": 0.6535,
       "step": 150
     },
     {
       "epoch": 8.88888888888889,
-      "grad_norm": 1.7668957710266113,
-      "learning_rate": 1e-05,
-      "loss": 0.5726,
       "step": 200
     },
     {
       "epoch": 11.11111111111111,
-      "grad_norm": 1.3341814279556274,
-      "learning_rate": 1.25e-05,
-      "loss": 0.5312,
       "step": 250
     },
     {
       "epoch": 13.333333333333334,
-      "grad_norm": 1.4352439641952515,
-      "learning_rate": 1.5e-05,
-      "loss": 0.5094,
       "step": 300
     },
     {
       "epoch": 15.555555555555555,
-      "grad_norm": 1.0031296014785767,
-      "learning_rate": 1.75e-05,
-      "loss": 0.495,
       "step": 350
     },
     {
       "epoch": 17.77777777777778,
-      "grad_norm": 3.518950939178467,
-      "learning_rate": 2e-05,
-      "loss": 0.489,
       "step": 400
     },
     {
       "epoch": 20.0,
-      "grad_norm": 2.1400034427642822,
-      "learning_rate": 2.25e-05,
-      "loss": 0.4816,
       "step": 450
     },
     {
       "epoch": 22.22222222222222,
-      "grad_norm": 1.7150920629501343,
-      "learning_rate": 2.5e-05,
-      "loss": 0.4725,
       "step": 500
     },
     {
       "epoch": 22.22222222222222,
-      "eval_loss": 0.4371508061885834,
-      "eval_runtime": 8.219,
-      "eval_samples_per_second": 19.345,
-      "eval_steps_per_second": 2.433,
       "step": 500
-    },
-    {
-      "epoch": 24.444444444444443,
-      "grad_norm": 1.6130620241165161,
-      "learning_rate": 2.7500000000000004e-05,
-      "loss": 0.4607,
-      "step": 550
-    },
-    {
-      "epoch": 26.666666666666668,
-      "grad_norm": 2.737618923187256,
-      "learning_rate": 3e-05,
-      "loss": 0.4634,
-      "step": 600
-    },
-    {
-      "epoch": 28.88888888888889,
-      "grad_norm": 3.0077686309814453,
-      "learning_rate": 3.2500000000000004e-05,
-      "loss": 0.4621,
-      "step": 650
-    },
-    {
-      "epoch": 31.11111111111111,
-      "grad_norm": 1.4074121713638306,
-      "learning_rate": 3.5e-05,
-      "loss": 0.4611,
-      "step": 700
-    },
-    {
-      "epoch": 33.333333333333336,
-      "grad_norm": 2.665407657623291,
-      "learning_rate": 3.7500000000000003e-05,
-      "loss": 0.4557,
-      "step": 750
-    },
-    {
-      "epoch": 35.55555555555556,
-      "grad_norm": 2.857210874557495,
-      "learning_rate": 4e-05,
-      "loss": 0.4493,
-      "step": 800
-    },
-    {
-      "epoch": 37.77777777777778,
-      "grad_norm": 1.7210990190505981,
-      "learning_rate": 4.25e-05,
-      "loss": 0.4489,
-      "step": 850
-    },
-    {
-      "epoch": 40.0,
-      "grad_norm": 1.4690616130828857,
-      "learning_rate": 4.5e-05,
-      "loss": 0.4494,
-      "step": 900
-    },
-    {
-      "epoch": 42.22222222222222,
-      "grad_norm": 1.6961876153945923,
-      "learning_rate": 4.75e-05,
-      "loss": 0.4451,
-      "step": 950
-    },
-    {
-      "epoch": 44.44444444444444,
-      "grad_norm": 1.8849211931228638,
-      "learning_rate": 5e-05,
-      "loss": 0.4415,
-      "step": 1000
-    },
-    {
-      "epoch": 44.44444444444444,
-      "eval_loss": 0.4203202724456787,
-      "eval_runtime": 8.7725,
-      "eval_samples_per_second": 18.125,
-      "eval_steps_per_second": 2.28,
-      "step": 1000
-    },
-    {
-      "epoch": 46.666666666666664,
-      "grad_norm": 4.348108768463135,
-      "learning_rate": 5.25e-05,
-      "loss": 0.4444,
-      "step": 1050
-    },
-    {
-      "epoch": 48.888888888888886,
-      "grad_norm": 2.2666618824005127,
-      "learning_rate": 5.500000000000001e-05,
-      "loss": 0.436,
-      "step": 1100
-    },
-    {
-      "epoch": 51.111111111111114,
-      "grad_norm": 2.367915630340576,
-      "learning_rate": 5.7499999999999995e-05,
-      "loss": 0.4358,
-      "step": 1150
-    },
-    {
-      "epoch": 53.333333333333336,
-      "grad_norm": 1.8919609785079956,
-      "learning_rate": 6e-05,
-      "loss": 0.432,
-      "step": 1200
-    },
-    {
-      "epoch": 55.55555555555556,
-      "grad_norm": 2.736358642578125,
-      "learning_rate": 6.25e-05,
-      "loss": 0.4373,
-      "step": 1250
-    },
-    {
-      "epoch": 57.77777777777778,
-      "grad_norm": 2.9132068157196045,
-      "learning_rate": 6.500000000000001e-05,
-      "loss": 0.4317,
-      "step": 1300
-    },
-    {
-      "epoch": 60.0,
-      "grad_norm": 1.5804557800292969,
-      "learning_rate": 6.750000000000001e-05,
-      "loss": 0.4288,
-      "step": 1350
-    },
-    {
-      "epoch": 62.22222222222222,
-      "grad_norm": 2.7998554706573486,
-      "learning_rate": 7e-05,
-      "loss": 0.4291,
-      "step": 1400
-    },
-    {
-      "epoch": 64.44444444444444,
-      "grad_norm": 1.1768637895584106,
-      "learning_rate": 7.25e-05,
-      "loss": 0.4183,
-      "step": 1450
-    },
-    {
-      "epoch": 66.66666666666667,
-      "grad_norm": 1.888795018196106,
-      "learning_rate": 7.500000000000001e-05,
-      "loss": 0.423,
-      "step": 1500
-    },
-    {
-      "epoch": 66.66666666666667,
-      "eval_loss": 0.4168904721736908,
-      "eval_runtime": 8.2236,
-      "eval_samples_per_second": 19.335,
-      "eval_steps_per_second": 2.432,
-      "step": 1500
-    },
-    {
-      "epoch": 68.88888888888889,
-      "grad_norm": 1.5093848705291748,
-      "learning_rate": 7.75e-05,
-      "loss": 0.4214,
-      "step": 1550
-    },
-    {
-      "epoch": 71.11111111111111,
-      "grad_norm": 3.0897819995880127,
-      "learning_rate": 8e-05,
-      "loss": 0.4217,
-      "step": 1600
-    },
-    {
-      "epoch": 73.33333333333333,
-      "grad_norm": 4.435822486877441,
-      "learning_rate": 8.25e-05,
-      "loss": 0.4115,
-      "step": 1650
-    },
-    {
-      "epoch": 75.55555555555556,
-      "grad_norm": 1.9570446014404297,
-      "learning_rate": 8.5e-05,
-      "loss": 0.414,
-      "step": 1700
-    },
-    {
-      "epoch": 77.77777777777777,
-      "grad_norm": 2.8186404705047607,
-      "learning_rate": 8.75e-05,
-      "loss": 0.418,
-      "step": 1750
-    },
-    {
-      "epoch": 80.0,
-      "grad_norm": 5.278728485107422,
-      "learning_rate": 9e-05,
-      "loss": 0.4128,
-      "step": 1800
-    },
-    {
-      "epoch": 82.22222222222223,
-      "grad_norm": 2.285869598388672,
-      "learning_rate": 9.250000000000001e-05,
-      "loss": 0.4144,
-      "step": 1850
-    },
-    {
-      "epoch": 84.44444444444444,
-      "grad_norm": 2.994182586669922,
-      "learning_rate": 9.5e-05,
-      "loss": 0.4105,
-      "step": 1900
-    },
-    {
-      "epoch": 86.66666666666667,
-      "grad_norm": 2.9983792304992676,
-      "learning_rate": 9.75e-05,
-      "loss": 0.414,
-      "step": 1950
-    },
-    {
-      "epoch": 88.88888888888889,
-      "grad_norm": 3.550610303878784,
-      "learning_rate": 0.0001,
-      "loss": 0.4108,
-      "step": 2000
-    },
-    {
-      "epoch": 88.88888888888889,
-      "eval_loss": 0.4182928204536438,
-      "eval_runtime": 7.7764,
-      "eval_samples_per_second": 20.446,
-      "eval_steps_per_second": 2.572,
-      "step": 2000
-    },
-    {
-      "epoch": 91.11111111111111,
-      "grad_norm": 2.4401373863220215,
-      "learning_rate": 9.916666666666667e-05,
-      "loss": 0.4159,
-      "step": 2050
-    },
-    {
-      "epoch": 93.33333333333333,
-      "grad_norm": 4.816738128662109,
-      "learning_rate": 9.835e-05,
-      "loss": 0.4109,
-      "step": 2100
-    },
-    {
-      "epoch": 95.55555555555556,
-      "grad_norm": 3.018982410430908,
-      "learning_rate": 9.751666666666666e-05,
-      "loss": 0.4072,
-      "step": 2150
-    },
-    {
-      "epoch": 97.77777777777777,
-      "grad_norm": 1.719527006149292,
-      "learning_rate": 9.668333333333334e-05,
-      "loss": 0.4042,
-      "step": 2200
-    },
-    {
-      "epoch": 100.0,
-      "grad_norm": 1.9012302160263062,
-      "learning_rate": 9.585000000000001e-05,
-      "loss": 0.4023,
-      "step": 2250
-    },
-    {
-      "epoch": 102.22222222222223,
-      "grad_norm": 3.062488317489624,
-      "learning_rate": 9.501666666666668e-05,
-      "loss": 0.406,
-      "step": 2300
-    },
-    {
-      "epoch": 104.44444444444444,
-      "grad_norm": 2.6211459636688232,
-      "learning_rate": 9.418333333333334e-05,
-      "loss": 0.405,
-      "step": 2350
-    },
-    {
-      "epoch": 106.66666666666667,
-      "grad_norm": 1.7554346323013306,
-      "learning_rate": 9.335e-05,
-      "loss": 0.4052,
-      "step": 2400
-    },
-    {
-      "epoch": 108.88888888888889,
-      "grad_norm": 1.7569692134857178,
-      "learning_rate": 9.251666666666667e-05,
-      "loss": 0.3979,
-      "step": 2450
-    },
-    {
-      "epoch": 111.11111111111111,
-      "grad_norm": 0.9594977498054504,
-      "learning_rate": 9.168333333333333e-05,
-      "loss": 0.396,
-      "step": 2500
-    },
-    {
-      "epoch": 111.11111111111111,
-      "eval_loss": 0.4142945408821106,
-      "eval_runtime": 8.1333,
-      "eval_samples_per_second": 19.549,
-      "eval_steps_per_second": 2.459,
-      "step": 2500
-    },
-    {
-      "epoch": 113.33333333333333,
-      "grad_norm": 2.79782772064209,
-      "learning_rate": 9.085e-05,
-      "loss": 0.3945,
-      "step": 2550
-    },
-    {
-      "epoch": 115.55555555555556,
-      "grad_norm": 2.9776482582092285,
-      "learning_rate": 9.001666666666667e-05,
-      "loss": 0.3901,
-      "step": 2600
-    },
-    {
-      "epoch": 117.77777777777777,
-      "grad_norm": 1.6582765579223633,
-      "learning_rate": 8.918333333333334e-05,
-      "loss": 0.3897,
-      "step": 2650
-    },
-    {
-      "epoch": 120.0,
-      "grad_norm": 2.3238115310668945,
-      "learning_rate": 8.834999999999999e-05,
-      "loss": 0.3946,
-      "step": 2700
-    },
-    {
-      "epoch": 122.22222222222223,
-      "grad_norm": 1.783127784729004,
-      "learning_rate": 8.751666666666668e-05,
-      "loss": 0.3956,
-      "step": 2750
-    },
-    {
-      "epoch": 124.44444444444444,
-      "grad_norm": 1.8057986497879028,
-      "learning_rate": 8.668333333333334e-05,
-      "loss": 0.3884,
-      "step": 2800
-    },
-    {
-      "epoch": 126.66666666666667,
-      "grad_norm": 1.6317520141601562,
-      "learning_rate": 8.585000000000001e-05,
-      "loss": 0.3895,
-      "step": 2850
-    },
-    {
-      "epoch": 128.88888888888889,
-      "grad_norm": 1.802009105682373,
-      "learning_rate": 8.501666666666667e-05,
-      "loss": 0.3944,
-      "step": 2900
-    },
-    {
-      "epoch": 131.11111111111111,
-      "grad_norm": 2.605529308319092,
-      "learning_rate": 8.418333333333334e-05,
-      "loss": 0.3951,
-      "step": 2950
-    },
-    {
-      "epoch": 133.33333333333334,
-      "grad_norm": 1.2800849676132202,
-      "learning_rate": 8.335e-05,
-      "loss": 0.3883,
-      "step": 3000
-    },
-    {
-      "epoch": 133.33333333333334,
-      "eval_loss": 0.4183387756347656,
-      "eval_runtime": 9.0098,
-      "eval_samples_per_second": 17.647,
-      "eval_steps_per_second": 2.22,
-      "step": 3000
-    },
-    {
-      "epoch": 135.55555555555554,
-      "grad_norm": 1.146140456199646,
-      "learning_rate": 8.251666666666668e-05,
-      "loss": 0.3853,
-      "step": 3050
-    },
-    {
-      "epoch": 137.77777777777777,
-      "grad_norm": 1.3666647672653198,
-      "learning_rate": 8.168333333333333e-05,
-      "loss": 0.3815,
-      "step": 3100
-    },
-    {
-      "epoch": 140.0,
-      "grad_norm": 1.4111328125,
-      "learning_rate": 8.085e-05,
-      "loss": 0.3831,
-      "step": 3150
-    },
-    {
-      "epoch": 142.22222222222223,
-      "grad_norm": 3.635448455810547,
-      "learning_rate": 8.001666666666667e-05,
-      "loss": 0.3838,
-      "step": 3200
-    },
-    {
-      "epoch": 144.44444444444446,
-      "grad_norm": 1.2508800029754639,
-      "learning_rate": 7.918333333333334e-05,
-      "loss": 0.3805,
-      "step": 3250
-    },
-    {
-      "epoch": 146.66666666666666,
-      "grad_norm": 1.908144235610962,
-      "learning_rate": 7.835000000000001e-05,
-      "loss": 0.3802,
-      "step": 3300
-    },
-    {
-      "epoch": 148.88888888888889,
-      "grad_norm": 1.8435084819793701,
-      "learning_rate": 7.751666666666668e-05,
-      "loss": 0.3764,
-      "step": 3350
-    },
-    {
-      "epoch": 151.11111111111111,
-      "grad_norm": 2.1144354343414307,
-      "learning_rate": 7.668333333333335e-05,
-      "loss": 0.3781,
-      "step": 3400
-    },
-    {
-      "epoch": 153.33333333333334,
-      "grad_norm": 2.9214673042297363,
-      "learning_rate": 7.585e-05,
-      "loss": 0.3766,
-      "step": 3450
-    },
-    {
-      "epoch": 155.55555555555554,
-      "grad_norm": 1.5731481313705444,
-      "learning_rate": 7.501666666666667e-05,
-      "loss": 0.3771,
-      "step": 3500
-    },
-    {
-      "epoch": 155.55555555555554,
-      "eval_loss": 0.42192450165748596,
-      "eval_runtime": 8.6123,
-      "eval_samples_per_second": 18.462,
-      "eval_steps_per_second": 2.322,
-      "step": 3500
-    },
-    {
-      "epoch": 157.77777777777777,
-      "grad_norm": 1.3311264514923096,
-      "learning_rate": 7.418333333333334e-05,
-      "loss": 0.3807,
-      "step": 3550
-    },
-    {
-      "epoch": 160.0,
-      "grad_norm": 1.186661720275879,
-      "learning_rate": 7.335000000000001e-05,
-      "loss": 0.3768,
-      "step": 3600
-    },
-    {
-      "epoch": 162.22222222222223,
-      "grad_norm": 1.4181159734725952,
-      "learning_rate": 7.251666666666666e-05,
-      "loss": 0.3735,
-      "step": 3650
-    },
-    {
-      "epoch": 164.44444444444446,
-      "grad_norm": 2.7172157764434814,
-      "learning_rate": 7.168333333333333e-05,
-      "loss": 0.3741,
-      "step": 3700
-    },
-    {
-      "epoch": 166.66666666666666,
-      "grad_norm": 1.6093742847442627,
-      "learning_rate": 7.085e-05,
-      "loss": 0.3721,
-      "step": 3750
-    },
-    {
-      "epoch": 168.88888888888889,
-      "grad_norm": 1.3624290227890015,
-      "learning_rate": 7.001666666666667e-05,
-      "loss": 0.37,
-      "step": 3800
-    },
-    {
-      "epoch": 171.11111111111111,
-      "grad_norm": 1.287617564201355,
-      "learning_rate": 6.918333333333334e-05,
-      "loss": 0.3655,
-      "step": 3850
-    },
-    {
-      "epoch": 173.33333333333334,
-      "grad_norm": 1.69022536277771,
-      "learning_rate": 6.835000000000001e-05,
-      "loss": 0.37,
-      "step": 3900
-    },
-    {
-      "epoch": 175.55555555555554,
-      "grad_norm": 2.1261706352233887,
-      "learning_rate": 6.751666666666668e-05,
-      "loss": 0.3736,
-      "step": 3950
-    },
-    {
-      "epoch": 177.77777777777777,
-      "grad_norm": 1.6689108610153198,
-      "learning_rate": 6.668333333333333e-05,
-      "loss": 0.3731,
-      "step": 4000
-    },
-    {
-      "epoch": 177.77777777777777,
-      "eval_loss": 0.4211507737636566,
-      "eval_runtime": 8.6906,
-      "eval_samples_per_second": 18.296,
-      "eval_steps_per_second": 2.301,
-      "step": 4000
-    },
-    {
-      "epoch": 180.0,
-      "grad_norm": 2.798970937728882,
-      "learning_rate": 6.585e-05,
-      "loss": 0.3657,
-      "step": 4050
-    },
-    {
-      "epoch": 182.22222222222223,
-      "grad_norm": 1.2629871368408203,
-      "learning_rate": 6.501666666666667e-05,
-      "loss": 0.3666,
-      "step": 4100
-    },
-    {
-      "epoch": 184.44444444444446,
-      "grad_norm": 0.970268964767456,
-      "learning_rate": 6.418333333333334e-05,
-      "loss": 0.3668,
-      "step": 4150
-    },
-    {
-      "epoch": 186.66666666666666,
-      "grad_norm": 1.7865740060806274,
-      "learning_rate": 6.335e-05,
-      "loss": 0.364,
-      "step": 4200
-    },
-    {
-      "epoch": 188.88888888888889,
-      "grad_norm": 0.8204932808876038,
-      "learning_rate": 6.251666666666666e-05,
-      "loss": 0.3647,
-      "step": 4250
-    },
-    {
-      "epoch": 191.11111111111111,
-      "grad_norm": 0.910372257232666,
-      "learning_rate": 6.168333333333333e-05,
-      "loss": 0.3676,
-      "step": 4300
-    },
-    {
-      "epoch": 193.33333333333334,
-      "grad_norm": 1.2381336688995361,
-      "learning_rate": 6.085000000000001e-05,
-      "loss": 0.3632,
-      "step": 4350
-    },
-    {
-      "epoch": 195.55555555555554,
-      "grad_norm": 1.588180661201477,
-      "learning_rate": 6.0016666666666664e-05,
-      "loss": 0.3621,
-      "step": 4400
-    },
-    {
-      "epoch": 197.77777777777777,
-      "grad_norm": 1.093023419380188,
-      "learning_rate": 5.918333333333333e-05,
-      "loss": 0.3609,
-      "step": 4450
-    },
-    {
-      "epoch": 200.0,
-      "grad_norm": 1.071207046508789,
-      "learning_rate": 5.835e-05,
-      "loss": 0.362,
-      "step": 4500
-    },
-    {
-      "epoch": 200.0,
-      "eval_loss": 0.42145419120788574,
-      "eval_runtime": 7.8841,
-      "eval_samples_per_second": 20.167,
-      "eval_steps_per_second": 2.537,
-      "step": 4500
     }
   ],
   "logging_steps": 50,
@@ -728,8 +104,8 @@
       "attributes": {}
     }
   },
-  "total_flos": 6.180482295807811e+16,
-  "train_batch_size": 16,
   "trial_name": null,
   "trial_params": null
 }

 {
+  "best_metric": 0.4381416440010071,
+  "best_model_checkpoint": "mikhail_panzo/fil_b64_le4_s8000/checkpoint-500",
+  "epoch": 22.22222222222222,
   "eval_steps": 500,
+  "global_step": 500,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 2.2222222222222223,
+      "grad_norm": 1.774868130683899,
       "learning_rate": 2.5e-06,
+      "loss": 0.7871,
       "step": 50
     },
     {
       "epoch": 4.444444444444445,
+      "grad_norm": 4.1770920753479,
       "learning_rate": 5e-06,
+      "loss": 0.7106,
       "step": 100
     },
     {
       "epoch": 6.666666666666667,
+      "grad_norm": 3.3354010581970215,
       "learning_rate": 7.5e-06,
+      "loss": 0.6568,
       "step": 150
     },
     {
       "epoch": 8.88888888888889,
+      "grad_norm": 2.2018914222717285,
+      "learning_rate": 9.950000000000001e-06,
+      "loss": 0.577,
       "step": 200
     },
     {
       "epoch": 11.11111111111111,
+      "grad_norm": 1.4593122005462646,
+      "learning_rate": 1.2450000000000001e-05,
+      "loss": 0.5321,
       "step": 250
     },
     {
       "epoch": 13.333333333333334,
+      "grad_norm": 1.9025092124938965,
+      "learning_rate": 1.4950000000000001e-05,
+      "loss": 0.5224,
       "step": 300
     },
     {
       "epoch": 15.555555555555555,
+      "grad_norm": 1.1274640560150146,
+      "learning_rate": 1.745e-05,
+      "loss": 0.5033,
       "step": 350
     },
     {
       "epoch": 17.77777777777778,
+      "grad_norm": 1.615440011024475,
+      "learning_rate": 1.995e-05,
+      "loss": 0.4884,
       "step": 400
     },
     {
       "epoch": 20.0,
+      "grad_norm": 1.224423885345459,
+      "learning_rate": 2.245e-05,
+      "loss": 0.4891,
       "step": 450
     },
     {
       "epoch": 22.22222222222222,
+      "grad_norm": 1.3765957355499268,
+      "learning_rate": 2.495e-05,
+      "loss": 0.4811,
       "step": 500
     },
     {
       "epoch": 22.22222222222222,
+      "eval_loss": 0.4381416440010071,
+      "eval_runtime": 7.3061,
+      "eval_samples_per_second": 21.763,
+      "eval_steps_per_second": 2.737,
       "step": 500
     }
   ],
   "logging_steps": 50,
       "attributes": {}
     }
   },
+  "total_flos": 7257884840759520.0,
+  "train_batch_size": 32,
   "trial_name": null,
   "trial_params": null
 }

last-checkpoint/training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a77b5730a7c220a064b5f4530a16997470ffd2ece6c16c7b1757d4b08f671d29
 size 5304

 version https://git-lfs.github.com/spec/v1
+oid sha256:b20fa88d77a29f6532fc31932997f0c8ed961dcd3657a7862d793575cbe78fcd
 size 5304