End of training

Browse files

Files changed (6) hide show

README.md +18 -4
all_results.json +13 -0
eval_results.json +8 -0
log_history.json +1928 -0
train_results.json +8 -0
trainer_state.json +1959 -0

README.md CHANGED Viewed

@@ -1,6 +1,7 @@
 ---
 library_name: transformers
 tags:
 - generated_from_trainer
 datasets:
 - voxceleb
@@ -8,7 +9,20 @@ metrics:
 - accuracy
 model-index:
 - name: aam-len3-bs256-lr1e-3
-  results: []
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -16,10 +30,10 @@ should probably proofread and complete it, then remove this comment. -->
 # aam-len3-bs256-lr1e-3
-This model is a fine-tuned version of [](https://huggingface.co/) on the voxceleb dataset.
 It achieves the following results on the evaluation set:
-- Loss: nan
-- Accuracy: 0.0005
 ## Model description

 ---
 library_name: transformers
 tags:
+- audio-classification
 - generated_from_trainer
 datasets:
 - voxceleb
 - accuracy
 model-index:
 - name: aam-len3-bs256-lr1e-3
+  results:
+  - task:
+      name: Audio Classification
+      type: audio-classification
+    dataset:
+      name: confit/voxceleb
+      type: voxceleb
+      config: verification
+      split: train
+      args: verification
+    metrics:
+    - name: Accuracy
+      type: accuracy
+      value: 0.9616548940464178
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You should probably proofread and complete it, then remove this comment. -->
 # aam-len3-bs256-lr1e-3
+This model is a fine-tuned version of an unspecified base model on the confit/voxceleb dataset.
 It achieves the following results on the evaluation set:
+- Loss: 0.6434
+- Accuracy: 0.9617
 ## Model description

all_results.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+    "epoch": 10.0,
+    "eval_accuracy": 0.9616548940464178,
+    "eval_loss": 0.6434142589569092,
+    "eval_runtime": 88.549,
+    "eval_samples_per_second": 167.873,
+    "eval_steps_per_second": 167.873,
+    "total_flos": 8.1484088684544e+18,
+    "train_loss": 1.9844060945237343,
+    "train_runtime": 19376.6921,
+    "train_samples_per_second": 69.04,
+    "train_steps_per_second": 0.27
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 10.0,
+    "eval_accuracy": 0.9616548940464178,
+    "eval_loss": 0.6434142589569092,
+    "eval_runtime": 88.549,
+    "eval_samples_per_second": 167.873,
+    "eval_steps_per_second": 167.873
+}

log_history.json ADDED Viewed

	@@ -0,0 +1,1928 @@

+[
+    {
+        "loss": 13.1781,
+        "grad_norm": 5.8640031814575195,
+        "learning_rate": 3.824091778202677e-05,
+        "epoch": 0.03824091778202677,
+        "step": 20
+    },
+    {
+        "loss": 12.9709,
+        "grad_norm": 4.678975582122803,
+        "learning_rate": 7.648183556405354e-05,
+        "epoch": 0.07648183556405354,
+        "step": 40
+    },
+    {
+        "loss": 12.7525,
+        "grad_norm": 3.863873243331909,
+        "learning_rate": 0.0001147227533460803,
+        "epoch": 0.1147227533460803,
+        "step": 60
+    },
+    {
+        "loss": 12.5664,
+        "grad_norm": 3.3697011470794678,
+        "learning_rate": 0.00015296367112810707,
+        "epoch": 0.15296367112810708,
+        "step": 80
+    },
+    {
+        "loss": 12.345,
+        "grad_norm": 3.032871961593628,
+        "learning_rate": 0.00019120458891013384,
+        "epoch": 0.19120458891013384,
+        "step": 100
+    },
+    {
+        "loss": 12.1041,
+        "grad_norm": 2.81247615814209,
+        "learning_rate": 0.0002294455066921606,
+        "epoch": 0.2294455066921606,
+        "step": 120
+    },
+    {
+        "loss": 11.8394,
+        "grad_norm": 2.684664011001587,
+        "learning_rate": 0.0002676864244741874,
+        "epoch": 0.2676864244741874,
+        "step": 140
+    },
+    {
+        "loss": 11.6226,
+        "grad_norm": 2.6299898624420166,
+        "learning_rate": 0.00030592734225621415,
+        "epoch": 0.30592734225621415,
+        "step": 160
+    },
+    {
+        "loss": 11.3617,
+        "grad_norm": 2.584235429763794,
+        "learning_rate": 0.00034416826003824094,
+        "epoch": 0.3441682600382409,
+        "step": 180
+    },
+    {
+        "loss": 11.0999,
+        "grad_norm": 2.6476144790649414,
+        "learning_rate": 0.0003824091778202677,
+        "epoch": 0.3824091778202677,
+        "step": 200
+    },
+    {
+        "loss": 10.9299,
+        "grad_norm": 2.607042074203491,
+        "learning_rate": 0.0004206500956022944,
+        "epoch": 0.42065009560229444,
+        "step": 220
+    },
+    {
+        "loss": 10.6686,
+        "grad_norm": 2.6292777061462402,
+        "learning_rate": 0.0004588910133843212,
+        "epoch": 0.4588910133843212,
+        "step": 240
+    },
+    {
+        "loss": 10.4216,
+        "grad_norm": 2.56091570854187,
+        "learning_rate": 0.0004971319311663481,
+        "epoch": 0.497131931166348,
+        "step": 260
+    },
+    {
+        "loss": 10.258,
+        "grad_norm": 2.5721495151519775,
+        "learning_rate": 0.0005353728489483748,
+        "epoch": 0.5353728489483748,
+        "step": 280
+    },
+    {
+        "loss": 10.0441,
+        "grad_norm": 2.6123592853546143,
+        "learning_rate": 0.0005736137667304016,
+        "epoch": 0.5736137667304015,
+        "step": 300
+    },
+    {
+        "loss": 9.7655,
+        "grad_norm": 2.5007669925689697,
+        "learning_rate": 0.0006118546845124283,
+        "epoch": 0.6118546845124283,
+        "step": 320
+    },
+    {
+        "loss": 9.6003,
+        "grad_norm": 2.6687047481536865,
+        "learning_rate": 0.000650095602294455,
+        "epoch": 0.6500956022944551,
+        "step": 340
+    },
+    {
+        "loss": 9.3888,
+        "grad_norm": 2.758013963699341,
+        "learning_rate": 0.0006883365200764819,
+        "epoch": 0.6883365200764818,
+        "step": 360
+    },
+    {
+        "loss": 9.0976,
+        "grad_norm": 2.6198856830596924,
+        "learning_rate": 0.0007265774378585086,
+        "epoch": 0.7265774378585086,
+        "step": 380
+    },
+    {
+        "loss": 8.9158,
+        "grad_norm": 2.6092536449432373,
+        "learning_rate": 0.0007648183556405354,
+        "epoch": 0.7648183556405354,
+        "step": 400
+    },
+    {
+        "loss": 8.7461,
+        "grad_norm": 2.609232187271118,
+        "learning_rate": 0.0008030592734225621,
+        "epoch": 0.8030592734225621,
+        "step": 420
+    },
+    {
+        "loss": 8.5383,
+        "grad_norm": 2.717844247817993,
+        "learning_rate": 0.0008413001912045888,
+        "epoch": 0.8413001912045889,
+        "step": 440
+    },
+    {
+        "loss": 8.243,
+        "grad_norm": 2.665982723236084,
+        "learning_rate": 0.0008795411089866157,
+        "epoch": 0.8795411089866156,
+        "step": 460
+    },
+    {
+        "loss": 8.143,
+        "grad_norm": 2.6607024669647217,
+        "learning_rate": 0.0009177820267686424,
+        "epoch": 0.9177820267686424,
+        "step": 480
+    },
+    {
+        "loss": 7.9178,
+        "grad_norm": 2.6848645210266113,
+        "learning_rate": 0.0009560229445506692,
+        "epoch": 0.9560229445506692,
+        "step": 500
+    },
+    {
+        "loss": 7.6771,
+        "grad_norm": 2.72930908203125,
+        "learning_rate": 0.0009942638623326961,
+        "epoch": 0.994263862332696,
+        "step": 520
+    },
+    {
+        "eval_loss": 6.701026916503906,
+        "eval_accuracy": 0.6003363605785402,
+        "eval_runtime": 210.9446,
+        "eval_samples_per_second": 70.469,
+        "eval_steps_per_second": 70.469,
+        "epoch": 1.0,
+        "step": 523
+    },
+    {
+        "loss": 7.2977,
+        "grad_norm": 2.6760189533233643,
+        "learning_rate": 0.0009963883577650309,
+        "epoch": 1.0325047801147227,
+        "step": 540
+    },
+    {
+        "loss": 7.1159,
+        "grad_norm": 2.761121988296509,
+        "learning_rate": 0.0009921393669003612,
+        "epoch": 1.0707456978967496,
+        "step": 560
+    },
+    {
+        "loss": 6.8399,
+        "grad_norm": 2.6698880195617676,
+        "learning_rate": 0.0009878903760356915,
+        "epoch": 1.1089866156787762,
+        "step": 580
+    },
+    {
+        "loss": 6.6834,
+        "grad_norm": 2.64094614982605,
+        "learning_rate": 0.0009836413851710218,
+        "epoch": 1.147227533460803,
+        "step": 600
+    },
+    {
+        "loss": 6.5621,
+        "grad_norm": 2.7290735244750977,
+        "learning_rate": 0.0009793923943063523,
+        "epoch": 1.1854684512428297,
+        "step": 620
+    },
+    {
+        "loss": 6.217,
+        "grad_norm": 2.7097644805908203,
+        "learning_rate": 0.0009751434034416827,
+        "epoch": 1.2237093690248566,
+        "step": 640
+    },
+    {
+        "loss": 6.0556,
+        "grad_norm": 2.703202962875366,
+        "learning_rate": 0.000970894412577013,
+        "epoch": 1.2619502868068833,
+        "step": 660
+    },
+    {
+        "loss": 5.9178,
+        "grad_norm": 2.7082841396331787,
+        "learning_rate": 0.0009666454217123433,
+        "epoch": 1.3001912045889101,
+        "step": 680
+    },
+    {
+        "loss": 5.6926,
+        "grad_norm": 2.6659677028656006,
+        "learning_rate": 0.0009623964308476737,
+        "epoch": 1.338432122370937,
+        "step": 700
+    },
+    {
+        "loss": 5.4895,
+        "grad_norm": 2.656085252761841,
+        "learning_rate": 0.000958147439983004,
+        "epoch": 1.3766730401529637,
+        "step": 720
+    },
+    {
+        "loss": 5.3583,
+        "grad_norm": 2.692253351211548,
+        "learning_rate": 0.0009538984491183344,
+        "epoch": 1.4149139579349903,
+        "step": 740
+    },
+    {
+        "loss": 5.207,
+        "grad_norm": 2.6969852447509766,
+        "learning_rate": 0.0009496494582536647,
+        "epoch": 1.4531548757170172,
+        "step": 760
+    },
+    {
+        "loss": 5.0834,
+        "grad_norm": 2.675426483154297,
+        "learning_rate": 0.0009454004673889951,
+        "epoch": 1.491395793499044,
+        "step": 780
+    },
+    {
+        "loss": 4.9146,
+        "grad_norm": 2.750749111175537,
+        "learning_rate": 0.0009411514765243255,
+        "epoch": 1.5296367112810707,
+        "step": 800
+    },
+    {
+        "loss": 4.7151,
+        "grad_norm": 2.668574333190918,
+        "learning_rate": 0.0009369024856596558,
+        "epoch": 1.5678776290630974,
+        "step": 820
+    },
+    {
+        "loss": 4.6307,
+        "grad_norm": 2.688476324081421,
+        "learning_rate": 0.0009326534947949862,
+        "epoch": 1.6061185468451242,
+        "step": 840
+    },
+    {
+        "loss": 4.5571,
+        "grad_norm": 2.8131325244903564,
+        "learning_rate": 0.0009284045039303166,
+        "epoch": 1.644359464627151,
+        "step": 860
+    },
+    {
+        "loss": 4.3895,
+        "grad_norm": 2.68613338470459,
+        "learning_rate": 0.0009241555130656469,
+        "epoch": 1.682600382409178,
+        "step": 880
+    },
+    {
+        "loss": 4.2343,
+        "grad_norm": 2.662827730178833,
+        "learning_rate": 0.0009199065222009773,
+        "epoch": 1.7208413001912046,
+        "step": 900
+    },
+    {
+        "loss": 4.1223,
+        "grad_norm": 2.5778844356536865,
+        "learning_rate": 0.0009156575313363077,
+        "epoch": 1.7590822179732313,
+        "step": 920
+    },
+    {
+        "loss": 4.0716,
+        "grad_norm": 2.7444920539855957,
+        "learning_rate": 0.000911408540471638,
+        "epoch": 1.7973231357552581,
+        "step": 940
+    },
+    {
+        "loss": 3.9948,
+        "grad_norm": 2.7660489082336426,
+        "learning_rate": 0.0009071595496069684,
+        "epoch": 1.835564053537285,
+        "step": 960
+    },
+    {
+        "loss": 3.9169,
+        "grad_norm": 2.6412644386291504,
+        "learning_rate": 0.0009029105587422988,
+        "epoch": 1.8738049713193117,
+        "step": 980
+    },
+    {
+        "loss": 3.7099,
+        "grad_norm": 2.571195602416992,
+        "learning_rate": 0.0008986615678776291,
+        "epoch": 1.9120458891013383,
+        "step": 1000
+    },
+    {
+        "loss": 3.5425,
+        "grad_norm": 2.6130528450012207,
+        "learning_rate": 0.0008944125770129595,
+        "epoch": 1.9502868068833652,
+        "step": 1020
+    },
+    {
+        "loss": 3.5879,
+        "grad_norm": 2.7550861835479736,
+        "learning_rate": 0.0008901635861482899,
+        "epoch": 1.988527724665392,
+        "step": 1040
+    },
+    {
+        "eval_loss": 1.9992759227752686,
+        "eval_accuracy": 0.9140935082408341,
+        "eval_runtime": 203.0806,
+        "eval_samples_per_second": 73.198,
+        "eval_steps_per_second": 73.198,
+        "epoch": 2.0,
+        "step": 1046
+    },
+    {
+        "loss": 3.2225,
+        "grad_norm": 2.4564621448516846,
+        "learning_rate": 0.0008859145952836202,
+        "epoch": 2.026768642447419,
+        "step": 1060
+    },
+    {
+        "loss": 2.9864,
+        "grad_norm": 2.484492063522339,
+        "learning_rate": 0.0008816656044189504,
+        "epoch": 2.0650095602294454,
+        "step": 1080
+    },
+    {
+        "loss": 3.0552,
+        "grad_norm": 2.467353582382202,
+        "learning_rate": 0.000877416613554281,
+        "epoch": 2.1032504780114722,
+        "step": 1100
+    },
+    {
+        "loss": 2.9168,
+        "grad_norm": 2.594728469848633,
+        "learning_rate": 0.0008731676226896112,
+        "epoch": 2.141491395793499,
+        "step": 1120
+    },
+    {
+        "loss": 2.8856,
+        "grad_norm": 2.4294815063476562,
+        "learning_rate": 0.0008689186318249415,
+        "epoch": 2.179732313575526,
+        "step": 1140
+    },
+    {
+        "loss": 2.8125,
+        "grad_norm": 2.521456480026245,
+        "learning_rate": 0.000864669640960272,
+        "epoch": 2.2179732313575524,
+        "step": 1160
+    },
+    {
+        "loss": 2.8216,
+        "grad_norm": 2.5333268642425537,
+        "learning_rate": 0.0008604206500956023,
+        "epoch": 2.2562141491395793,
+        "step": 1180
+    },
+    {
+        "loss": 2.7326,
+        "grad_norm": 2.449925422668457,
+        "learning_rate": 0.0008561716592309326,
+        "epoch": 2.294455066921606,
+        "step": 1200
+    },
+    {
+        "loss": 2.5496,
+        "grad_norm": 2.5326759815216064,
+        "learning_rate": 0.000851922668366263,
+        "epoch": 2.332695984703633,
+        "step": 1220
+    },
+    {
+        "loss": 2.6125,
+        "grad_norm": 2.4263978004455566,
+        "learning_rate": 0.0008478861270448269,
+        "epoch": 2.3709369024856595,
+        "step": 1240
+    },
+    {
+        "loss": 2.6434,
+        "grad_norm": 2.4790937900543213,
+        "learning_rate": 0.0008436371361801573,
+        "epoch": 2.4091778202676863,
+        "step": 1260
+    },
+    {
+        "loss": 2.5629,
+        "grad_norm": 2.52268385887146,
+        "learning_rate": 0.0008393881453154876,
+        "epoch": 2.447418738049713,
+        "step": 1280
+    },
+    {
+        "loss": 2.4858,
+        "grad_norm": 2.4463202953338623,
+        "learning_rate": 0.0008353516039940514,
+        "epoch": 2.48565965583174,
+        "step": 1300
+    },
+    {
+        "loss": 2.431,
+        "grad_norm": 2.455935001373291,
+        "learning_rate": 0.0008311026131293817,
+        "epoch": 2.5239005736137665,
+        "step": 1320
+    },
+    {
+        "loss": 2.327,
+        "grad_norm": 2.3563170433044434,
+        "learning_rate": 0.0008268536222647122,
+        "epoch": 2.5621414913957934,
+        "step": 1340
+    },
+    {
+        "loss": 2.4019,
+        "grad_norm": 2.4477434158325195,
+        "learning_rate": 0.0008226046314000425,
+        "epoch": 2.6003824091778203,
+        "step": 1360
+    },
+    {
+        "loss": 2.2696,
+        "grad_norm": 2.2741150856018066,
+        "learning_rate": 0.0008183556405353728,
+        "epoch": 2.638623326959847,
+        "step": 1380
+    },
+    {
+        "loss": 2.2923,
+        "grad_norm": 2.2840123176574707,
+        "learning_rate": 0.0008141066496707033,
+        "epoch": 2.676864244741874,
+        "step": 1400
+    },
+    {
+        "loss": 2.2051,
+        "grad_norm": 2.3673081398010254,
+        "learning_rate": 0.0008098576588060336,
+        "epoch": 2.7151051625239004,
+        "step": 1420
+    },
+    {
+        "loss": 2.1845,
+        "grad_norm": 2.2644526958465576,
+        "learning_rate": 0.0008056086679413639,
+        "epoch": 2.7533460803059273,
+        "step": 1440
+    },
+    {
+        "loss": 2.1307,
+        "grad_norm": 2.392956256866455,
+        "learning_rate": 0.0008013596770766944,
+        "epoch": 2.791586998087954,
+        "step": 1460
+    },
+    {
+        "loss": 2.0802,
+        "grad_norm": 2.361642837524414,
+        "learning_rate": 0.0007973231357552581,
+        "epoch": 2.8298279158699806,
+        "step": 1480
+    },
+    {
+        "loss": 1.9928,
+        "grad_norm": 2.3466880321502686,
+        "learning_rate": 0.000793286594433822,
+        "epoch": 2.8680688336520075,
+        "step": 1500
+    },
+    {
+        "loss": 1.9699,
+        "grad_norm": 2.3408572673797607,
+        "learning_rate": 0.0007894625026556193,
+        "epoch": 2.9063097514340344,
+        "step": 1520
+    },
+    {
+        "loss": 1.9908,
+        "grad_norm": 2.3541271686553955,
+        "learning_rate": 0.0007854259613341832,
+        "epoch": 2.9445506692160612,
+        "step": 1540
+    },
+    {
+        "loss": 1.9536,
+        "grad_norm": 2.4044065475463867,
+        "learning_rate": 0.0007813894200127471,
+        "epoch": 2.982791586998088,
+        "step": 1560
+    },
+    {
+        "eval_loss": 0.8233553767204285,
+        "eval_accuracy": 0.9607130844265052,
+        "eval_runtime": 419.4359,
+        "eval_samples_per_second": 35.44,
+        "eval_steps_per_second": 35.44,
+        "epoch": 3.0,
+        "step": 1569
+    },
+    {
+        "loss": 1.8218,
+        "grad_norm": 2.1922998428344727,
+        "learning_rate": 0.0007771404291480774,
+        "epoch": 3.0210325047801145,
+        "step": 1580
+    },
+    {
+        "loss": 1.5913,
+        "grad_norm": NaN,
+        "learning_rate": 0.0007731038878266412,
+        "epoch": 3.0592734225621414,
+        "step": 1600
+    },
+    {
+        "loss": 1.6189,
+        "grad_norm": NaN,
+        "learning_rate": 0.000769492245591672,
+        "epoch": 3.0975143403441683,
+        "step": 1620
+    },
+    {
+        "loss": 1.5457,
+        "grad_norm": 2.179269313812256,
+        "learning_rate": 0.0007658806033567028,
+        "epoch": 3.135755258126195,
+        "step": 1640
+    },
+    {
+        "loss": 1.5588,
+        "grad_norm": 2.30602765083313,
+        "learning_rate": 0.0007618440620352667,
+        "epoch": 3.173996175908222,
+        "step": 1660
+    },
+    {
+        "loss": 1.5639,
+        "grad_norm": 2.100261688232422,
+        "learning_rate": 0.0007578075207138306,
+        "epoch": 3.2122370936902485,
+        "step": 1680
+    },
+    {
+        "loss": 1.5383,
+        "grad_norm": 2.185692548751831,
+        "learning_rate": 0.0007544083280220948,
+        "epoch": 3.2504780114722753,
+        "step": 1700
+    },
+    {
+        "loss": 1.5617,
+        "grad_norm": 2.014509439468384,
+        "learning_rate": 0.0007503717867006586,
+        "epoch": 3.288718929254302,
+        "step": 1720
+    },
+    {
+        "loss": 1.6206,
+        "grad_norm": 1.757117509841919,
+        "learning_rate": 0.0007467601444656894,
+        "epoch": 3.3269598470363286,
+        "step": 1740
+    },
+    {
+        "loss": 1.6565,
+        "grad_norm": 1.4908952713012695,
+        "learning_rate": 0.0007435734013171872,
+        "epoch": 3.3652007648183555,
+        "step": 1760
+    },
+    {
+        "loss": 1.6018,
+        "grad_norm": 1.310102939605713,
+        "learning_rate": 0.000739961759082218,
+        "epoch": 3.4034416826003824,
+        "step": 1780
+    },
+    {
+        "loss": 1.5565,
+        "grad_norm": 1.3842021226882935,
+        "learning_rate": 0.0007359252177607819,
+        "epoch": 3.4416826003824093,
+        "step": 1800
+    },
+    {
+        "loss": 1.6196,
+        "grad_norm": 0.23385359346866608,
+        "learning_rate": 0.0007321011259825792,
+        "epoch": 3.479923518164436,
+        "step": 1820
+    },
+    {
+        "loss": 1.5137,
+        "grad_norm": 0.0,
+        "learning_rate": 0.0007291268323773104,
+        "epoch": 3.5181644359464626,
+        "step": 1840
+    },
+    {
+        "loss": 1.5645,
+        "grad_norm": NaN,
+        "learning_rate": 0.0007253027405991077,
+        "epoch": 3.5564053537284894,
+        "step": 1860
+    },
+    {
+        "loss": 1.5224,
+        "grad_norm": NaN,
+        "learning_rate": 0.0007216910983641384,
+        "epoch": 3.5946462715105163,
+        "step": 1880
+    },
+    {
+        "loss": 1.4859,
+        "grad_norm": 0.0,
+        "learning_rate": 0.0007185043552156363,
+        "epoch": 3.632887189292543,
+        "step": 1900
+    },
+    {
+        "loss": 1.5356,
+        "grad_norm": 0.0,
+        "learning_rate": 0.0007155300616103676,
+        "epoch": 3.67112810707457,
+        "step": 1920
+    },
+    {
+        "loss": 1.5673,
+        "grad_norm": 0.0,
+        "learning_rate": 0.0007125557680050988,
+        "epoch": 3.7093690248565965,
+        "step": 1940
+    },
+    {
+        "loss": 1.4688,
+        "grad_norm": 0.0,
+        "learning_rate": 0.00070958147439983,
+        "epoch": 3.7476099426386233,
+        "step": 1960
+    },
+    {
+        "loss": 1.5848,
+        "grad_norm": 0.0,
+        "learning_rate": 0.0007061822817080943,
+        "epoch": 3.78585086042065,
+        "step": 1980
+    },
+    {
+        "loss": 1.5157,
+        "grad_norm": 0.0,
+        "learning_rate": 0.0007036328871892926,
+        "epoch": 3.8240917782026767,
+        "step": 2000
+    },
+    {
+        "loss": 1.5408,
+        "grad_norm": NaN,
+        "learning_rate": 0.0007006585935840239,
+        "epoch": 3.8623326959847035,
+        "step": 2020
+    },
+    {
+        "loss": 1.4978,
+        "grad_norm": 0.0,
+        "learning_rate": 0.0006978967495219886,
+        "epoch": 3.9005736137667304,
+        "step": 2040
+    },
+    {
+        "loss": 1.557,
+        "grad_norm": 0.0,
+        "learning_rate": 0.0006949224559167198,
+        "epoch": 3.9388145315487573,
+        "step": 2060
+    },
+    {
+        "loss": 1.5008,
+        "grad_norm": 0.0,
+        "learning_rate": 0.0006921606118546845,
+        "epoch": 3.977055449330784,
+        "step": 2080
+    },
+    {
+        "eval_loss": 0.6434142589569092,
+        "eval_accuracy": 0.9616548940464178,
+        "eval_runtime": 182.6336,
+        "eval_samples_per_second": 81.392,
+        "eval_steps_per_second": 81.392,
+        "epoch": 4.0,
+        "step": 2092
+    },
+    {
+        "loss": 1.4216,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006887614191629488,
+        "epoch": 4.015296367112811,
+        "step": 2100
+    },
+    {
+        "loss": 1.3752,
+        "grad_norm": 0.0,
+        "learning_rate": 0.0006859995751009136,
+        "epoch": 4.053537284894838,
+        "step": 2120
+    },
+    {
+        "loss": 1.3751,
+        "grad_norm": 0.0,
+        "learning_rate": 0.0006828128319524112,
+        "epoch": 4.091778202676864,
+        "step": 2140
+    },
+    {
+        "loss": 1.3608,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006794136392606756,
+        "epoch": 4.130019120458891,
+        "step": 2160
+    },
+    {
+        "loss": 1.3152,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006766517951986402,
+        "epoch": 4.168260038240918,
+        "step": 2180
+    },
+    {
+        "loss": 1.3721,
+        "grad_norm": 0.0,
+        "learning_rate": 0.0006736775015933716,
+        "epoch": 4.2065009560229445,
+        "step": 2200
+    },
+    {
+        "loss": 1.3709,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 4.244741873804971,
+        "step": 2220
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 4.282982791586998,
+        "step": 2240
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 4.321223709369025,
+        "step": 2260
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 4.359464627151052,
+        "step": 2280
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 4.397705544933078,
+        "step": 2300
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 4.435946462715105,
+        "step": 2320
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 4.474187380497132,
+        "step": 2340
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 4.512428298279159,
+        "step": 2360
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 4.550669216061186,
+        "step": 2380
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 4.588910133843212,
+        "step": 2400
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 4.627151051625239,
+        "step": 2420
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 4.665391969407266,
+        "step": 2440
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 4.7036328871892925,
+        "step": 2460
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 4.741873804971319,
+        "step": 2480
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 4.780114722753346,
+        "step": 2500
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 4.818355640535373,
+        "step": 2520
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 4.8565965583174,
+        "step": 2540
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 4.894837476099426,
+        "step": 2560
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 4.933078393881453,
+        "step": 2580
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 4.97131931166348,
+        "step": 2600
+    },
+    {
+        "eval_loss": NaN,
+        "eval_accuracy": 0.0005381769256643121,
+        "eval_runtime": 180.7668,
+        "eval_samples_per_second": 82.233,
+        "eval_steps_per_second": 82.233,
+        "epoch": 5.0,
+        "step": 2615
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 5.009560229445507,
+        "step": 2620
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 5.047801147227533,
+        "step": 2640
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 5.08604206500956,
+        "step": 2660
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 5.124282982791587,
+        "step": 2680
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 5.162523900573614,
+        "step": 2700
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 5.2007648183556405,
+        "step": 2720
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 5.239005736137667,
+        "step": 2740
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 5.277246653919694,
+        "step": 2760
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 5.315487571701721,
+        "step": 2780
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 5.353728489483748,
+        "step": 2800
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 5.3919694072657744,
+        "step": 2820
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 5.430210325047801,
+        "step": 2840
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 5.468451242829828,
+        "step": 2860
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 5.506692160611855,
+        "step": 2880
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 5.544933078393882,
+        "step": 2900
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 5.583173996175908,
+        "step": 2920
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 5.621414913957935,
+        "step": 2940
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 5.659655831739962,
+        "step": 2960
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 5.6978967495219885,
+        "step": 2980
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 5.736137667304015,
+        "step": 3000
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 5.774378585086042,
+        "step": 3020
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 5.812619502868069,
+        "step": 3040
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 5.850860420650095,
+        "step": 3060
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 5.8891013384321225,
+        "step": 3080
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 5.927342256214149,
+        "step": 3100
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 5.965583173996176,
+        "step": 3120
+    },
+    {
+        "eval_loss": NaN,
+        "eval_accuracy": 0.0005381769256643121,
+        "eval_runtime": 182.9382,
+        "eval_samples_per_second": 81.257,
+        "eval_steps_per_second": 81.257,
+        "epoch": 6.0,
+        "step": 3138
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 6.003824091778203,
+        "step": 3140
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 6.042065009560229,
+        "step": 3160
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 6.080305927342256,
+        "step": 3180
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 6.118546845124283,
+        "step": 3200
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 6.15678776290631,
+        "step": 3220
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 6.195028680688337,
+        "step": 3240
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 6.233269598470363,
+        "step": 3260
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 6.27151051625239,
+        "step": 3280
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 6.309751434034417,
+        "step": 3300
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 6.347992351816444,
+        "step": 3320
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 6.3862332695984705,
+        "step": 3340
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 6.424474187380497,
+        "step": 3360
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 6.462715105162524,
+        "step": 3380
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 6.500956022944551,
+        "step": 3400
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 6.539196940726577,
+        "step": 3420
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 6.577437858508604,
+        "step": 3440
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 6.615678776290631,
+        "step": 3460
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 6.653919694072657,
+        "step": 3480
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 6.692160611854685,
+        "step": 3500
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 6.730401529636711,
+        "step": 3520
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 6.768642447418738,
+        "step": 3540
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 6.806883365200765,
+        "step": 3560
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 6.845124282982791,
+        "step": 3580
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 6.8833652007648185,
+        "step": 3600
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 6.921606118546845,
+        "step": 3620
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 6.959847036328872,
+        "step": 3640
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 6.998087954110899,
+        "step": 3660
+    },
+    {
+        "eval_loss": NaN,
+        "eval_accuracy": 0.0005381769256643121,
+        "eval_runtime": 188.9144,
+        "eval_samples_per_second": 78.686,
+        "eval_steps_per_second": 78.686,
+        "epoch": 7.0,
+        "step": 3661
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 7.036328871892925,
+        "step": 3680
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 7.074569789674952,
+        "step": 3700
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 7.112810707456979,
+        "step": 3720
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 7.151051625239006,
+        "step": 3740
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 7.189292543021033,
+        "step": 3760
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 7.227533460803059,
+        "step": 3780
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 7.265774378585086,
+        "step": 3800
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 7.304015296367113,
+        "step": 3820
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 7.342256214149139,
+        "step": 3840
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 7.3804971319311665,
+        "step": 3860
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 7.418738049713193,
+        "step": 3880
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 7.45697896749522,
+        "step": 3900
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 7.495219885277247,
+        "step": 3920
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 7.533460803059273,
+        "step": 3940
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 7.5717017208413,
+        "step": 3960
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 7.609942638623327,
+        "step": 3980
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 7.648183556405353,
+        "step": 4000
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 7.686424474187381,
+        "step": 4020
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 7.724665391969407,
+        "step": 4040
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 7.762906309751434,
+        "step": 4060
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 7.801147227533461,
+        "step": 4080
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 7.839388145315487,
+        "step": 4100
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 7.8776290630975145,
+        "step": 4120
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 7.915869980879541,
+        "step": 4140
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 7.954110898661568,
+        "step": 4160
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 7.992351816443595,
+        "step": 4180
+    },
+    {
+        "eval_loss": NaN,
+        "eval_accuracy": 0.0005381769256643121,
+        "eval_runtime": 167.1036,
+        "eval_samples_per_second": 88.957,
+        "eval_steps_per_second": 88.957,
+        "epoch": 8.0,
+        "step": 4184
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 8.030592734225621,
+        "step": 4200
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 8.068833652007648,
+        "step": 4220
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 8.107074569789676,
+        "step": 4240
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 8.145315487571702,
+        "step": 4260
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 8.183556405353729,
+        "step": 4280
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 8.221797323135755,
+        "step": 4300
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 8.260038240917781,
+        "step": 4320
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 8.29827915869981,
+        "step": 4340
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 8.336520076481836,
+        "step": 4360
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 8.374760994263863,
+        "step": 4380
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 8.413001912045889,
+        "step": 4400
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 8.451242829827915,
+        "step": 4420
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 8.489483747609942,
+        "step": 4440
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 8.52772466539197,
+        "step": 4460
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 8.565965583173996,
+        "step": 4480
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 8.604206500956023,
+        "step": 4500
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 8.64244741873805,
+        "step": 4520
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 8.680688336520076,
+        "step": 4540
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 8.718929254302104,
+        "step": 4560
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 8.75717017208413,
+        "step": 4580
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 8.795411089866157,
+        "step": 4600
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 8.833652007648183,
+        "step": 4620
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 8.87189292543021,
+        "step": 4640
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 8.910133843212238,
+        "step": 4660
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 8.948374760994264,
+        "step": 4680
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 8.98661567877629,
+        "step": 4700
+    },
+    {
+        "eval_loss": NaN,
+        "eval_accuracy": 0.0005381769256643121,
+        "eval_runtime": 176.3855,
+        "eval_samples_per_second": 84.276,
+        "eval_steps_per_second": 84.276,
+        "epoch": 9.0,
+        "step": 4707
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 9.024856596558317,
+        "step": 4720
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 9.063097514340344,
+        "step": 4740
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 9.101338432122372,
+        "step": 4760
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 9.139579349904398,
+        "step": 4780
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 9.177820267686425,
+        "step": 4800
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 9.216061185468451,
+        "step": 4820
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 9.254302103250478,
+        "step": 4840
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 9.292543021032504,
+        "step": 4860
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 9.330783938814532,
+        "step": 4880
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 9.369024856596559,
+        "step": 4900
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 9.407265774378585,
+        "step": 4920
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 9.445506692160611,
+        "step": 4940
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 9.483747609942638,
+        "step": 4960
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 9.521988527724666,
+        "step": 4980
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 9.560229445506693,
+        "step": 5000
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 9.598470363288719,
+        "step": 5020
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 9.636711281070745,
+        "step": 5040
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 9.674952198852772,
+        "step": 5060
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 9.7131931166348,
+        "step": 5080
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 9.751434034416826,
+        "step": 5100
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 9.789674952198853,
+        "step": 5120
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 9.82791586998088,
+        "step": 5140
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 9.866156787762906,
+        "step": 5160
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 9.904397705544934,
+        "step": 5180
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 9.94263862332696,
+        "step": 5200
+    },
+    {
+        "loss": 0.0,
+        "grad_norm": NaN,
+        "learning_rate": 0.0006717654557042703,
+        "epoch": 9.980879541108987,
+        "step": 5220
+    },
+    {
+        "eval_loss": NaN,
+        "eval_accuracy": 0.0005381769256643121,
+        "eval_runtime": 183.845,
+        "eval_samples_per_second": 80.856,
+        "eval_steps_per_second": 80.856,
+        "epoch": 10.0,
+        "step": 5230
+    },
+    {
+        "train_runtime": 19376.6921,
+        "train_samples_per_second": 69.04,
+        "train_steps_per_second": 0.27,
+        "total_flos": 8.1484088684544e+18,
+        "train_loss": 1.9844060945237343,
+        "epoch": 10.0,
+        "step": 5230
+    }
+]

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 10.0,
+    "total_flos": 8.1484088684544e+18,
+    "train_loss": 1.9844060945237343,
+    "train_runtime": 19376.6921,
+    "train_samples_per_second": 69.04,
+    "train_steps_per_second": 0.27
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1959 @@

+{
+  "best_metric": 0.9616548940464178,
+  "best_model_checkpoint": "/mnt/data4_HDD_14TB/yang/voxceleb-checkpoints/ecapa-tdnn/voxceleb1/finetune/aam-len3-bs256-lr1e-3/checkpoint-2092",
+  "epoch": 10.0,
+  "eval_steps": 500,
+  "global_step": 5230,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.03824091778202677,
+      "grad_norm": 5.8640031814575195,
+      "learning_rate": 3.824091778202677e-05,
+      "loss": 13.1781,
+      "step": 20
+    },
+    {
+      "epoch": 0.07648183556405354,
+      "grad_norm": 4.678975582122803,
+      "learning_rate": 7.648183556405354e-05,
+      "loss": 12.9709,
+      "step": 40
+    },
+    {
+      "epoch": 0.1147227533460803,
+      "grad_norm": 3.863873243331909,
+      "learning_rate": 0.0001147227533460803,
+      "loss": 12.7525,
+      "step": 60
+    },
+    {
+      "epoch": 0.15296367112810708,
+      "grad_norm": 3.3697011470794678,
+      "learning_rate": 0.00015296367112810707,
+      "loss": 12.5664,
+      "step": 80
+    },
+    {
+      "epoch": 0.19120458891013384,
+      "grad_norm": 3.032871961593628,
+      "learning_rate": 0.00019120458891013384,
+      "loss": 12.345,
+      "step": 100
+    },
+    {
+      "epoch": 0.2294455066921606,
+      "grad_norm": 2.81247615814209,
+      "learning_rate": 0.0002294455066921606,
+      "loss": 12.1041,
+      "step": 120
+    },
+    {
+      "epoch": 0.2676864244741874,
+      "grad_norm": 2.684664011001587,
+      "learning_rate": 0.0002676864244741874,
+      "loss": 11.8394,
+      "step": 140
+    },
+    {
+      "epoch": 0.30592734225621415,
+      "grad_norm": 2.6299898624420166,
+      "learning_rate": 0.00030592734225621415,
+      "loss": 11.6226,
+      "step": 160
+    },
+    {
+      "epoch": 0.3441682600382409,
+      "grad_norm": 2.584235429763794,
+      "learning_rate": 0.00034416826003824094,
+      "loss": 11.3617,
+      "step": 180
+    },
+    {
+      "epoch": 0.3824091778202677,
+      "grad_norm": 2.6476144790649414,
+      "learning_rate": 0.0003824091778202677,
+      "loss": 11.0999,
+      "step": 200
+    },
+    {
+      "epoch": 0.42065009560229444,
+      "grad_norm": 2.607042074203491,
+      "learning_rate": 0.0004206500956022944,
+      "loss": 10.9299,
+      "step": 220
+    },
+    {
+      "epoch": 0.4588910133843212,
+      "grad_norm": 2.6292777061462402,
+      "learning_rate": 0.0004588910133843212,
+      "loss": 10.6686,
+      "step": 240
+    },
+    {
+      "epoch": 0.497131931166348,
+      "grad_norm": 2.56091570854187,
+      "learning_rate": 0.0004971319311663481,
+      "loss": 10.4216,
+      "step": 260
+    },
+    {
+      "epoch": 0.5353728489483748,
+      "grad_norm": 2.5721495151519775,
+      "learning_rate": 0.0005353728489483748,
+      "loss": 10.258,
+      "step": 280
+    },
+    {
+      "epoch": 0.5736137667304015,
+      "grad_norm": 2.6123592853546143,
+      "learning_rate": 0.0005736137667304016,
+      "loss": 10.0441,
+      "step": 300
+    },
+    {
+      "epoch": 0.6118546845124283,
+      "grad_norm": 2.5007669925689697,
+      "learning_rate": 0.0006118546845124283,
+      "loss": 9.7655,
+      "step": 320
+    },
+    {
+      "epoch": 0.6500956022944551,
+      "grad_norm": 2.6687047481536865,
+      "learning_rate": 0.000650095602294455,
+      "loss": 9.6003,
+      "step": 340
+    },
+    {
+      "epoch": 0.6883365200764818,
+      "grad_norm": 2.758013963699341,
+      "learning_rate": 0.0006883365200764819,
+      "loss": 9.3888,
+      "step": 360
+    },
+    {
+      "epoch": 0.7265774378585086,
+      "grad_norm": 2.6198856830596924,
+      "learning_rate": 0.0007265774378585086,
+      "loss": 9.0976,
+      "step": 380
+    },
+    {
+      "epoch": 0.7648183556405354,
+      "grad_norm": 2.6092536449432373,
+      "learning_rate": 0.0007648183556405354,
+      "loss": 8.9158,
+      "step": 400
+    },
+    {
+      "epoch": 0.8030592734225621,
+      "grad_norm": 2.609232187271118,
+      "learning_rate": 0.0008030592734225621,
+      "loss": 8.7461,
+      "step": 420
+    },
+    {
+      "epoch": 0.8413001912045889,
+      "grad_norm": 2.717844247817993,
+      "learning_rate": 0.0008413001912045888,
+      "loss": 8.5383,
+      "step": 440
+    },
+    {
+      "epoch": 0.8795411089866156,
+      "grad_norm": 2.665982723236084,
+      "learning_rate": 0.0008795411089866157,
+      "loss": 8.243,
+      "step": 460
+    },
+    {
+      "epoch": 0.9177820267686424,
+      "grad_norm": 2.6607024669647217,
+      "learning_rate": 0.0009177820267686424,
+      "loss": 8.143,
+      "step": 480
+    },
+    {
+      "epoch": 0.9560229445506692,
+      "grad_norm": 2.6848645210266113,
+      "learning_rate": 0.0009560229445506692,
+      "loss": 7.9178,
+      "step": 500
+    },
+    {
+      "epoch": 0.994263862332696,
+      "grad_norm": 2.72930908203125,
+      "learning_rate": 0.0009942638623326961,
+      "loss": 7.6771,
+      "step": 520
+    },
+    {
+      "epoch": 1.0,
+      "eval_accuracy": 0.6003363605785402,
+      "eval_loss": 6.701026916503906,
+      "eval_runtime": 210.9446,
+      "eval_samples_per_second": 70.469,
+      "eval_steps_per_second": 70.469,
+      "step": 523
+    },
+    {
+      "epoch": 1.0325047801147227,
+      "grad_norm": 2.6760189533233643,
+      "learning_rate": 0.0009963883577650309,
+      "loss": 7.2977,
+      "step": 540
+    },
+    {
+      "epoch": 1.0707456978967496,
+      "grad_norm": 2.761121988296509,
+      "learning_rate": 0.0009921393669003612,
+      "loss": 7.1159,
+      "step": 560
+    },
+    {
+      "epoch": 1.1089866156787762,
+      "grad_norm": 2.6698880195617676,
+      "learning_rate": 0.0009878903760356915,
+      "loss": 6.8399,
+      "step": 580
+    },
+    {
+      "epoch": 1.147227533460803,
+      "grad_norm": 2.64094614982605,
+      "learning_rate": 0.0009836413851710218,
+      "loss": 6.6834,
+      "step": 600
+    },
+    {
+      "epoch": 1.1854684512428297,
+      "grad_norm": 2.7290735244750977,
+      "learning_rate": 0.0009793923943063523,
+      "loss": 6.5621,
+      "step": 620
+    },
+    {
+      "epoch": 1.2237093690248566,
+      "grad_norm": 2.7097644805908203,
+      "learning_rate": 0.0009751434034416827,
+      "loss": 6.217,
+      "step": 640
+    },
+    {
+      "epoch": 1.2619502868068833,
+      "grad_norm": 2.703202962875366,
+      "learning_rate": 0.000970894412577013,
+      "loss": 6.0556,
+      "step": 660
+    },
+    {
+      "epoch": 1.3001912045889101,
+      "grad_norm": 2.7082841396331787,
+      "learning_rate": 0.0009666454217123433,
+      "loss": 5.9178,
+      "step": 680
+    },
+    {
+      "epoch": 1.338432122370937,
+      "grad_norm": 2.6659677028656006,
+      "learning_rate": 0.0009623964308476737,
+      "loss": 5.6926,
+      "step": 700
+    },
+    {
+      "epoch": 1.3766730401529637,
+      "grad_norm": 2.656085252761841,
+      "learning_rate": 0.000958147439983004,
+      "loss": 5.4895,
+      "step": 720
+    },
+    {
+      "epoch": 1.4149139579349903,
+      "grad_norm": 2.692253351211548,
+      "learning_rate": 0.0009538984491183344,
+      "loss": 5.3583,
+      "step": 740
+    },
+    {
+      "epoch": 1.4531548757170172,
+      "grad_norm": 2.6969852447509766,
+      "learning_rate": 0.0009496494582536647,
+      "loss": 5.207,
+      "step": 760
+    },
+    {
+      "epoch": 1.491395793499044,
+      "grad_norm": 2.675426483154297,
+      "learning_rate": 0.0009454004673889951,
+      "loss": 5.0834,
+      "step": 780
+    },
+    {
+      "epoch": 1.5296367112810707,
+      "grad_norm": 2.750749111175537,
+      "learning_rate": 0.0009411514765243255,
+      "loss": 4.9146,
+      "step": 800
+    },
+    {
+      "epoch": 1.5678776290630974,
+      "grad_norm": 2.668574333190918,
+      "learning_rate": 0.0009369024856596558,
+      "loss": 4.7151,
+      "step": 820
+    },
+    {
+      "epoch": 1.6061185468451242,
+      "grad_norm": 2.688476324081421,
+      "learning_rate": 0.0009326534947949862,
+      "loss": 4.6307,
+      "step": 840
+    },
+    {
+      "epoch": 1.644359464627151,
+      "grad_norm": 2.8131325244903564,
+      "learning_rate": 0.0009284045039303166,
+      "loss": 4.5571,
+      "step": 860
+    },
+    {
+      "epoch": 1.682600382409178,
+      "grad_norm": 2.68613338470459,
+      "learning_rate": 0.0009241555130656469,
+      "loss": 4.3895,
+      "step": 880
+    },
+    {
+      "epoch": 1.7208413001912046,
+      "grad_norm": 2.662827730178833,
+      "learning_rate": 0.0009199065222009773,
+      "loss": 4.2343,
+      "step": 900
+    },
+    {
+      "epoch": 1.7590822179732313,
+      "grad_norm": 2.5778844356536865,
+      "learning_rate": 0.0009156575313363077,
+      "loss": 4.1223,
+      "step": 920
+    },
+    {
+      "epoch": 1.7973231357552581,
+      "grad_norm": 2.7444920539855957,
+      "learning_rate": 0.000911408540471638,
+      "loss": 4.0716,
+      "step": 940
+    },
+    {
+      "epoch": 1.835564053537285,
+      "grad_norm": 2.7660489082336426,
+      "learning_rate": 0.0009071595496069684,
+      "loss": 3.9948,
+      "step": 960
+    },
+    {
+      "epoch": 1.8738049713193117,
+      "grad_norm": 2.6412644386291504,
+      "learning_rate": 0.0009029105587422988,
+      "loss": 3.9169,
+      "step": 980
+    },
+    {
+      "epoch": 1.9120458891013383,
+      "grad_norm": 2.571195602416992,
+      "learning_rate": 0.0008986615678776291,
+      "loss": 3.7099,
+      "step": 1000
+    },
+    {
+      "epoch": 1.9502868068833652,
+      "grad_norm": 2.6130528450012207,
+      "learning_rate": 0.0008944125770129595,
+      "loss": 3.5425,
+      "step": 1020
+    },
+    {
+      "epoch": 1.988527724665392,
+      "grad_norm": 2.7550861835479736,
+      "learning_rate": 0.0008901635861482899,
+      "loss": 3.5879,
+      "step": 1040
+    },
+    {
+      "epoch": 2.0,
+      "eval_accuracy": 0.9140935082408341,
+      "eval_loss": 1.9992759227752686,
+      "eval_runtime": 203.0806,
+      "eval_samples_per_second": 73.198,
+      "eval_steps_per_second": 73.198,
+      "step": 1046
+    },
+    {
+      "epoch": 2.026768642447419,
+      "grad_norm": 2.4564621448516846,
+      "learning_rate": 0.0008859145952836202,
+      "loss": 3.2225,
+      "step": 1060
+    },
+    {
+      "epoch": 2.0650095602294454,
+      "grad_norm": 2.484492063522339,
+      "learning_rate": 0.0008816656044189504,
+      "loss": 2.9864,
+      "step": 1080
+    },
+    {
+      "epoch": 2.1032504780114722,
+      "grad_norm": 2.467353582382202,
+      "learning_rate": 0.000877416613554281,
+      "loss": 3.0552,
+      "step": 1100
+    },
+    {
+      "epoch": 2.141491395793499,
+      "grad_norm": 2.594728469848633,
+      "learning_rate": 0.0008731676226896112,
+      "loss": 2.9168,
+      "step": 1120
+    },
+    {
+      "epoch": 2.179732313575526,
+      "grad_norm": 2.4294815063476562,
+      "learning_rate": 0.0008689186318249415,
+      "loss": 2.8856,
+      "step": 1140
+    },
+    {
+      "epoch": 2.2179732313575524,
+      "grad_norm": 2.521456480026245,
+      "learning_rate": 0.000864669640960272,
+      "loss": 2.8125,
+      "step": 1160
+    },
+    {
+      "epoch": 2.2562141491395793,
+      "grad_norm": 2.5333268642425537,
+      "learning_rate": 0.0008604206500956023,
+      "loss": 2.8216,
+      "step": 1180
+    },
+    {
+      "epoch": 2.294455066921606,
+      "grad_norm": 2.449925422668457,
+      "learning_rate": 0.0008561716592309326,
+      "loss": 2.7326,
+      "step": 1200
+    },
+    {
+      "epoch": 2.332695984703633,
+      "grad_norm": 2.5326759815216064,
+      "learning_rate": 0.000851922668366263,
+      "loss": 2.5496,
+      "step": 1220
+    },
+    {
+      "epoch": 2.3709369024856595,
+      "grad_norm": 2.4263978004455566,
+      "learning_rate": 0.0008478861270448269,
+      "loss": 2.6125,
+      "step": 1240
+    },
+    {
+      "epoch": 2.4091778202676863,
+      "grad_norm": 2.4790937900543213,
+      "learning_rate": 0.0008436371361801573,
+      "loss": 2.6434,
+      "step": 1260
+    },
+    {
+      "epoch": 2.447418738049713,
+      "grad_norm": 2.52268385887146,
+      "learning_rate": 0.0008393881453154876,
+      "loss": 2.5629,
+      "step": 1280
+    },
+    {
+      "epoch": 2.48565965583174,
+      "grad_norm": 2.4463202953338623,
+      "learning_rate": 0.0008353516039940514,
+      "loss": 2.4858,
+      "step": 1300
+    },
+    {
+      "epoch": 2.5239005736137665,
+      "grad_norm": 2.455935001373291,
+      "learning_rate": 0.0008311026131293817,
+      "loss": 2.431,
+      "step": 1320
+    },
+    {
+      "epoch": 2.5621414913957934,
+      "grad_norm": 2.3563170433044434,
+      "learning_rate": 0.0008268536222647122,
+      "loss": 2.327,
+      "step": 1340
+    },
+    {
+      "epoch": 2.6003824091778203,
+      "grad_norm": 2.4477434158325195,
+      "learning_rate": 0.0008226046314000425,
+      "loss": 2.4019,
+      "step": 1360
+    },
+    {
+      "epoch": 2.638623326959847,
+      "grad_norm": 2.2741150856018066,
+      "learning_rate": 0.0008183556405353728,
+      "loss": 2.2696,
+      "step": 1380
+    },
+    {
+      "epoch": 2.676864244741874,
+      "grad_norm": 2.2840123176574707,
+      "learning_rate": 0.0008141066496707033,
+      "loss": 2.2923,
+      "step": 1400
+    },
+    {
+      "epoch": 2.7151051625239004,
+      "grad_norm": 2.3673081398010254,
+      "learning_rate": 0.0008098576588060336,
+      "loss": 2.2051,
+      "step": 1420
+    },
+    {
+      "epoch": 2.7533460803059273,
+      "grad_norm": 2.2644526958465576,
+      "learning_rate": 0.0008056086679413639,
+      "loss": 2.1845,
+      "step": 1440
+    },
+    {
+      "epoch": 2.791586998087954,
+      "grad_norm": 2.392956256866455,
+      "learning_rate": 0.0008013596770766944,
+      "loss": 2.1307,
+      "step": 1460
+    },
+    {
+      "epoch": 2.8298279158699806,
+      "grad_norm": 2.361642837524414,
+      "learning_rate": 0.0007973231357552581,
+      "loss": 2.0802,
+      "step": 1480
+    },
+    {
+      "epoch": 2.8680688336520075,
+      "grad_norm": 2.3466880321502686,
+      "learning_rate": 0.000793286594433822,
+      "loss": 1.9928,
+      "step": 1500
+    },
+    {
+      "epoch": 2.9063097514340344,
+      "grad_norm": 2.3408572673797607,
+      "learning_rate": 0.0007894625026556193,
+      "loss": 1.9699,
+      "step": 1520
+    },
+    {
+      "epoch": 2.9445506692160612,
+      "grad_norm": 2.3541271686553955,
+      "learning_rate": 0.0007854259613341832,
+      "loss": 1.9908,
+      "step": 1540
+    },
+    {
+      "epoch": 2.982791586998088,
+      "grad_norm": 2.4044065475463867,
+      "learning_rate": 0.0007813894200127471,
+      "loss": 1.9536,
+      "step": 1560
+    },
+    {
+      "epoch": 3.0,
+      "eval_accuracy": 0.9607130844265052,
+      "eval_loss": 0.8233553767204285,
+      "eval_runtime": 419.4359,
+      "eval_samples_per_second": 35.44,
+      "eval_steps_per_second": 35.44,
+      "step": 1569
+    },
+    {
+      "epoch": 3.0210325047801145,
+      "grad_norm": 2.1922998428344727,
+      "learning_rate": 0.0007771404291480774,
+      "loss": 1.8218,
+      "step": 1580
+    },
+    {
+      "epoch": 3.0592734225621414,
+      "grad_norm": NaN,
+      "learning_rate": 0.0007731038878266412,
+      "loss": 1.5913,
+      "step": 1600
+    },
+    {
+      "epoch": 3.0975143403441683,
+      "grad_norm": NaN,
+      "learning_rate": 0.000769492245591672,
+      "loss": 1.6189,
+      "step": 1620
+    },
+    {
+      "epoch": 3.135755258126195,
+      "grad_norm": 2.179269313812256,
+      "learning_rate": 0.0007658806033567028,
+      "loss": 1.5457,
+      "step": 1640
+    },
+    {
+      "epoch": 3.173996175908222,
+      "grad_norm": 2.30602765083313,
+      "learning_rate": 0.0007618440620352667,
+      "loss": 1.5588,
+      "step": 1660
+    },
+    {
+      "epoch": 3.2122370936902485,
+      "grad_norm": 2.100261688232422,
+      "learning_rate": 0.0007578075207138306,
+      "loss": 1.5639,
+      "step": 1680
+    },
+    {
+      "epoch": 3.2504780114722753,
+      "grad_norm": 2.185692548751831,
+      "learning_rate": 0.0007544083280220948,
+      "loss": 1.5383,
+      "step": 1700
+    },
+    {
+      "epoch": 3.288718929254302,
+      "grad_norm": 2.014509439468384,
+      "learning_rate": 0.0007503717867006586,
+      "loss": 1.5617,
+      "step": 1720
+    },
+    {
+      "epoch": 3.3269598470363286,
+      "grad_norm": 1.757117509841919,
+      "learning_rate": 0.0007467601444656894,
+      "loss": 1.6206,
+      "step": 1740
+    },
+    {
+      "epoch": 3.3652007648183555,
+      "grad_norm": 1.4908952713012695,
+      "learning_rate": 0.0007435734013171872,
+      "loss": 1.6565,
+      "step": 1760
+    },
+    {
+      "epoch": 3.4034416826003824,
+      "grad_norm": 1.310102939605713,
+      "learning_rate": 0.000739961759082218,
+      "loss": 1.6018,
+      "step": 1780
+    },
+    {
+      "epoch": 3.4416826003824093,
+      "grad_norm": 1.3842021226882935,
+      "learning_rate": 0.0007359252177607819,
+      "loss": 1.5565,
+      "step": 1800
+    },
+    {
+      "epoch": 3.479923518164436,
+      "grad_norm": 0.23385359346866608,
+      "learning_rate": 0.0007321011259825792,
+      "loss": 1.6196,
+      "step": 1820
+    },
+    {
+      "epoch": 3.5181644359464626,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0007291268323773104,
+      "loss": 1.5137,
+      "step": 1840
+    },
+    {
+      "epoch": 3.5564053537284894,
+      "grad_norm": NaN,
+      "learning_rate": 0.0007253027405991077,
+      "loss": 1.5645,
+      "step": 1860
+    },
+    {
+      "epoch": 3.5946462715105163,
+      "grad_norm": NaN,
+      "learning_rate": 0.0007216910983641384,
+      "loss": 1.5224,
+      "step": 1880
+    },
+    {
+      "epoch": 3.632887189292543,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0007185043552156363,
+      "loss": 1.4859,
+      "step": 1900
+    },
+    {
+      "epoch": 3.67112810707457,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0007155300616103676,
+      "loss": 1.5356,
+      "step": 1920
+    },
+    {
+      "epoch": 3.7093690248565965,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0007125557680050988,
+      "loss": 1.5673,
+      "step": 1940
+    },
+    {
+      "epoch": 3.7476099426386233,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00070958147439983,
+      "loss": 1.4688,
+      "step": 1960
+    },
+    {
+      "epoch": 3.78585086042065,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0007061822817080943,
+      "loss": 1.5848,
+      "step": 1980
+    },
+    {
+      "epoch": 3.8240917782026767,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0007036328871892926,
+      "loss": 1.5157,
+      "step": 2000
+    },
+    {
+      "epoch": 3.8623326959847035,
+      "grad_norm": NaN,
+      "learning_rate": 0.0007006585935840239,
+      "loss": 1.5408,
+      "step": 2020
+    },
+    {
+      "epoch": 3.9005736137667304,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0006978967495219886,
+      "loss": 1.4978,
+      "step": 2040
+    },
+    {
+      "epoch": 3.9388145315487573,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0006949224559167198,
+      "loss": 1.557,
+      "step": 2060
+    },
+    {
+      "epoch": 3.977055449330784,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0006921606118546845,
+      "loss": 1.5008,
+      "step": 2080
+    },
+    {
+      "epoch": 4.0,
+      "eval_accuracy": 0.9616548940464178,
+      "eval_loss": 0.6434142589569092,
+      "eval_runtime": 182.6336,
+      "eval_samples_per_second": 81.392,
+      "eval_steps_per_second": 81.392,
+      "step": 2092
+    },
+    {
+      "epoch": 4.015296367112811,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006887614191629488,
+      "loss": 1.4216,
+      "step": 2100
+    },
+    {
+      "epoch": 4.053537284894838,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0006859995751009136,
+      "loss": 1.3752,
+      "step": 2120
+    },
+    {
+      "epoch": 4.091778202676864,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0006828128319524112,
+      "loss": 1.3751,
+      "step": 2140
+    },
+    {
+      "epoch": 4.130019120458891,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006794136392606756,
+      "loss": 1.3608,
+      "step": 2160
+    },
+    {
+      "epoch": 4.168260038240918,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006766517951986402,
+      "loss": 1.3152,
+      "step": 2180
+    },
+    {
+      "epoch": 4.2065009560229445,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0006736775015933716,
+      "loss": 1.3721,
+      "step": 2200
+    },
+    {
+      "epoch": 4.244741873804971,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 1.3709,
+      "step": 2220
+    },
+    {
+      "epoch": 4.282982791586998,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2240
+    },
+    {
+      "epoch": 4.321223709369025,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2260
+    },
+    {
+      "epoch": 4.359464627151052,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2280
+    },
+    {
+      "epoch": 4.397705544933078,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2300
+    },
+    {
+      "epoch": 4.435946462715105,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2320
+    },
+    {
+      "epoch": 4.474187380497132,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2340
+    },
+    {
+      "epoch": 4.512428298279159,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2360
+    },
+    {
+      "epoch": 4.550669216061186,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2380
+    },
+    {
+      "epoch": 4.588910133843212,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2400
+    },
+    {
+      "epoch": 4.627151051625239,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2420
+    },
+    {
+      "epoch": 4.665391969407266,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2440
+    },
+    {
+      "epoch": 4.7036328871892925,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2460
+    },
+    {
+      "epoch": 4.741873804971319,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2480
+    },
+    {
+      "epoch": 4.780114722753346,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2500
+    },
+    {
+      "epoch": 4.818355640535373,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2520
+    },
+    {
+      "epoch": 4.8565965583174,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2540
+    },
+    {
+      "epoch": 4.894837476099426,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2560
+    },
+    {
+      "epoch": 4.933078393881453,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2580
+    },
+    {
+      "epoch": 4.97131931166348,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2600
+    },
+    {
+      "epoch": 5.0,
+      "eval_accuracy": 0.0005381769256643121,
+      "eval_loss": NaN,
+      "eval_runtime": 180.7668,
+      "eval_samples_per_second": 82.233,
+      "eval_steps_per_second": 82.233,
+      "step": 2615
+    },
+    {
+      "epoch": 5.009560229445507,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2620
+    },
+    {
+      "epoch": 5.047801147227533,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2640
+    },
+    {
+      "epoch": 5.08604206500956,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2660
+    },
+    {
+      "epoch": 5.124282982791587,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2680
+    },
+    {
+      "epoch": 5.162523900573614,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2700
+    },
+    {
+      "epoch": 5.2007648183556405,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2720
+    },
+    {
+      "epoch": 5.239005736137667,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2740
+    },
+    {
+      "epoch": 5.277246653919694,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2760
+    },
+    {
+      "epoch": 5.315487571701721,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2780
+    },
+    {
+      "epoch": 5.353728489483748,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2800
+    },
+    {
+      "epoch": 5.3919694072657744,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2820
+    },
+    {
+      "epoch": 5.430210325047801,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2840
+    },
+    {
+      "epoch": 5.468451242829828,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2860
+    },
+    {
+      "epoch": 5.506692160611855,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2880
+    },
+    {
+      "epoch": 5.544933078393882,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2900
+    },
+    {
+      "epoch": 5.583173996175908,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2920
+    },
+    {
+      "epoch": 5.621414913957935,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2940
+    },
+    {
+      "epoch": 5.659655831739962,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2960
+    },
+    {
+      "epoch": 5.6978967495219885,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 2980
+    },
+    {
+      "epoch": 5.736137667304015,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3000
+    },
+    {
+      "epoch": 5.774378585086042,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3020
+    },
+    {
+      "epoch": 5.812619502868069,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3040
+    },
+    {
+      "epoch": 5.850860420650095,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3060
+    },
+    {
+      "epoch": 5.8891013384321225,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3080
+    },
+    {
+      "epoch": 5.927342256214149,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3100
+    },
+    {
+      "epoch": 5.965583173996176,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3120
+    },
+    {
+      "epoch": 6.0,
+      "eval_accuracy": 0.0005381769256643121,
+      "eval_loss": NaN,
+      "eval_runtime": 182.9382,
+      "eval_samples_per_second": 81.257,
+      "eval_steps_per_second": 81.257,
+      "step": 3138
+    },
+    {
+      "epoch": 6.003824091778203,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3140
+    },
+    {
+      "epoch": 6.042065009560229,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3160
+    },
+    {
+      "epoch": 6.080305927342256,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3180
+    },
+    {
+      "epoch": 6.118546845124283,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3200
+    },
+    {
+      "epoch": 6.15678776290631,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3220
+    },
+    {
+      "epoch": 6.195028680688337,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3240
+    },
+    {
+      "epoch": 6.233269598470363,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3260
+    },
+    {
+      "epoch": 6.27151051625239,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3280
+    },
+    {
+      "epoch": 6.309751434034417,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3300
+    },
+    {
+      "epoch": 6.347992351816444,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3320
+    },
+    {
+      "epoch": 6.3862332695984705,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3340
+    },
+    {
+      "epoch": 6.424474187380497,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3360
+    },
+    {
+      "epoch": 6.462715105162524,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3380
+    },
+    {
+      "epoch": 6.500956022944551,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3400
+    },
+    {
+      "epoch": 6.539196940726577,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3420
+    },
+    {
+      "epoch": 6.577437858508604,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3440
+    },
+    {
+      "epoch": 6.615678776290631,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3460
+    },
+    {
+      "epoch": 6.653919694072657,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3480
+    },
+    {
+      "epoch": 6.692160611854685,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3500
+    },
+    {
+      "epoch": 6.730401529636711,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3520
+    },
+    {
+      "epoch": 6.768642447418738,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3540
+    },
+    {
+      "epoch": 6.806883365200765,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3560
+    },
+    {
+      "epoch": 6.845124282982791,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3580
+    },
+    {
+      "epoch": 6.8833652007648185,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3600
+    },
+    {
+      "epoch": 6.921606118546845,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3620
+    },
+    {
+      "epoch": 6.959847036328872,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3640
+    },
+    {
+      "epoch": 6.998087954110899,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3660
+    },
+    {
+      "epoch": 7.0,
+      "eval_accuracy": 0.0005381769256643121,
+      "eval_loss": NaN,
+      "eval_runtime": 188.9144,
+      "eval_samples_per_second": 78.686,
+      "eval_steps_per_second": 78.686,
+      "step": 3661
+    },
+    {
+      "epoch": 7.036328871892925,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3680
+    },
+    {
+      "epoch": 7.074569789674952,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3700
+    },
+    {
+      "epoch": 7.112810707456979,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3720
+    },
+    {
+      "epoch": 7.151051625239006,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3740
+    },
+    {
+      "epoch": 7.189292543021033,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3760
+    },
+    {
+      "epoch": 7.227533460803059,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3780
+    },
+    {
+      "epoch": 7.265774378585086,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3800
+    },
+    {
+      "epoch": 7.304015296367113,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3820
+    },
+    {
+      "epoch": 7.342256214149139,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3840
+    },
+    {
+      "epoch": 7.3804971319311665,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3860
+    },
+    {
+      "epoch": 7.418738049713193,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3880
+    },
+    {
+      "epoch": 7.45697896749522,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3900
+    },
+    {
+      "epoch": 7.495219885277247,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3920
+    },
+    {
+      "epoch": 7.533460803059273,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3940
+    },
+    {
+      "epoch": 7.5717017208413,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3960
+    },
+    {
+      "epoch": 7.609942638623327,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 3980
+    },
+    {
+      "epoch": 7.648183556405353,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4000
+    },
+    {
+      "epoch": 7.686424474187381,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4020
+    },
+    {
+      "epoch": 7.724665391969407,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4040
+    },
+    {
+      "epoch": 7.762906309751434,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4060
+    },
+    {
+      "epoch": 7.801147227533461,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4080
+    },
+    {
+      "epoch": 7.839388145315487,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4100
+    },
+    {
+      "epoch": 7.8776290630975145,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4120
+    },
+    {
+      "epoch": 7.915869980879541,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4140
+    },
+    {
+      "epoch": 7.954110898661568,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4160
+    },
+    {
+      "epoch": 7.992351816443595,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4180
+    },
+    {
+      "epoch": 8.0,
+      "eval_accuracy": 0.0005381769256643121,
+      "eval_loss": NaN,
+      "eval_runtime": 167.1036,
+      "eval_samples_per_second": 88.957,
+      "eval_steps_per_second": 88.957,
+      "step": 4184
+    },
+    {
+      "epoch": 8.030592734225621,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4200
+    },
+    {
+      "epoch": 8.068833652007648,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4220
+    },
+    {
+      "epoch": 8.107074569789676,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4240
+    },
+    {
+      "epoch": 8.145315487571702,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4260
+    },
+    {
+      "epoch": 8.183556405353729,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4280
+    },
+    {
+      "epoch": 8.221797323135755,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4300
+    },
+    {
+      "epoch": 8.260038240917781,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4320
+    },
+    {
+      "epoch": 8.29827915869981,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4340
+    },
+    {
+      "epoch": 8.336520076481836,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4360
+    },
+    {
+      "epoch": 8.374760994263863,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4380
+    },
+    {
+      "epoch": 8.413001912045889,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4400
+    },
+    {
+      "epoch": 8.451242829827915,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4420
+    },
+    {
+      "epoch": 8.489483747609942,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4440
+    },
+    {
+      "epoch": 8.52772466539197,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4460
+    },
+    {
+      "epoch": 8.565965583173996,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4480
+    },
+    {
+      "epoch": 8.604206500956023,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4500
+    },
+    {
+      "epoch": 8.64244741873805,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4520
+    },
+    {
+      "epoch": 8.680688336520076,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4540
+    },
+    {
+      "epoch": 8.718929254302104,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4560
+    },
+    {
+      "epoch": 8.75717017208413,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4580
+    },
+    {
+      "epoch": 8.795411089866157,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4600
+    },
+    {
+      "epoch": 8.833652007648183,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4620
+    },
+    {
+      "epoch": 8.87189292543021,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4640
+    },
+    {
+      "epoch": 8.910133843212238,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4660
+    },
+    {
+      "epoch": 8.948374760994264,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4680
+    },
+    {
+      "epoch": 8.98661567877629,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4700
+    },
+    {
+      "epoch": 9.0,
+      "eval_accuracy": 0.0005381769256643121,
+      "eval_loss": NaN,
+      "eval_runtime": 176.3855,
+      "eval_samples_per_second": 84.276,
+      "eval_steps_per_second": 84.276,
+      "step": 4707
+    },
+    {
+      "epoch": 9.024856596558317,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4720
+    },
+    {
+      "epoch": 9.063097514340344,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4740
+    },
+    {
+      "epoch": 9.101338432122372,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4760
+    },
+    {
+      "epoch": 9.139579349904398,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4780
+    },
+    {
+      "epoch": 9.177820267686425,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4800
+    },
+    {
+      "epoch": 9.216061185468451,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4820
+    },
+    {
+      "epoch": 9.254302103250478,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4840
+    },
+    {
+      "epoch": 9.292543021032504,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4860
+    },
+    {
+      "epoch": 9.330783938814532,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4880
+    },
+    {
+      "epoch": 9.369024856596559,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4900
+    },
+    {
+      "epoch": 9.407265774378585,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4920
+    },
+    {
+      "epoch": 9.445506692160611,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4940
+    },
+    {
+      "epoch": 9.483747609942638,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4960
+    },
+    {
+      "epoch": 9.521988527724666,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 4980
+    },
+    {
+      "epoch": 9.560229445506693,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 5000
+    },
+    {
+      "epoch": 9.598470363288719,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 5020
+    },
+    {
+      "epoch": 9.636711281070745,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 5040
+    },
+    {
+      "epoch": 9.674952198852772,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 5060
+    },
+    {
+      "epoch": 9.7131931166348,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 5080
+    },
+    {
+      "epoch": 9.751434034416826,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 5100
+    },
+    {
+      "epoch": 9.789674952198853,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 5120
+    },
+    {
+      "epoch": 9.82791586998088,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 5140
+    },
+    {
+      "epoch": 9.866156787762906,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 5160
+    },
+    {
+      "epoch": 9.904397705544934,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 5180
+    },
+    {
+      "epoch": 9.94263862332696,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 5200
+    },
+    {
+      "epoch": 9.980879541108987,
+      "grad_norm": NaN,
+      "learning_rate": 0.0006717654557042703,
+      "loss": 0.0,
+      "step": 5220
+    },
+    {
+      "epoch": 10.0,
+      "eval_accuracy": 0.0005381769256643121,
+      "eval_loss": NaN,
+      "eval_runtime": 183.845,
+      "eval_samples_per_second": 80.856,
+      "eval_steps_per_second": 80.856,
+      "step": 5230
+    },
+    {
+      "epoch": 10.0,
+      "step": 5230,
+      "total_flos": 8.1484088684544e+18,
+      "train_loss": 1.9844060945237343,
+      "train_runtime": 19376.6921,
+      "train_samples_per_second": 69.04,
+      "train_steps_per_second": 0.27
+    }
+  ],
+  "logging_steps": 20,
+  "max_steps": 5230,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 8.1484088684544e+18,
+  "train_batch_size": 256,
+  "trial_name": null,
+  "trial_params": null
+}