{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.823825503355705, "eval_steps": 500, "global_step": 11500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016778523489932886, "grad_norm": 0.35546875, "learning_rate": 9.999999999999999e-06, "loss": 1.2606, "step": 4 }, { "epoch": 0.003355704697986577, "grad_norm": 0.4140625, "learning_rate": 1.9999999999999998e-05, "loss": 1.3967, "step": 8 }, { "epoch": 0.0050335570469798654, "grad_norm": 0.484375, "learning_rate": 2.9999999999999997e-05, "loss": 1.5441, "step": 12 }, { "epoch": 0.006711409395973154, "grad_norm": 0.4296875, "learning_rate": 3.9999999999999996e-05, "loss": 1.4132, "step": 16 }, { "epoch": 0.008389261744966443, "grad_norm": 0.447265625, "learning_rate": 4.9999999999999996e-05, "loss": 1.2379, "step": 20 }, { "epoch": 0.010067114093959731, "grad_norm": 0.388671875, "learning_rate": 5.9999999999999995e-05, "loss": 1.2866, "step": 24 }, { "epoch": 0.01174496644295302, "grad_norm": 0.40625, "learning_rate": 7e-05, "loss": 1.0653, "step": 28 }, { "epoch": 0.013422818791946308, "grad_norm": 0.490234375, "learning_rate": 7.999999999999999e-05, "loss": 1.1787, "step": 32 }, { "epoch": 0.015100671140939598, "grad_norm": 0.5234375, "learning_rate": 8.999999999999999e-05, "loss": 1.1981, "step": 36 }, { "epoch": 0.016778523489932886, "grad_norm": 0.40234375, "learning_rate": 9.999999999999999e-05, "loss": 1.0779, "step": 40 }, { "epoch": 0.018456375838926176, "grad_norm": 0.5234375, "learning_rate": 0.00010999999999999998, "loss": 1.1127, "step": 44 }, { "epoch": 0.020134228187919462, "grad_norm": 0.38671875, "learning_rate": 0.00011999999999999999, "loss": 0.9777, "step": 48 }, { "epoch": 0.02181208053691275, "grad_norm": 0.5390625, "learning_rate": 0.00013, "loss": 1.1257, "step": 52 }, { "epoch": 0.02348993288590604, "grad_norm": 0.40625, "learning_rate": 0.00014, "loss": 1.129, "step": 56 }, { "epoch": 0.025167785234899327, "grad_norm": 0.38671875, "learning_rate": 0.00015, "loss": 1.1364, "step": 60 }, { "epoch": 0.026845637583892617, "grad_norm": 0.427734375, "learning_rate": 0.00015999999999999999, "loss": 1.2086, "step": 64 }, { "epoch": 0.028523489932885907, "grad_norm": 0.5234375, "learning_rate": 0.00016999999999999999, "loss": 1.1957, "step": 68 }, { "epoch": 0.030201342281879196, "grad_norm": 0.3828125, "learning_rate": 0.00017999999999999998, "loss": 1.1489, "step": 72 }, { "epoch": 0.031879194630872486, "grad_norm": 0.357421875, "learning_rate": 0.00018999999999999998, "loss": 1.0837, "step": 76 }, { "epoch": 0.03355704697986577, "grad_norm": 0.47265625, "learning_rate": 0.00019999999999999998, "loss": 1.1385, "step": 80 }, { "epoch": 0.03523489932885906, "grad_norm": 0.423828125, "learning_rate": 0.00020999999999999998, "loss": 1.114, "step": 84 }, { "epoch": 0.03691275167785235, "grad_norm": 0.447265625, "learning_rate": 0.00021999999999999995, "loss": 1.0247, "step": 88 }, { "epoch": 0.03859060402684564, "grad_norm": 0.4140625, "learning_rate": 0.00023, "loss": 1.0523, "step": 92 }, { "epoch": 0.040268456375838924, "grad_norm": 0.466796875, "learning_rate": 0.00023999999999999998, "loss": 1.0883, "step": 96 }, { "epoch": 0.04194630872483222, "grad_norm": 0.41796875, "learning_rate": 0.00025, "loss": 1.0155, "step": 100 }, { "epoch": 0.0436241610738255, "grad_norm": 0.388671875, "learning_rate": 0.00026, "loss": 1.0089, "step": 104 }, { "epoch": 0.04530201342281879, "grad_norm": 0.40625, "learning_rate": 0.00027, "loss": 1.1565, "step": 108 }, { "epoch": 0.04697986577181208, "grad_norm": 0.41796875, "learning_rate": 0.00028, "loss": 1.0369, "step": 112 }, { "epoch": 0.04865771812080537, "grad_norm": 0.58203125, "learning_rate": 0.00029, "loss": 1.0678, "step": 116 }, { "epoch": 0.050335570469798654, "grad_norm": 0.65625, "learning_rate": 0.0003, "loss": 1.0945, "step": 120 }, { "epoch": 0.05201342281879195, "grad_norm": 0.4140625, "learning_rate": 0.0002999999149416535, "loss": 1.1307, "step": 124 }, { "epoch": 0.053691275167785234, "grad_norm": 0.47265625, "learning_rate": 0.00029999965976671057, "loss": 1.0451, "step": 128 }, { "epoch": 0.05536912751677853, "grad_norm": 0.486328125, "learning_rate": 0.00029999923447546054, "loss": 0.9909, "step": 132 }, { "epoch": 0.05704697986577181, "grad_norm": 0.51171875, "learning_rate": 0.0002999986390683858, "loss": 1.0931, "step": 136 }, { "epoch": 0.0587248322147651, "grad_norm": 0.5, "learning_rate": 0.0002999978735461616, "loss": 1.0946, "step": 140 }, { "epoch": 0.06040268456375839, "grad_norm": 0.47265625, "learning_rate": 0.00029999693790965605, "loss": 1.0028, "step": 144 }, { "epoch": 0.06208053691275168, "grad_norm": 0.5625, "learning_rate": 0.0002999958321599304, "loss": 0.9547, "step": 148 }, { "epoch": 0.06375838926174497, "grad_norm": 0.50390625, "learning_rate": 0.00029999455629823853, "loss": 0.8909, "step": 152 }, { "epoch": 0.06543624161073826, "grad_norm": 0.5234375, "learning_rate": 0.0002999931103260275, "loss": 1.065, "step": 156 }, { "epoch": 0.06711409395973154, "grad_norm": 0.392578125, "learning_rate": 0.0002999914942449372, "loss": 0.9236, "step": 160 }, { "epoch": 0.06879194630872483, "grad_norm": 0.5390625, "learning_rate": 0.00029998970805680044, "loss": 0.9514, "step": 164 }, { "epoch": 0.07046979865771812, "grad_norm": 0.447265625, "learning_rate": 0.00029998775176364297, "loss": 0.9949, "step": 168 }, { "epoch": 0.07214765100671142, "grad_norm": 0.451171875, "learning_rate": 0.00029998562536768343, "loss": 1.0035, "step": 172 }, { "epoch": 0.0738255033557047, "grad_norm": 0.486328125, "learning_rate": 0.00029998332887133335, "loss": 1.0062, "step": 176 }, { "epoch": 0.07550335570469799, "grad_norm": 0.5703125, "learning_rate": 0.0002999808622771973, "loss": 1.0291, "step": 180 }, { "epoch": 0.07718120805369127, "grad_norm": 0.57421875, "learning_rate": 0.0002999782255880726, "loss": 1.0074, "step": 184 }, { "epoch": 0.07885906040268456, "grad_norm": 0.5078125, "learning_rate": 0.0002999754188069496, "loss": 0.8806, "step": 188 }, { "epoch": 0.08053691275167785, "grad_norm": 0.51953125, "learning_rate": 0.0002999724419370114, "loss": 0.959, "step": 192 }, { "epoch": 0.08221476510067115, "grad_norm": 0.5, "learning_rate": 0.0002999692949816343, "loss": 0.8861, "step": 196 }, { "epoch": 0.08389261744966443, "grad_norm": 0.47265625, "learning_rate": 0.00029996597794438706, "loss": 0.9081, "step": 200 }, { "epoch": 0.08557046979865772, "grad_norm": 0.484375, "learning_rate": 0.00029996249082903174, "loss": 0.9518, "step": 204 }, { "epoch": 0.087248322147651, "grad_norm": 0.5703125, "learning_rate": 0.0002999588336395231, "loss": 0.9668, "step": 208 }, { "epoch": 0.08892617449664429, "grad_norm": 0.388671875, "learning_rate": 0.0002999550063800087, "loss": 0.9889, "step": 212 }, { "epoch": 0.09060402684563758, "grad_norm": 0.396484375, "learning_rate": 0.0002999510090548291, "loss": 1.0162, "step": 216 }, { "epoch": 0.09228187919463088, "grad_norm": 0.5859375, "learning_rate": 0.00029994684166851786, "loss": 0.9121, "step": 220 }, { "epoch": 0.09395973154362416, "grad_norm": 0.45703125, "learning_rate": 0.0002999425042258011, "loss": 0.8678, "step": 224 }, { "epoch": 0.09563758389261745, "grad_norm": 0.47265625, "learning_rate": 0.00029993799673159804, "loss": 1.0144, "step": 228 }, { "epoch": 0.09731543624161074, "grad_norm": 0.57421875, "learning_rate": 0.0002999333191910206, "loss": 0.906, "step": 232 }, { "epoch": 0.09899328859060402, "grad_norm": 0.421875, "learning_rate": 0.0002999284716093737, "loss": 0.9312, "step": 236 }, { "epoch": 0.10067114093959731, "grad_norm": 0.48046875, "learning_rate": 0.000299923453992155, "loss": 0.9677, "step": 240 }, { "epoch": 0.10234899328859061, "grad_norm": 0.5234375, "learning_rate": 0.0002999182663450551, "loss": 1.0189, "step": 244 }, { "epoch": 0.1040268456375839, "grad_norm": 0.51953125, "learning_rate": 0.00029991290867395736, "loss": 0.9925, "step": 248 }, { "epoch": 0.10570469798657718, "grad_norm": 0.41015625, "learning_rate": 0.0002999073809849379, "loss": 0.967, "step": 252 }, { "epoch": 0.10738255033557047, "grad_norm": 0.50390625, "learning_rate": 0.00029990168328426574, "loss": 0.8949, "step": 256 }, { "epoch": 0.10906040268456375, "grad_norm": 0.51953125, "learning_rate": 0.00029989581557840283, "loss": 0.9469, "step": 260 }, { "epoch": 0.11073825503355705, "grad_norm": 0.484375, "learning_rate": 0.00029988977787400367, "loss": 1.1107, "step": 264 }, { "epoch": 0.11241610738255034, "grad_norm": 0.5, "learning_rate": 0.0002998835701779157, "loss": 0.8897, "step": 268 }, { "epoch": 0.11409395973154363, "grad_norm": 0.46484375, "learning_rate": 0.0002998771924971792, "loss": 0.9203, "step": 272 }, { "epoch": 0.11577181208053691, "grad_norm": 0.38671875, "learning_rate": 0.00029987064483902716, "loss": 0.8509, "step": 276 }, { "epoch": 0.1174496644295302, "grad_norm": 0.57421875, "learning_rate": 0.00029986392721088537, "loss": 0.6903, "step": 280 }, { "epoch": 0.11912751677852348, "grad_norm": 0.46875, "learning_rate": 0.00029985703962037225, "loss": 0.8941, "step": 284 }, { "epoch": 0.12080536912751678, "grad_norm": 0.365234375, "learning_rate": 0.0002998499820752992, "loss": 0.9462, "step": 288 }, { "epoch": 0.12248322147651007, "grad_norm": 0.55078125, "learning_rate": 0.00029984275458367027, "loss": 0.9579, "step": 292 }, { "epoch": 0.12416107382550336, "grad_norm": 0.400390625, "learning_rate": 0.0002998353571536822, "loss": 0.975, "step": 296 }, { "epoch": 0.12583892617449666, "grad_norm": 0.53125, "learning_rate": 0.00029982778979372447, "loss": 0.8866, "step": 300 }, { "epoch": 0.12751677852348994, "grad_norm": 0.443359375, "learning_rate": 0.0002998200525123794, "loss": 0.8697, "step": 304 }, { "epoch": 0.12919463087248323, "grad_norm": 0.73046875, "learning_rate": 0.00029981214531842184, "loss": 0.9911, "step": 308 }, { "epoch": 0.13087248322147652, "grad_norm": 0.53515625, "learning_rate": 0.0002998040682208195, "loss": 0.9035, "step": 312 }, { "epoch": 0.1325503355704698, "grad_norm": 0.466796875, "learning_rate": 0.00029979582122873264, "loss": 0.8463, "step": 316 }, { "epoch": 0.1342281879194631, "grad_norm": 0.55078125, "learning_rate": 0.0002997874043515143, "loss": 0.9234, "step": 320 }, { "epoch": 0.13590604026845637, "grad_norm": 0.56640625, "learning_rate": 0.00029977881759871014, "loss": 0.8061, "step": 324 }, { "epoch": 0.13758389261744966, "grad_norm": 0.53515625, "learning_rate": 0.00029977006098005844, "loss": 0.927, "step": 328 }, { "epoch": 0.13926174496644295, "grad_norm": 0.5078125, "learning_rate": 0.00029976113450549036, "loss": 0.9292, "step": 332 }, { "epoch": 0.14093959731543623, "grad_norm": 0.6171875, "learning_rate": 0.00029975203818512935, "loss": 0.8693, "step": 336 }, { "epoch": 0.14261744966442952, "grad_norm": 0.5390625, "learning_rate": 0.0002997427720292917, "loss": 0.88, "step": 340 }, { "epoch": 0.14429530201342283, "grad_norm": 0.640625, "learning_rate": 0.00029973333604848624, "loss": 0.8712, "step": 344 }, { "epoch": 0.14597315436241612, "grad_norm": 0.443359375, "learning_rate": 0.0002997237302534145, "loss": 0.905, "step": 348 }, { "epoch": 0.1476510067114094, "grad_norm": 0.67578125, "learning_rate": 0.0002997139546549704, "loss": 0.8432, "step": 352 }, { "epoch": 0.1493288590604027, "grad_norm": 0.61328125, "learning_rate": 0.0002997040092642407, "loss": 0.8936, "step": 356 }, { "epoch": 0.15100671140939598, "grad_norm": 0.470703125, "learning_rate": 0.0002996938940925045, "loss": 0.9304, "step": 360 }, { "epoch": 0.15268456375838926, "grad_norm": 0.5703125, "learning_rate": 0.0002996836091512335, "loss": 0.8782, "step": 364 }, { "epoch": 0.15436241610738255, "grad_norm": 0.6171875, "learning_rate": 0.00029967315445209193, "loss": 0.8652, "step": 368 }, { "epoch": 0.15604026845637584, "grad_norm": 0.49609375, "learning_rate": 0.00029966253000693674, "loss": 0.9251, "step": 372 }, { "epoch": 0.15771812080536912, "grad_norm": 0.462890625, "learning_rate": 0.0002996517358278171, "loss": 0.8218, "step": 376 }, { "epoch": 0.1593959731543624, "grad_norm": 0.63671875, "learning_rate": 0.00029964077192697487, "loss": 0.946, "step": 380 }, { "epoch": 0.1610738255033557, "grad_norm": 0.5078125, "learning_rate": 0.0002996296383168443, "loss": 0.8872, "step": 384 }, { "epoch": 0.16275167785234898, "grad_norm": 0.6171875, "learning_rate": 0.00029961833501005223, "loss": 0.8523, "step": 388 }, { "epoch": 0.1644295302013423, "grad_norm": 0.59375, "learning_rate": 0.0002996068620194177, "loss": 0.8799, "step": 392 }, { "epoch": 0.16610738255033558, "grad_norm": 0.50390625, "learning_rate": 0.00029959521935795253, "loss": 0.8898, "step": 396 }, { "epoch": 0.16778523489932887, "grad_norm": 0.625, "learning_rate": 0.0002995834070388607, "loss": 0.9137, "step": 400 }, { "epoch": 0.16946308724832215, "grad_norm": 0.578125, "learning_rate": 0.00029957142507553873, "loss": 0.832, "step": 404 }, { "epoch": 0.17114093959731544, "grad_norm": 0.44921875, "learning_rate": 0.0002995592734815755, "loss": 0.9506, "step": 408 }, { "epoch": 0.17281879194630873, "grad_norm": 0.58984375, "learning_rate": 0.00029954695227075225, "loss": 0.8445, "step": 412 }, { "epoch": 0.174496644295302, "grad_norm": 0.45703125, "learning_rate": 0.00029953446145704256, "loss": 0.9448, "step": 416 }, { "epoch": 0.1761744966442953, "grad_norm": 0.5390625, "learning_rate": 0.0002995218010546125, "loss": 0.8636, "step": 420 }, { "epoch": 0.17785234899328858, "grad_norm": 0.53125, "learning_rate": 0.0002995089710778203, "loss": 0.8672, "step": 424 }, { "epoch": 0.17953020134228187, "grad_norm": 0.546875, "learning_rate": 0.00029949597154121666, "loss": 0.8018, "step": 428 }, { "epoch": 0.18120805369127516, "grad_norm": 0.54296875, "learning_rate": 0.0002994828024595444, "loss": 0.9538, "step": 432 }, { "epoch": 0.18288590604026847, "grad_norm": 0.484375, "learning_rate": 0.00029946946384773877, "loss": 0.946, "step": 436 }, { "epoch": 0.18456375838926176, "grad_norm": 0.59765625, "learning_rate": 0.00029945595572092727, "loss": 0.8828, "step": 440 }, { "epoch": 0.18624161073825504, "grad_norm": 0.58984375, "learning_rate": 0.0002994422780944296, "loss": 0.9043, "step": 444 }, { "epoch": 0.18791946308724833, "grad_norm": 0.51953125, "learning_rate": 0.00029942843098375765, "loss": 0.841, "step": 448 }, { "epoch": 0.18959731543624161, "grad_norm": 0.45703125, "learning_rate": 0.0002994144144046157, "loss": 0.7846, "step": 452 }, { "epoch": 0.1912751677852349, "grad_norm": 0.48828125, "learning_rate": 0.00029940022837290003, "loss": 0.7902, "step": 456 }, { "epoch": 0.1929530201342282, "grad_norm": 0.4609375, "learning_rate": 0.0002993858729046992, "loss": 0.8384, "step": 460 }, { "epoch": 0.19463087248322147, "grad_norm": 0.4765625, "learning_rate": 0.0002993713480162939, "loss": 0.8327, "step": 464 }, { "epoch": 0.19630872483221476, "grad_norm": 0.48046875, "learning_rate": 0.000299356653724157, "loss": 0.8456, "step": 468 }, { "epoch": 0.19798657718120805, "grad_norm": 0.419921875, "learning_rate": 0.00029934179004495345, "loss": 0.7675, "step": 472 }, { "epoch": 0.19966442953020133, "grad_norm": 0.51171875, "learning_rate": 0.0002993267569955403, "loss": 0.7727, "step": 476 }, { "epoch": 0.20134228187919462, "grad_norm": 0.6484375, "learning_rate": 0.0002993115545929667, "loss": 0.8757, "step": 480 }, { "epoch": 0.20302013422818793, "grad_norm": 0.462890625, "learning_rate": 0.0002992961828544738, "loss": 0.7539, "step": 484 }, { "epoch": 0.20469798657718122, "grad_norm": 0.515625, "learning_rate": 0.000299280641797495, "loss": 0.8095, "step": 488 }, { "epoch": 0.2063758389261745, "grad_norm": 0.5234375, "learning_rate": 0.00029926493143965553, "loss": 0.7628, "step": 492 }, { "epoch": 0.2080536912751678, "grad_norm": 0.47265625, "learning_rate": 0.00029924905179877265, "loss": 0.8208, "step": 496 }, { "epoch": 0.20973154362416108, "grad_norm": 0.60546875, "learning_rate": 0.0002992330028928557, "loss": 0.8971, "step": 500 }, { "epoch": 0.21140939597315436, "grad_norm": 0.53125, "learning_rate": 0.00029921678474010584, "loss": 0.8972, "step": 504 }, { "epoch": 0.21308724832214765, "grad_norm": 0.578125, "learning_rate": 0.00029920039735891634, "loss": 0.8499, "step": 508 }, { "epoch": 0.21476510067114093, "grad_norm": 0.59375, "learning_rate": 0.00029918384076787233, "loss": 0.8417, "step": 512 }, { "epoch": 0.21644295302013422, "grad_norm": 0.546875, "learning_rate": 0.00029916711498575077, "loss": 0.9253, "step": 516 }, { "epoch": 0.2181208053691275, "grad_norm": 0.51953125, "learning_rate": 0.00029915022003152054, "loss": 0.841, "step": 520 }, { "epoch": 0.2197986577181208, "grad_norm": 0.4453125, "learning_rate": 0.0002991331559243425, "loss": 0.7593, "step": 524 }, { "epoch": 0.2214765100671141, "grad_norm": 0.546875, "learning_rate": 0.00029911592268356906, "loss": 0.8801, "step": 528 }, { "epoch": 0.2231543624161074, "grad_norm": 0.53125, "learning_rate": 0.0002990985203287448, "loss": 0.836, "step": 532 }, { "epoch": 0.22483221476510068, "grad_norm": 0.7265625, "learning_rate": 0.0002990809488796059, "loss": 0.8607, "step": 536 }, { "epoch": 0.22651006711409397, "grad_norm": 0.46875, "learning_rate": 0.0002990632083560803, "loss": 0.7403, "step": 540 }, { "epoch": 0.22818791946308725, "grad_norm": 0.59765625, "learning_rate": 0.00029904529877828776, "loss": 0.7487, "step": 544 }, { "epoch": 0.22986577181208054, "grad_norm": 0.51953125, "learning_rate": 0.00029902722016653967, "loss": 0.8129, "step": 548 }, { "epoch": 0.23154362416107382, "grad_norm": 0.45703125, "learning_rate": 0.00029900897254133926, "loss": 0.7163, "step": 552 }, { "epoch": 0.2332214765100671, "grad_norm": 0.5390625, "learning_rate": 0.0002989905559233814, "loss": 0.8498, "step": 556 }, { "epoch": 0.2348993288590604, "grad_norm": 0.53515625, "learning_rate": 0.0002989719703335525, "loss": 0.8468, "step": 560 }, { "epoch": 0.23657718120805368, "grad_norm": 0.71875, "learning_rate": 0.00029895321579293064, "loss": 0.836, "step": 564 }, { "epoch": 0.23825503355704697, "grad_norm": 0.4765625, "learning_rate": 0.0002989342923227857, "loss": 0.8005, "step": 568 }, { "epoch": 0.23993288590604026, "grad_norm": 0.5546875, "learning_rate": 0.00029891519994457887, "loss": 0.6559, "step": 572 }, { "epoch": 0.24161073825503357, "grad_norm": 0.55078125, "learning_rate": 0.00029889593867996316, "loss": 0.8039, "step": 576 }, { "epoch": 0.24328859060402686, "grad_norm": 0.55078125, "learning_rate": 0.00029887650855078287, "loss": 0.7879, "step": 580 }, { "epoch": 0.24496644295302014, "grad_norm": 0.5234375, "learning_rate": 0.0002988569095790741, "loss": 0.8427, "step": 584 }, { "epoch": 0.24664429530201343, "grad_norm": 0.55859375, "learning_rate": 0.000298837141787064, "loss": 0.7655, "step": 588 }, { "epoch": 0.2483221476510067, "grad_norm": 0.4453125, "learning_rate": 0.0002988172051971717, "loss": 0.719, "step": 592 }, { "epoch": 0.25, "grad_norm": 0.5, "learning_rate": 0.0002987970998320073, "loss": 0.7523, "step": 596 }, { "epoch": 0.2516778523489933, "grad_norm": 0.6015625, "learning_rate": 0.00029877682571437264, "loss": 0.862, "step": 600 }, { "epoch": 0.2533557046979866, "grad_norm": 0.546875, "learning_rate": 0.00029875638286726086, "loss": 0.7467, "step": 604 }, { "epoch": 0.2550335570469799, "grad_norm": 0.52734375, "learning_rate": 0.0002987357713138564, "loss": 0.7376, "step": 608 }, { "epoch": 0.25671140939597314, "grad_norm": 0.66015625, "learning_rate": 0.0002987149910775349, "loss": 0.8816, "step": 612 }, { "epoch": 0.25838926174496646, "grad_norm": 0.60546875, "learning_rate": 0.0002986940421818637, "loss": 0.6839, "step": 616 }, { "epoch": 0.2600671140939597, "grad_norm": 0.6796875, "learning_rate": 0.00029867292465060103, "loss": 0.8829, "step": 620 }, { "epoch": 0.26174496644295303, "grad_norm": 0.5546875, "learning_rate": 0.00029865163850769657, "loss": 0.7537, "step": 624 }, { "epoch": 0.2634228187919463, "grad_norm": 0.6484375, "learning_rate": 0.00029863018377729116, "loss": 0.8391, "step": 628 }, { "epoch": 0.2651006711409396, "grad_norm": 0.50390625, "learning_rate": 0.0002986085604837169, "loss": 0.8109, "step": 632 }, { "epoch": 0.26677852348993286, "grad_norm": 0.50390625, "learning_rate": 0.0002985867686514969, "loss": 0.853, "step": 636 }, { "epoch": 0.2684563758389262, "grad_norm": 0.56640625, "learning_rate": 0.00029856480830534564, "loss": 0.7718, "step": 640 }, { "epoch": 0.2701342281879195, "grad_norm": 0.54296875, "learning_rate": 0.0002985426794701685, "loss": 0.7674, "step": 644 }, { "epoch": 0.27181208053691275, "grad_norm": 0.57421875, "learning_rate": 0.0002985203821710621, "loss": 0.8179, "step": 648 }, { "epoch": 0.27348993288590606, "grad_norm": 0.54296875, "learning_rate": 0.000298497916433314, "loss": 0.789, "step": 652 }, { "epoch": 0.2751677852348993, "grad_norm": 0.61328125, "learning_rate": 0.000298475282282403, "loss": 0.7973, "step": 656 }, { "epoch": 0.27684563758389263, "grad_norm": 0.66796875, "learning_rate": 0.00029845247974399854, "loss": 0.7885, "step": 660 }, { "epoch": 0.2785234899328859, "grad_norm": 0.796875, "learning_rate": 0.0002984295088439614, "loss": 0.8043, "step": 664 }, { "epoch": 0.2802013422818792, "grad_norm": 0.609375, "learning_rate": 0.00029840636960834304, "loss": 0.746, "step": 668 }, { "epoch": 0.28187919463087246, "grad_norm": 0.5859375, "learning_rate": 0.000298383062063386, "loss": 0.7126, "step": 672 }, { "epoch": 0.2835570469798658, "grad_norm": 0.4921875, "learning_rate": 0.00029835958623552355, "loss": 0.8304, "step": 676 }, { "epoch": 0.28523489932885904, "grad_norm": 0.50390625, "learning_rate": 0.00029833594215137997, "loss": 0.7288, "step": 680 }, { "epoch": 0.28691275167785235, "grad_norm": 0.58203125, "learning_rate": 0.00029831212983777016, "loss": 0.8086, "step": 684 }, { "epoch": 0.28859060402684567, "grad_norm": 0.484375, "learning_rate": 0.0002982881493217001, "loss": 0.6582, "step": 688 }, { "epoch": 0.2902684563758389, "grad_norm": 0.625, "learning_rate": 0.00029826400063036627, "loss": 0.7031, "step": 692 }, { "epoch": 0.29194630872483224, "grad_norm": 0.60546875, "learning_rate": 0.00029823968379115597, "loss": 0.7417, "step": 696 }, { "epoch": 0.2936241610738255, "grad_norm": 0.67578125, "learning_rate": 0.0002982151988316472, "loss": 0.7739, "step": 700 }, { "epoch": 0.2953020134228188, "grad_norm": 0.51953125, "learning_rate": 0.00029819054577960867, "loss": 0.6847, "step": 704 }, { "epoch": 0.29697986577181207, "grad_norm": 0.55078125, "learning_rate": 0.00029816572466299967, "loss": 0.9169, "step": 708 }, { "epoch": 0.2986577181208054, "grad_norm": 0.5703125, "learning_rate": 0.00029814073550997006, "loss": 0.7464, "step": 712 }, { "epoch": 0.30033557046979864, "grad_norm": 0.50390625, "learning_rate": 0.0002981155783488604, "loss": 0.8583, "step": 716 }, { "epoch": 0.30201342281879195, "grad_norm": 0.67578125, "learning_rate": 0.0002980902532082017, "loss": 0.7949, "step": 720 }, { "epoch": 0.3036912751677852, "grad_norm": 0.400390625, "learning_rate": 0.0002980647601167154, "loss": 0.669, "step": 724 }, { "epoch": 0.3053691275167785, "grad_norm": 0.59765625, "learning_rate": 0.00029803909910331355, "loss": 0.6572, "step": 728 }, { "epoch": 0.3070469798657718, "grad_norm": 0.5234375, "learning_rate": 0.00029801327019709866, "loss": 0.7611, "step": 732 }, { "epoch": 0.3087248322147651, "grad_norm": 0.515625, "learning_rate": 0.0002979872734273635, "loss": 0.9077, "step": 736 }, { "epoch": 0.3104026845637584, "grad_norm": 0.52734375, "learning_rate": 0.00029796110882359127, "loss": 0.6851, "step": 740 }, { "epoch": 0.31208053691275167, "grad_norm": 0.51171875, "learning_rate": 0.0002979347764154557, "loss": 0.8761, "step": 744 }, { "epoch": 0.313758389261745, "grad_norm": 0.65234375, "learning_rate": 0.00029790827623282047, "loss": 0.8134, "step": 748 }, { "epoch": 0.31543624161073824, "grad_norm": 0.57421875, "learning_rate": 0.00029788160830573993, "loss": 0.7714, "step": 752 }, { "epoch": 0.31711409395973156, "grad_norm": 0.578125, "learning_rate": 0.00029785477266445836, "loss": 0.6238, "step": 756 }, { "epoch": 0.3187919463087248, "grad_norm": 0.5625, "learning_rate": 0.0002978277693394104, "loss": 0.7157, "step": 760 }, { "epoch": 0.32046979865771813, "grad_norm": 0.5546875, "learning_rate": 0.0002978005983612208, "loss": 0.72, "step": 764 }, { "epoch": 0.3221476510067114, "grad_norm": 0.5859375, "learning_rate": 0.00029777325976070447, "loss": 0.872, "step": 768 }, { "epoch": 0.3238255033557047, "grad_norm": 0.486328125, "learning_rate": 0.00029774575356886646, "loss": 0.7406, "step": 772 }, { "epoch": 0.32550335570469796, "grad_norm": 0.474609375, "learning_rate": 0.0002977180798169018, "loss": 0.8466, "step": 776 }, { "epoch": 0.3271812080536913, "grad_norm": 0.62109375, "learning_rate": 0.0002976902385361957, "loss": 0.7448, "step": 780 }, { "epoch": 0.3288590604026846, "grad_norm": 0.48828125, "learning_rate": 0.00029766222975832323, "loss": 0.7669, "step": 784 }, { "epoch": 0.33053691275167785, "grad_norm": 0.490234375, "learning_rate": 0.0002976340535150494, "loss": 0.7243, "step": 788 }, { "epoch": 0.33221476510067116, "grad_norm": 0.5, "learning_rate": 0.0002976057098383293, "loss": 0.7737, "step": 792 }, { "epoch": 0.3338926174496644, "grad_norm": 0.52734375, "learning_rate": 0.0002975771987603076, "loss": 0.6729, "step": 796 }, { "epoch": 0.33557046979865773, "grad_norm": 0.578125, "learning_rate": 0.0002975485203133194, "loss": 0.6918, "step": 800 }, { "epoch": 0.337248322147651, "grad_norm": 0.57421875, "learning_rate": 0.0002975196745298889, "loss": 0.7316, "step": 804 }, { "epoch": 0.3389261744966443, "grad_norm": 0.52734375, "learning_rate": 0.0002974906614427307, "loss": 0.7644, "step": 808 }, { "epoch": 0.34060402684563756, "grad_norm": 0.5703125, "learning_rate": 0.00029746148108474865, "loss": 0.7228, "step": 812 }, { "epoch": 0.3422818791946309, "grad_norm": 0.48046875, "learning_rate": 0.00029743213348903665, "loss": 0.7448, "step": 816 }, { "epoch": 0.34395973154362414, "grad_norm": 0.462890625, "learning_rate": 0.0002974026186888781, "loss": 0.7647, "step": 820 }, { "epoch": 0.34563758389261745, "grad_norm": 0.43359375, "learning_rate": 0.00029737293671774613, "loss": 0.7334, "step": 824 }, { "epoch": 0.34731543624161076, "grad_norm": 0.59765625, "learning_rate": 0.0002973430876093033, "loss": 0.6879, "step": 828 }, { "epoch": 0.348993288590604, "grad_norm": 0.58203125, "learning_rate": 0.00029731307139740183, "loss": 0.8587, "step": 832 }, { "epoch": 0.35067114093959734, "grad_norm": 0.4453125, "learning_rate": 0.00029728288811608355, "loss": 0.7396, "step": 836 }, { "epoch": 0.3523489932885906, "grad_norm": 0.62890625, "learning_rate": 0.0002972525377995796, "loss": 0.7245, "step": 840 }, { "epoch": 0.3540268456375839, "grad_norm": 0.5234375, "learning_rate": 0.0002972220204823105, "loss": 0.6993, "step": 844 }, { "epoch": 0.35570469798657717, "grad_norm": 0.494140625, "learning_rate": 0.0002971913361988865, "loss": 0.7608, "step": 848 }, { "epoch": 0.3573825503355705, "grad_norm": 0.54296875, "learning_rate": 0.00029716048498410684, "loss": 0.7186, "step": 852 }, { "epoch": 0.35906040268456374, "grad_norm": 0.67578125, "learning_rate": 0.00029712946687296025, "loss": 0.725, "step": 856 }, { "epoch": 0.36073825503355705, "grad_norm": 0.6015625, "learning_rate": 0.0002970982819006248, "loss": 0.7505, "step": 860 }, { "epoch": 0.3624161073825503, "grad_norm": 0.6875, "learning_rate": 0.00029706693010246756, "loss": 0.8403, "step": 864 }, { "epoch": 0.3640939597315436, "grad_norm": 0.68359375, "learning_rate": 0.0002970354115140452, "loss": 0.6651, "step": 868 }, { "epoch": 0.36577181208053694, "grad_norm": 0.54296875, "learning_rate": 0.00029700372617110307, "loss": 0.8059, "step": 872 }, { "epoch": 0.3674496644295302, "grad_norm": 0.5546875, "learning_rate": 0.00029697187410957605, "loss": 0.6971, "step": 876 }, { "epoch": 0.3691275167785235, "grad_norm": 0.52734375, "learning_rate": 0.00029693985536558774, "loss": 0.5994, "step": 880 }, { "epoch": 0.37080536912751677, "grad_norm": 0.484375, "learning_rate": 0.00029690766997545116, "loss": 0.7647, "step": 884 }, { "epoch": 0.3724832214765101, "grad_norm": 0.54296875, "learning_rate": 0.000296875317975668, "loss": 0.5874, "step": 888 }, { "epoch": 0.37416107382550334, "grad_norm": 0.546875, "learning_rate": 0.00029684279940292907, "loss": 0.6996, "step": 892 }, { "epoch": 0.37583892617449666, "grad_norm": 0.53515625, "learning_rate": 0.00029681011429411394, "loss": 0.8038, "step": 896 }, { "epoch": 0.3775167785234899, "grad_norm": 2.734375, "learning_rate": 0.00029677726268629136, "loss": 0.5272, "step": 900 }, { "epoch": 0.37919463087248323, "grad_norm": 0.59375, "learning_rate": 0.0002967442446167186, "loss": 0.6546, "step": 904 }, { "epoch": 0.3808724832214765, "grad_norm": 0.703125, "learning_rate": 0.00029671106012284183, "loss": 0.6985, "step": 908 }, { "epoch": 0.3825503355704698, "grad_norm": 0.46875, "learning_rate": 0.00029667770924229593, "loss": 0.5746, "step": 912 }, { "epoch": 0.38422818791946306, "grad_norm": 0.6328125, "learning_rate": 0.00029664419201290453, "loss": 0.6048, "step": 916 }, { "epoch": 0.3859060402684564, "grad_norm": 0.7265625, "learning_rate": 0.00029661050847267997, "loss": 0.7791, "step": 920 }, { "epoch": 0.3875838926174497, "grad_norm": 0.56640625, "learning_rate": 0.00029657665865982303, "loss": 0.6703, "step": 924 }, { "epoch": 0.38926174496644295, "grad_norm": 0.67578125, "learning_rate": 0.00029654264261272324, "loss": 0.7163, "step": 928 }, { "epoch": 0.39093959731543626, "grad_norm": 0.66796875, "learning_rate": 0.0002965084603699585, "loss": 0.8066, "step": 932 }, { "epoch": 0.3926174496644295, "grad_norm": 0.59765625, "learning_rate": 0.00029647411197029536, "loss": 0.8337, "step": 936 }, { "epoch": 0.39429530201342283, "grad_norm": 0.5078125, "learning_rate": 0.00029643959745268873, "loss": 0.6238, "step": 940 }, { "epoch": 0.3959731543624161, "grad_norm": 0.5234375, "learning_rate": 0.0002964049168562818, "loss": 0.7123, "step": 944 }, { "epoch": 0.3976510067114094, "grad_norm": 0.6484375, "learning_rate": 0.0002963700702204064, "loss": 0.8487, "step": 948 }, { "epoch": 0.39932885906040266, "grad_norm": 0.51171875, "learning_rate": 0.00029633505758458234, "loss": 0.8297, "step": 952 }, { "epoch": 0.401006711409396, "grad_norm": 0.4921875, "learning_rate": 0.0002962998789885179, "loss": 0.7055, "step": 956 }, { "epoch": 0.40268456375838924, "grad_norm": 0.515625, "learning_rate": 0.0002962645344721095, "loss": 0.7864, "step": 960 }, { "epoch": 0.40436241610738255, "grad_norm": 0.57421875, "learning_rate": 0.00029622902407544185, "loss": 0.7281, "step": 964 }, { "epoch": 0.40604026845637586, "grad_norm": 0.484375, "learning_rate": 0.0002961933478387876, "loss": 0.8818, "step": 968 }, { "epoch": 0.4077181208053691, "grad_norm": 0.58203125, "learning_rate": 0.00029615750580260755, "loss": 0.7393, "step": 972 }, { "epoch": 0.40939597315436244, "grad_norm": 0.51171875, "learning_rate": 0.00029612149800755066, "loss": 0.7776, "step": 976 }, { "epoch": 0.4110738255033557, "grad_norm": 0.458984375, "learning_rate": 0.00029608532449445374, "loss": 0.7856, "step": 980 }, { "epoch": 0.412751677852349, "grad_norm": 0.52734375, "learning_rate": 0.0002960489853043415, "loss": 0.801, "step": 984 }, { "epoch": 0.41442953020134227, "grad_norm": 0.50390625, "learning_rate": 0.0002960124804784267, "loss": 0.6354, "step": 988 }, { "epoch": 0.4161073825503356, "grad_norm": 0.51953125, "learning_rate": 0.00029597581005810986, "loss": 0.6427, "step": 992 }, { "epoch": 0.41778523489932884, "grad_norm": 0.6640625, "learning_rate": 0.00029593897408497933, "loss": 0.835, "step": 996 }, { "epoch": 0.41946308724832215, "grad_norm": 0.72265625, "learning_rate": 0.0002959019726008112, "loss": 0.6732, "step": 1000 }, { "epoch": 0.4211409395973154, "grad_norm": 0.40625, "learning_rate": 0.00029586480564756923, "loss": 0.7744, "step": 1004 }, { "epoch": 0.4228187919463087, "grad_norm": 0.67578125, "learning_rate": 0.00029582747326740495, "loss": 0.7698, "step": 1008 }, { "epoch": 0.42449664429530204, "grad_norm": 0.57421875, "learning_rate": 0.00029578997550265735, "loss": 0.6433, "step": 1012 }, { "epoch": 0.4261744966442953, "grad_norm": 0.490234375, "learning_rate": 0.0002957523123958532, "loss": 0.812, "step": 1016 }, { "epoch": 0.4278523489932886, "grad_norm": 0.65625, "learning_rate": 0.0002957144839897065, "loss": 0.6674, "step": 1020 }, { "epoch": 0.42953020134228187, "grad_norm": 0.6015625, "learning_rate": 0.00029567649032711895, "loss": 0.7687, "step": 1024 }, { "epoch": 0.4312080536912752, "grad_norm": 0.5390625, "learning_rate": 0.0002956383314511795, "loss": 0.6815, "step": 1028 }, { "epoch": 0.43288590604026844, "grad_norm": 0.71875, "learning_rate": 0.0002956000074051648, "loss": 0.7487, "step": 1032 }, { "epoch": 0.43456375838926176, "grad_norm": 0.478515625, "learning_rate": 0.00029556151823253833, "loss": 0.7588, "step": 1036 }, { "epoch": 0.436241610738255, "grad_norm": 0.6328125, "learning_rate": 0.0002955228639769513, "loss": 0.6039, "step": 1040 }, { "epoch": 0.43791946308724833, "grad_norm": 0.55859375, "learning_rate": 0.00029548404468224173, "loss": 0.7912, "step": 1044 }, { "epoch": 0.4395973154362416, "grad_norm": 0.609375, "learning_rate": 0.0002954450603924351, "loss": 0.6296, "step": 1048 }, { "epoch": 0.4412751677852349, "grad_norm": 0.71484375, "learning_rate": 0.00029540591115174406, "loss": 0.7695, "step": 1052 }, { "epoch": 0.4429530201342282, "grad_norm": 0.5234375, "learning_rate": 0.000295366597004568, "loss": 0.7162, "step": 1056 }, { "epoch": 0.4446308724832215, "grad_norm": 0.55078125, "learning_rate": 0.00029532711799549365, "loss": 0.7397, "step": 1060 }, { "epoch": 0.4463087248322148, "grad_norm": 0.546875, "learning_rate": 0.00029528747416929463, "loss": 0.6545, "step": 1064 }, { "epoch": 0.44798657718120805, "grad_norm": 0.69140625, "learning_rate": 0.00029524766557093143, "loss": 0.8298, "step": 1068 }, { "epoch": 0.44966442953020136, "grad_norm": 0.5390625, "learning_rate": 0.0002952076922455514, "loss": 0.6564, "step": 1072 }, { "epoch": 0.4513422818791946, "grad_norm": 0.58984375, "learning_rate": 0.00029516755423848864, "loss": 0.6474, "step": 1076 }, { "epoch": 0.45302013422818793, "grad_norm": 0.484375, "learning_rate": 0.0002951272515952643, "loss": 0.7872, "step": 1080 }, { "epoch": 0.4546979865771812, "grad_norm": 0.81640625, "learning_rate": 0.00029508678436158606, "loss": 0.7447, "step": 1084 }, { "epoch": 0.4563758389261745, "grad_norm": 0.515625, "learning_rate": 0.0002950461525833481, "loss": 0.6762, "step": 1088 }, { "epoch": 0.45805369127516776, "grad_norm": 0.5390625, "learning_rate": 0.0002950053563066315, "loss": 0.7192, "step": 1092 }, { "epoch": 0.4597315436241611, "grad_norm": 0.59765625, "learning_rate": 0.0002949643955777038, "loss": 0.81, "step": 1096 }, { "epoch": 0.46140939597315433, "grad_norm": 0.58203125, "learning_rate": 0.0002949232704430189, "loss": 0.619, "step": 1100 }, { "epoch": 0.46308724832214765, "grad_norm": 0.71484375, "learning_rate": 0.00029488198094921735, "loss": 0.6601, "step": 1104 }, { "epoch": 0.46476510067114096, "grad_norm": 0.5078125, "learning_rate": 0.00029484052714312607, "loss": 0.7876, "step": 1108 }, { "epoch": 0.4664429530201342, "grad_norm": 0.427734375, "learning_rate": 0.0002947989090717583, "loss": 0.7803, "step": 1112 }, { "epoch": 0.46812080536912754, "grad_norm": 0.58203125, "learning_rate": 0.00029475712678231347, "loss": 0.8532, "step": 1116 }, { "epoch": 0.4697986577181208, "grad_norm": 0.50390625, "learning_rate": 0.00029471518032217736, "loss": 0.5511, "step": 1120 }, { "epoch": 0.4714765100671141, "grad_norm": 0.58984375, "learning_rate": 0.000294673069738922, "loss": 0.7981, "step": 1124 }, { "epoch": 0.47315436241610737, "grad_norm": 0.5078125, "learning_rate": 0.0002946307950803054, "loss": 0.7926, "step": 1128 }, { "epoch": 0.4748322147651007, "grad_norm": 0.609375, "learning_rate": 0.0002945883563942719, "loss": 0.7236, "step": 1132 }, { "epoch": 0.47651006711409394, "grad_norm": 0.5859375, "learning_rate": 0.0002945457537289514, "loss": 0.765, "step": 1136 }, { "epoch": 0.47818791946308725, "grad_norm": 0.63671875, "learning_rate": 0.00029450298713266026, "loss": 0.6802, "step": 1140 }, { "epoch": 0.4798657718120805, "grad_norm": 0.5859375, "learning_rate": 0.00029446005665390056, "loss": 0.6268, "step": 1144 }, { "epoch": 0.4815436241610738, "grad_norm": 0.6640625, "learning_rate": 0.00029441696234136017, "loss": 0.535, "step": 1148 }, { "epoch": 0.48322147651006714, "grad_norm": 0.42578125, "learning_rate": 0.00029437370424391287, "loss": 0.6569, "step": 1152 }, { "epoch": 0.4848993288590604, "grad_norm": 0.54296875, "learning_rate": 0.00029433028241061815, "loss": 0.6187, "step": 1156 }, { "epoch": 0.4865771812080537, "grad_norm": 0.66015625, "learning_rate": 0.00029428669689072123, "loss": 0.6756, "step": 1160 }, { "epoch": 0.48825503355704697, "grad_norm": 0.60546875, "learning_rate": 0.0002942429477336529, "loss": 0.7356, "step": 1164 }, { "epoch": 0.4899328859060403, "grad_norm": 0.474609375, "learning_rate": 0.0002941990349890296, "loss": 0.7078, "step": 1168 }, { "epoch": 0.49161073825503354, "grad_norm": 0.6171875, "learning_rate": 0.00029415495870665327, "loss": 0.7541, "step": 1172 }, { "epoch": 0.49328859060402686, "grad_norm": 0.62109375, "learning_rate": 0.0002941107189365112, "loss": 0.6978, "step": 1176 }, { "epoch": 0.4949664429530201, "grad_norm": 0.54296875, "learning_rate": 0.00029406631572877646, "loss": 0.681, "step": 1180 }, { "epoch": 0.4966442953020134, "grad_norm": 0.53125, "learning_rate": 0.000294021749133807, "loss": 0.837, "step": 1184 }, { "epoch": 0.4983221476510067, "grad_norm": 0.57421875, "learning_rate": 0.00029397701920214645, "loss": 0.6888, "step": 1188 }, { "epoch": 0.5, "grad_norm": 0.5625, "learning_rate": 0.0002939321259845234, "loss": 0.7138, "step": 1192 }, { "epoch": 0.5016778523489933, "grad_norm": 0.7890625, "learning_rate": 0.00029388706953185185, "loss": 0.8087, "step": 1196 }, { "epoch": 0.5033557046979866, "grad_norm": 0.55078125, "learning_rate": 0.00029384184989523083, "loss": 0.7137, "step": 1200 }, { "epoch": 0.5050335570469798, "grad_norm": 0.515625, "learning_rate": 0.00029379646712594435, "loss": 0.7399, "step": 1204 }, { "epoch": 0.5067114093959731, "grad_norm": 0.52734375, "learning_rate": 0.00029375092127546164, "loss": 0.7248, "step": 1208 }, { "epoch": 0.5083892617449665, "grad_norm": 0.4921875, "learning_rate": 0.0002937052123954367, "loss": 0.5457, "step": 1212 }, { "epoch": 0.5100671140939598, "grad_norm": 0.54296875, "learning_rate": 0.0002936593405377085, "loss": 0.704, "step": 1216 }, { "epoch": 0.511744966442953, "grad_norm": 0.5859375, "learning_rate": 0.00029361330575430075, "loss": 0.7456, "step": 1220 }, { "epoch": 0.5134228187919463, "grad_norm": 0.4296875, "learning_rate": 0.0002935671080974222, "loss": 0.7451, "step": 1224 }, { "epoch": 0.5151006711409396, "grad_norm": 0.39453125, "learning_rate": 0.000293520747619466, "loss": 0.7053, "step": 1228 }, { "epoch": 0.5167785234899329, "grad_norm": 0.53125, "learning_rate": 0.0002934742243730101, "loss": 0.6954, "step": 1232 }, { "epoch": 0.5184563758389261, "grad_norm": 0.52734375, "learning_rate": 0.000293427538410817, "loss": 0.7431, "step": 1236 }, { "epoch": 0.5201342281879194, "grad_norm": 0.55859375, "learning_rate": 0.0002933806897858339, "loss": 0.5957, "step": 1240 }, { "epoch": 0.5218120805369127, "grad_norm": 0.48046875, "learning_rate": 0.0002933336785511923, "loss": 0.6099, "step": 1244 }, { "epoch": 0.5234899328859061, "grad_norm": 0.53515625, "learning_rate": 0.0002932865047602081, "loss": 0.6993, "step": 1248 }, { "epoch": 0.5251677852348994, "grad_norm": 0.6328125, "learning_rate": 0.0002932391684663818, "loss": 0.7039, "step": 1252 }, { "epoch": 0.5268456375838926, "grad_norm": 0.609375, "learning_rate": 0.0002931916697233978, "loss": 0.7828, "step": 1256 }, { "epoch": 0.5285234899328859, "grad_norm": 0.46484375, "learning_rate": 0.0002931440085851251, "loss": 0.7677, "step": 1260 }, { "epoch": 0.5302013422818792, "grad_norm": 0.44921875, "learning_rate": 0.0002930961851056167, "loss": 0.6152, "step": 1264 }, { "epoch": 0.5318791946308725, "grad_norm": 0.6953125, "learning_rate": 0.0002930481993391098, "loss": 0.7108, "step": 1268 }, { "epoch": 0.5335570469798657, "grad_norm": 0.5390625, "learning_rate": 0.00029300005134002557, "loss": 0.6498, "step": 1272 }, { "epoch": 0.535234899328859, "grad_norm": 0.56640625, "learning_rate": 0.0002929517411629691, "loss": 0.8157, "step": 1276 }, { "epoch": 0.5369127516778524, "grad_norm": 0.458984375, "learning_rate": 0.00029290326886272967, "loss": 0.6123, "step": 1280 }, { "epoch": 0.5385906040268457, "grad_norm": 0.48046875, "learning_rate": 0.0002928546344942802, "loss": 0.7641, "step": 1284 }, { "epoch": 0.540268456375839, "grad_norm": 0.484375, "learning_rate": 0.00029280583811277735, "loss": 0.7601, "step": 1288 }, { "epoch": 0.5419463087248322, "grad_norm": 0.515625, "learning_rate": 0.00029275687977356186, "loss": 0.854, "step": 1292 }, { "epoch": 0.5436241610738255, "grad_norm": 0.58203125, "learning_rate": 0.0002927077595321578, "loss": 0.7001, "step": 1296 }, { "epoch": 0.5453020134228188, "grad_norm": 0.75, "learning_rate": 0.00029265847744427303, "loss": 0.8108, "step": 1300 }, { "epoch": 0.5469798657718121, "grad_norm": 0.58984375, "learning_rate": 0.0002926090335657989, "loss": 0.6053, "step": 1304 }, { "epoch": 0.5486577181208053, "grad_norm": 0.5703125, "learning_rate": 0.0002925594279528103, "loss": 0.7318, "step": 1308 }, { "epoch": 0.5503355704697986, "grad_norm": 0.5703125, "learning_rate": 0.0002925096606615655, "loss": 0.7989, "step": 1312 }, { "epoch": 0.552013422818792, "grad_norm": 0.458984375, "learning_rate": 0.0002924597317485061, "loss": 0.5876, "step": 1316 }, { "epoch": 0.5536912751677853, "grad_norm": 0.74609375, "learning_rate": 0.00029240964127025715, "loss": 0.6705, "step": 1320 }, { "epoch": 0.5553691275167785, "grad_norm": 0.4921875, "learning_rate": 0.0002923593892836268, "loss": 0.5973, "step": 1324 }, { "epoch": 0.5570469798657718, "grad_norm": 0.7890625, "learning_rate": 0.0002923089758456063, "loss": 0.8137, "step": 1328 }, { "epoch": 0.5587248322147651, "grad_norm": 0.54296875, "learning_rate": 0.0002922584010133702, "loss": 0.7684, "step": 1332 }, { "epoch": 0.5604026845637584, "grad_norm": 0.61328125, "learning_rate": 0.00029220766484427594, "loss": 0.5918, "step": 1336 }, { "epoch": 0.5620805369127517, "grad_norm": 0.6875, "learning_rate": 0.000292156767395864, "loss": 0.8552, "step": 1340 }, { "epoch": 0.5637583892617449, "grad_norm": 0.470703125, "learning_rate": 0.0002921057087258577, "loss": 0.8348, "step": 1344 }, { "epoch": 0.5654362416107382, "grad_norm": 0.50390625, "learning_rate": 0.00029205448889216333, "loss": 0.6945, "step": 1348 }, { "epoch": 0.5671140939597316, "grad_norm": 0.5546875, "learning_rate": 0.00029200310795286983, "loss": 0.8099, "step": 1352 }, { "epoch": 0.5687919463087249, "grad_norm": 0.53125, "learning_rate": 0.00029195156596624895, "loss": 0.5569, "step": 1356 }, { "epoch": 0.5704697986577181, "grad_norm": 0.5625, "learning_rate": 0.000291899862990755, "loss": 0.5193, "step": 1360 }, { "epoch": 0.5721476510067114, "grad_norm": 0.45703125, "learning_rate": 0.00029184799908502497, "loss": 0.5083, "step": 1364 }, { "epoch": 0.5738255033557047, "grad_norm": 0.484375, "learning_rate": 0.0002917959743078782, "loss": 0.7218, "step": 1368 }, { "epoch": 0.575503355704698, "grad_norm": 0.6875, "learning_rate": 0.00029174378871831664, "loss": 0.6714, "step": 1372 }, { "epoch": 0.5771812080536913, "grad_norm": 0.640625, "learning_rate": 0.0002916914423755245, "loss": 0.8035, "step": 1376 }, { "epoch": 0.5788590604026845, "grad_norm": 0.4609375, "learning_rate": 0.00029163893533886847, "loss": 0.7344, "step": 1380 }, { "epoch": 0.5805369127516778, "grad_norm": 0.5859375, "learning_rate": 0.00029158626766789727, "loss": 0.6616, "step": 1384 }, { "epoch": 0.5822147651006712, "grad_norm": 0.54296875, "learning_rate": 0.0002915334394223419, "loss": 0.6039, "step": 1388 }, { "epoch": 0.5838926174496645, "grad_norm": 0.4609375, "learning_rate": 0.00029148045066211553, "loss": 0.5724, "step": 1392 }, { "epoch": 0.5855704697986577, "grad_norm": 0.443359375, "learning_rate": 0.0002914273014473133, "loss": 0.6755, "step": 1396 }, { "epoch": 0.587248322147651, "grad_norm": 0.578125, "learning_rate": 0.0002913739918382123, "loss": 0.6544, "step": 1400 }, { "epoch": 0.5889261744966443, "grad_norm": 0.65625, "learning_rate": 0.0002913205218952716, "loss": 0.6945, "step": 1404 }, { "epoch": 0.5906040268456376, "grad_norm": 0.5546875, "learning_rate": 0.00029126689167913205, "loss": 0.6331, "step": 1408 }, { "epoch": 0.5922818791946308, "grad_norm": 0.43359375, "learning_rate": 0.00029121310125061625, "loss": 0.5909, "step": 1412 }, { "epoch": 0.5939597315436241, "grad_norm": 0.50390625, "learning_rate": 0.0002911591506707286, "loss": 0.559, "step": 1416 }, { "epoch": 0.5956375838926175, "grad_norm": 0.59765625, "learning_rate": 0.00029110504000065497, "loss": 0.6884, "step": 1420 }, { "epoch": 0.5973154362416108, "grad_norm": 0.46484375, "learning_rate": 0.00029105076930176297, "loss": 0.6122, "step": 1424 }, { "epoch": 0.5989932885906041, "grad_norm": 0.5625, "learning_rate": 0.00029099633863560157, "loss": 0.6822, "step": 1428 }, { "epoch": 0.6006711409395973, "grad_norm": 0.61328125, "learning_rate": 0.00029094174806390115, "loss": 0.7498, "step": 1432 }, { "epoch": 0.6023489932885906, "grad_norm": 0.53515625, "learning_rate": 0.00029088699764857364, "loss": 0.6331, "step": 1436 }, { "epoch": 0.6040268456375839, "grad_norm": 0.52734375, "learning_rate": 0.0002908320874517119, "loss": 0.7323, "step": 1440 }, { "epoch": 0.6057046979865772, "grad_norm": 0.58203125, "learning_rate": 0.0002907770175355905, "loss": 0.7112, "step": 1444 }, { "epoch": 0.6073825503355704, "grad_norm": 0.57421875, "learning_rate": 0.00029072178796266454, "loss": 0.6785, "step": 1448 }, { "epoch": 0.6090604026845637, "grad_norm": 0.4765625, "learning_rate": 0.00029066639879557065, "loss": 0.6492, "step": 1452 }, { "epoch": 0.610738255033557, "grad_norm": 0.5234375, "learning_rate": 0.0002906108500971263, "loss": 0.6824, "step": 1456 }, { "epoch": 0.6124161073825504, "grad_norm": 0.5546875, "learning_rate": 0.00029055514193032986, "loss": 0.6734, "step": 1460 }, { "epoch": 0.6140939597315436, "grad_norm": 0.46875, "learning_rate": 0.00029049927435836074, "loss": 0.7382, "step": 1464 }, { "epoch": 0.6157718120805369, "grad_norm": 0.59375, "learning_rate": 0.00029044324744457875, "loss": 0.6978, "step": 1468 }, { "epoch": 0.6174496644295302, "grad_norm": 0.55078125, "learning_rate": 0.0002903870612525248, "loss": 0.7792, "step": 1472 }, { "epoch": 0.6191275167785235, "grad_norm": 0.6796875, "learning_rate": 0.0002903307158459202, "loss": 0.6491, "step": 1476 }, { "epoch": 0.6208053691275168, "grad_norm": 0.65625, "learning_rate": 0.000290274211288667, "loss": 0.7949, "step": 1480 }, { "epoch": 0.62248322147651, "grad_norm": 0.6484375, "learning_rate": 0.0002902175476448476, "loss": 0.5399, "step": 1484 }, { "epoch": 0.6241610738255033, "grad_norm": 0.5390625, "learning_rate": 0.0002901607249787249, "loss": 0.64, "step": 1488 }, { "epoch": 0.6258389261744967, "grad_norm": 0.5078125, "learning_rate": 0.0002901037433547421, "loss": 0.6886, "step": 1492 }, { "epoch": 0.62751677852349, "grad_norm": 0.6875, "learning_rate": 0.00029004660283752276, "loss": 0.7332, "step": 1496 }, { "epoch": 0.6291946308724832, "grad_norm": 0.58984375, "learning_rate": 0.0002899893034918705, "loss": 0.5901, "step": 1500 }, { "epoch": 0.6308724832214765, "grad_norm": 0.62109375, "learning_rate": 0.0002899318453827692, "loss": 0.769, "step": 1504 }, { "epoch": 0.6325503355704698, "grad_norm": 0.57421875, "learning_rate": 0.00028987422857538277, "loss": 0.798, "step": 1508 }, { "epoch": 0.6342281879194631, "grad_norm": 0.6484375, "learning_rate": 0.00028981645313505506, "loss": 0.6656, "step": 1512 }, { "epoch": 0.6359060402684564, "grad_norm": 0.55078125, "learning_rate": 0.0002897585191273098, "loss": 0.667, "step": 1516 }, { "epoch": 0.6375838926174496, "grad_norm": 0.6875, "learning_rate": 0.00028970042661785074, "loss": 0.7353, "step": 1520 }, { "epoch": 0.639261744966443, "grad_norm": 0.52734375, "learning_rate": 0.00028964217567256114, "loss": 0.5842, "step": 1524 }, { "epoch": 0.6409395973154363, "grad_norm": 0.44921875, "learning_rate": 0.0002895837663575041, "loss": 0.6155, "step": 1528 }, { "epoch": 0.6426174496644296, "grad_norm": 0.53125, "learning_rate": 0.00028952519873892213, "loss": 0.6728, "step": 1532 }, { "epoch": 0.6442953020134228, "grad_norm": 0.640625, "learning_rate": 0.00028946647288323766, "loss": 0.5326, "step": 1536 }, { "epoch": 0.6459731543624161, "grad_norm": 1.0234375, "learning_rate": 0.00028940758885705225, "loss": 0.5988, "step": 1540 }, { "epoch": 0.6476510067114094, "grad_norm": 0.671875, "learning_rate": 0.00028934854672714693, "loss": 0.8554, "step": 1544 }, { "epoch": 0.6493288590604027, "grad_norm": 0.5703125, "learning_rate": 0.000289289346560482, "loss": 0.5112, "step": 1548 }, { "epoch": 0.6510067114093959, "grad_norm": 0.6328125, "learning_rate": 0.00028922998842419715, "loss": 0.6745, "step": 1552 }, { "epoch": 0.6526845637583892, "grad_norm": 0.55859375, "learning_rate": 0.000289170472385611, "loss": 0.6021, "step": 1556 }, { "epoch": 0.6543624161073825, "grad_norm": 0.78515625, "learning_rate": 0.00028911079851222143, "loss": 0.5791, "step": 1560 }, { "epoch": 0.6560402684563759, "grad_norm": 0.5078125, "learning_rate": 0.00028905096687170526, "loss": 0.6457, "step": 1564 }, { "epoch": 0.6577181208053692, "grad_norm": 0.6171875, "learning_rate": 0.0002889909775319182, "loss": 0.5578, "step": 1568 }, { "epoch": 0.6593959731543624, "grad_norm": 0.6640625, "learning_rate": 0.0002889308305608948, "loss": 0.6746, "step": 1572 }, { "epoch": 0.6610738255033557, "grad_norm": 0.498046875, "learning_rate": 0.0002888705260268484, "loss": 0.5595, "step": 1576 }, { "epoch": 0.662751677852349, "grad_norm": 0.62109375, "learning_rate": 0.0002888100639981712, "loss": 0.661, "step": 1580 }, { "epoch": 0.6644295302013423, "grad_norm": 0.68359375, "learning_rate": 0.0002887494445434338, "loss": 0.5946, "step": 1584 }, { "epoch": 0.6661073825503355, "grad_norm": 0.4921875, "learning_rate": 0.0002886886677313853, "loss": 0.7232, "step": 1588 }, { "epoch": 0.6677852348993288, "grad_norm": 0.6328125, "learning_rate": 0.0002886277336309535, "loss": 0.6253, "step": 1592 }, { "epoch": 0.6694630872483222, "grad_norm": 0.5625, "learning_rate": 0.0002885666423112444, "loss": 0.6611, "step": 1596 }, { "epoch": 0.6711409395973155, "grad_norm": 0.57421875, "learning_rate": 0.0002885053938415424, "loss": 0.745, "step": 1600 }, { "epoch": 0.6728187919463087, "grad_norm": 0.671875, "learning_rate": 0.00028844398829131, "loss": 0.8422, "step": 1604 }, { "epoch": 0.674496644295302, "grad_norm": 0.48046875, "learning_rate": 0.000288382425730188, "loss": 0.5371, "step": 1608 }, { "epoch": 0.6761744966442953, "grad_norm": 0.63671875, "learning_rate": 0.0002883207062279951, "loss": 0.7714, "step": 1612 }, { "epoch": 0.6778523489932886, "grad_norm": 0.76171875, "learning_rate": 0.0002882588298547282, "loss": 0.6151, "step": 1616 }, { "epoch": 0.6795302013422819, "grad_norm": 0.78515625, "learning_rate": 0.0002881967966805619, "loss": 0.6846, "step": 1620 }, { "epoch": 0.6812080536912751, "grad_norm": 0.63671875, "learning_rate": 0.0002881346067758488, "loss": 0.6847, "step": 1624 }, { "epoch": 0.6828859060402684, "grad_norm": 0.515625, "learning_rate": 0.00028807226021111915, "loss": 0.775, "step": 1628 }, { "epoch": 0.6845637583892618, "grad_norm": 0.57421875, "learning_rate": 0.00028800975705708086, "loss": 0.5696, "step": 1632 }, { "epoch": 0.6862416107382551, "grad_norm": 0.52734375, "learning_rate": 0.0002879470973846195, "loss": 0.6315, "step": 1636 }, { "epoch": 0.6879194630872483, "grad_norm": 0.4921875, "learning_rate": 0.0002878842812647981, "loss": 0.7226, "step": 1640 }, { "epoch": 0.6895973154362416, "grad_norm": 0.58203125, "learning_rate": 0.0002878213087688571, "loss": 0.493, "step": 1644 }, { "epoch": 0.6912751677852349, "grad_norm": 0.50390625, "learning_rate": 0.00028775817996821437, "loss": 0.6445, "step": 1648 }, { "epoch": 0.6929530201342282, "grad_norm": 0.458984375, "learning_rate": 0.000287694894934465, "loss": 0.8196, "step": 1652 }, { "epoch": 0.6946308724832215, "grad_norm": 0.52734375, "learning_rate": 0.00028763145373938113, "loss": 0.6076, "step": 1656 }, { "epoch": 0.6963087248322147, "grad_norm": 0.50390625, "learning_rate": 0.0002875678564549123, "loss": 0.6893, "step": 1660 }, { "epoch": 0.697986577181208, "grad_norm": 0.546875, "learning_rate": 0.00028750410315318484, "loss": 0.7341, "step": 1664 }, { "epoch": 0.6996644295302014, "grad_norm": 0.58984375, "learning_rate": 0.0002874401939065021, "loss": 0.6682, "step": 1668 }, { "epoch": 0.7013422818791947, "grad_norm": 0.44921875, "learning_rate": 0.0002873761287873442, "loss": 0.6137, "step": 1672 }, { "epoch": 0.7030201342281879, "grad_norm": 0.53515625, "learning_rate": 0.00028731190786836827, "loss": 0.7249, "step": 1676 }, { "epoch": 0.7046979865771812, "grad_norm": 0.55859375, "learning_rate": 0.0002872475312224079, "loss": 0.609, "step": 1680 }, { "epoch": 0.7063758389261745, "grad_norm": 0.458984375, "learning_rate": 0.00028718299892247325, "loss": 0.5007, "step": 1684 }, { "epoch": 0.7080536912751678, "grad_norm": 0.6875, "learning_rate": 0.0002871183110417514, "loss": 0.7751, "step": 1688 }, { "epoch": 0.709731543624161, "grad_norm": 0.96875, "learning_rate": 0.00028705346765360535, "loss": 0.8296, "step": 1692 }, { "epoch": 0.7114093959731543, "grad_norm": 0.474609375, "learning_rate": 0.0002869884688315748, "loss": 0.7127, "step": 1696 }, { "epoch": 0.7130872483221476, "grad_norm": 0.6171875, "learning_rate": 0.00028692331464937575, "loss": 0.7667, "step": 1700 }, { "epoch": 0.714765100671141, "grad_norm": 0.59765625, "learning_rate": 0.00028685800518090017, "loss": 0.5851, "step": 1704 }, { "epoch": 0.7164429530201343, "grad_norm": 0.47265625, "learning_rate": 0.00028679254050021627, "loss": 0.4802, "step": 1708 }, { "epoch": 0.7181208053691275, "grad_norm": 0.54296875, "learning_rate": 0.00028672692068156837, "loss": 0.7023, "step": 1712 }, { "epoch": 0.7197986577181208, "grad_norm": 0.578125, "learning_rate": 0.00028666114579937654, "loss": 0.4751, "step": 1716 }, { "epoch": 0.7214765100671141, "grad_norm": 0.578125, "learning_rate": 0.000286595215928237, "loss": 0.8282, "step": 1720 }, { "epoch": 0.7231543624161074, "grad_norm": 0.498046875, "learning_rate": 0.00028652913114292133, "loss": 0.6352, "step": 1724 }, { "epoch": 0.7248322147651006, "grad_norm": 0.482421875, "learning_rate": 0.00028646289151837716, "loss": 0.5849, "step": 1728 }, { "epoch": 0.7265100671140939, "grad_norm": 0.5625, "learning_rate": 0.0002863964971297275, "loss": 0.753, "step": 1732 }, { "epoch": 0.7281879194630873, "grad_norm": 0.484375, "learning_rate": 0.0002863299480522711, "loss": 0.5907, "step": 1736 }, { "epoch": 0.7298657718120806, "grad_norm": 0.65234375, "learning_rate": 0.000286263244361482, "loss": 0.533, "step": 1740 }, { "epoch": 0.7315436241610739, "grad_norm": 0.546875, "learning_rate": 0.00028619638613300953, "loss": 0.7116, "step": 1744 }, { "epoch": 0.7332214765100671, "grad_norm": 0.515625, "learning_rate": 0.0002861293734426784, "loss": 0.5301, "step": 1748 }, { "epoch": 0.7348993288590604, "grad_norm": 0.47265625, "learning_rate": 0.0002860622063664884, "loss": 0.6573, "step": 1752 }, { "epoch": 0.7365771812080537, "grad_norm": 0.51171875, "learning_rate": 0.0002859948849806147, "loss": 0.5558, "step": 1756 }, { "epoch": 0.738255033557047, "grad_norm": 0.54296875, "learning_rate": 0.00028592740936140697, "loss": 0.7124, "step": 1760 }, { "epoch": 0.7399328859060402, "grad_norm": 0.61328125, "learning_rate": 0.0002858597795853902, "loss": 0.7029, "step": 1764 }, { "epoch": 0.7416107382550335, "grad_norm": 0.4296875, "learning_rate": 0.000285791995729264, "loss": 0.6177, "step": 1768 }, { "epoch": 0.7432885906040269, "grad_norm": 0.52734375, "learning_rate": 0.00028572405786990294, "loss": 0.5858, "step": 1772 }, { "epoch": 0.7449664429530202, "grad_norm": 0.56640625, "learning_rate": 0.00028565596608435596, "loss": 0.674, "step": 1776 }, { "epoch": 0.7466442953020134, "grad_norm": 0.5078125, "learning_rate": 0.0002855877204498469, "loss": 0.5498, "step": 1780 }, { "epoch": 0.7483221476510067, "grad_norm": 0.61328125, "learning_rate": 0.00028551932104377373, "loss": 0.5353, "step": 1784 }, { "epoch": 0.75, "grad_norm": 0.40625, "learning_rate": 0.0002854507679437091, "loss": 0.591, "step": 1788 }, { "epoch": 0.7516778523489933, "grad_norm": 0.474609375, "learning_rate": 0.00028538206122739976, "loss": 0.6908, "step": 1792 }, { "epoch": 0.7533557046979866, "grad_norm": 0.7109375, "learning_rate": 0.0002853132009727669, "loss": 0.6902, "step": 1796 }, { "epoch": 0.7550335570469798, "grad_norm": 0.5703125, "learning_rate": 0.0002852441872579056, "loss": 0.7647, "step": 1800 }, { "epoch": 0.7567114093959731, "grad_norm": 0.609375, "learning_rate": 0.0002851750201610852, "loss": 0.6071, "step": 1804 }, { "epoch": 0.7583892617449665, "grad_norm": 0.61328125, "learning_rate": 0.0002851056997607487, "loss": 0.5891, "step": 1808 }, { "epoch": 0.7600671140939598, "grad_norm": 0.59375, "learning_rate": 0.0002850362261355134, "loss": 0.6681, "step": 1812 }, { "epoch": 0.761744966442953, "grad_norm": 0.52734375, "learning_rate": 0.0002849665993641699, "loss": 0.6094, "step": 1816 }, { "epoch": 0.7634228187919463, "grad_norm": 0.59375, "learning_rate": 0.00028489681952568286, "loss": 0.6465, "step": 1820 }, { "epoch": 0.7651006711409396, "grad_norm": 0.71484375, "learning_rate": 0.00028482688669919027, "loss": 0.6465, "step": 1824 }, { "epoch": 0.7667785234899329, "grad_norm": 0.4921875, "learning_rate": 0.00028475680096400383, "loss": 0.7379, "step": 1828 }, { "epoch": 0.7684563758389261, "grad_norm": 0.5390625, "learning_rate": 0.0002846865623996085, "loss": 0.5328, "step": 1832 }, { "epoch": 0.7701342281879194, "grad_norm": 0.474609375, "learning_rate": 0.0002846161710856627, "loss": 0.5654, "step": 1836 }, { "epoch": 0.7718120805369127, "grad_norm": 0.5703125, "learning_rate": 0.0002845456271019979, "loss": 0.7195, "step": 1840 }, { "epoch": 0.7734899328859061, "grad_norm": 0.56640625, "learning_rate": 0.0002844749305286189, "loss": 0.7046, "step": 1844 }, { "epoch": 0.7751677852348994, "grad_norm": 0.5078125, "learning_rate": 0.0002844040814457035, "loss": 0.6693, "step": 1848 }, { "epoch": 0.7768456375838926, "grad_norm": 0.51171875, "learning_rate": 0.0002843330799336024, "loss": 0.6979, "step": 1852 }, { "epoch": 0.7785234899328859, "grad_norm": 0.515625, "learning_rate": 0.00028426192607283924, "loss": 0.5018, "step": 1856 }, { "epoch": 0.7802013422818792, "grad_norm": 0.68359375, "learning_rate": 0.0002841906199441104, "loss": 0.6121, "step": 1860 }, { "epoch": 0.7818791946308725, "grad_norm": 0.6484375, "learning_rate": 0.000284119161628285, "loss": 0.6562, "step": 1864 }, { "epoch": 0.7835570469798657, "grad_norm": 0.42578125, "learning_rate": 0.0002840475512064047, "loss": 0.7183, "step": 1868 }, { "epoch": 0.785234899328859, "grad_norm": 0.58984375, "learning_rate": 0.00028397578875968366, "loss": 0.8632, "step": 1872 }, { "epoch": 0.7869127516778524, "grad_norm": 0.50390625, "learning_rate": 0.0002839038743695085, "loss": 0.7304, "step": 1876 }, { "epoch": 0.7885906040268457, "grad_norm": 0.70703125, "learning_rate": 0.0002838318081174382, "loss": 0.642, "step": 1880 }, { "epoch": 0.790268456375839, "grad_norm": 0.62109375, "learning_rate": 0.0002837595900852038, "loss": 0.5512, "step": 1884 }, { "epoch": 0.7919463087248322, "grad_norm": 0.6484375, "learning_rate": 0.0002836872203547087, "loss": 0.561, "step": 1888 }, { "epoch": 0.7936241610738255, "grad_norm": 0.64453125, "learning_rate": 0.0002836146990080281, "loss": 0.5759, "step": 1892 }, { "epoch": 0.7953020134228188, "grad_norm": 0.6328125, "learning_rate": 0.00028354202612740944, "loss": 0.5701, "step": 1896 }, { "epoch": 0.7969798657718121, "grad_norm": 0.65625, "learning_rate": 0.0002834692017952717, "loss": 0.7574, "step": 1900 }, { "epoch": 0.7986577181208053, "grad_norm": 0.58984375, "learning_rate": 0.00028339622609420585, "loss": 0.643, "step": 1904 }, { "epoch": 0.8003355704697986, "grad_norm": 0.423828125, "learning_rate": 0.00028332309910697446, "loss": 0.6633, "step": 1908 }, { "epoch": 0.802013422818792, "grad_norm": 0.46875, "learning_rate": 0.0002832498209165117, "loss": 0.7446, "step": 1912 }, { "epoch": 0.8036912751677853, "grad_norm": 0.625, "learning_rate": 0.0002831763916059231, "loss": 0.4771, "step": 1916 }, { "epoch": 0.8053691275167785, "grad_norm": 0.6953125, "learning_rate": 0.0002831028112584857, "loss": 0.6202, "step": 1920 }, { "epoch": 0.8070469798657718, "grad_norm": 0.6015625, "learning_rate": 0.0002830290799576479, "loss": 0.6581, "step": 1924 }, { "epoch": 0.8087248322147651, "grad_norm": 0.45703125, "learning_rate": 0.00028295519778702904, "loss": 0.6566, "step": 1928 }, { "epoch": 0.8104026845637584, "grad_norm": 0.5859375, "learning_rate": 0.00028288116483041984, "loss": 0.6079, "step": 1932 }, { "epoch": 0.8120805369127517, "grad_norm": 0.53515625, "learning_rate": 0.0002828069811717819, "loss": 0.6864, "step": 1936 }, { "epoch": 0.8137583892617449, "grad_norm": 0.62890625, "learning_rate": 0.0002827326468952477, "loss": 0.5919, "step": 1940 }, { "epoch": 0.8154362416107382, "grad_norm": 0.498046875, "learning_rate": 0.0002826581620851207, "loss": 0.6594, "step": 1944 }, { "epoch": 0.8171140939597316, "grad_norm": 0.51953125, "learning_rate": 0.00028258352682587474, "loss": 0.7038, "step": 1948 }, { "epoch": 0.8187919463087249, "grad_norm": 0.6015625, "learning_rate": 0.00028250874120215467, "loss": 0.6764, "step": 1952 }, { "epoch": 0.8204697986577181, "grad_norm": 0.625, "learning_rate": 0.00028243380529877575, "loss": 0.655, "step": 1956 }, { "epoch": 0.8221476510067114, "grad_norm": 0.59765625, "learning_rate": 0.00028235871920072347, "loss": 0.7313, "step": 1960 }, { "epoch": 0.8238255033557047, "grad_norm": 0.4765625, "learning_rate": 0.00028228348299315397, "loss": 0.6301, "step": 1964 }, { "epoch": 0.825503355704698, "grad_norm": 0.4609375, "learning_rate": 0.00028220809676139343, "loss": 0.5764, "step": 1968 }, { "epoch": 0.8271812080536913, "grad_norm": 0.51953125, "learning_rate": 0.0002821325605909382, "loss": 0.6779, "step": 1972 }, { "epoch": 0.8288590604026845, "grad_norm": 0.5390625, "learning_rate": 0.00028205687456745474, "loss": 0.654, "step": 1976 }, { "epoch": 0.8305369127516778, "grad_norm": 0.423828125, "learning_rate": 0.00028198103877677936, "loss": 0.6138, "step": 1980 }, { "epoch": 0.8322147651006712, "grad_norm": 0.5, "learning_rate": 0.0002819050533049184, "loss": 0.6382, "step": 1984 }, { "epoch": 0.8338926174496645, "grad_norm": 0.55859375, "learning_rate": 0.00028182891823804774, "loss": 0.6936, "step": 1988 }, { "epoch": 0.8355704697986577, "grad_norm": 0.490234375, "learning_rate": 0.0002817526336625131, "loss": 0.7121, "step": 1992 }, { "epoch": 0.837248322147651, "grad_norm": 0.59765625, "learning_rate": 0.00028167619966482966, "loss": 0.5539, "step": 1996 }, { "epoch": 0.8389261744966443, "grad_norm": 0.55859375, "learning_rate": 0.000281599616331682, "loss": 0.7934, "step": 2000 }, { "epoch": 0.8406040268456376, "grad_norm": 0.5, "learning_rate": 0.0002815228837499242, "loss": 0.5681, "step": 2004 }, { "epoch": 0.8422818791946308, "grad_norm": 0.6171875, "learning_rate": 0.0002814460020065795, "loss": 0.7563, "step": 2008 }, { "epoch": 0.8439597315436241, "grad_norm": 0.5546875, "learning_rate": 0.00028136897118884044, "loss": 0.7189, "step": 2012 }, { "epoch": 0.8456375838926175, "grad_norm": 0.44921875, "learning_rate": 0.00028129179138406855, "loss": 0.7195, "step": 2016 }, { "epoch": 0.8473154362416108, "grad_norm": 0.58203125, "learning_rate": 0.00028121446267979417, "loss": 0.7868, "step": 2020 }, { "epoch": 0.8489932885906041, "grad_norm": 0.55078125, "learning_rate": 0.00028113698516371674, "loss": 0.6411, "step": 2024 }, { "epoch": 0.8506711409395973, "grad_norm": 0.62890625, "learning_rate": 0.00028105935892370446, "loss": 0.6864, "step": 2028 }, { "epoch": 0.8523489932885906, "grad_norm": 0.6171875, "learning_rate": 0.0002809815840477941, "loss": 0.5874, "step": 2032 }, { "epoch": 0.8540268456375839, "grad_norm": 0.703125, "learning_rate": 0.0002809036606241909, "loss": 0.6141, "step": 2036 }, { "epoch": 0.8557046979865772, "grad_norm": 0.5078125, "learning_rate": 0.0002808255887412688, "loss": 0.4632, "step": 2040 }, { "epoch": 0.8573825503355704, "grad_norm": 0.5390625, "learning_rate": 0.00028074736848757, "loss": 0.6944, "step": 2044 }, { "epoch": 0.8590604026845637, "grad_norm": 0.5, "learning_rate": 0.00028066899995180493, "loss": 0.8507, "step": 2048 }, { "epoch": 0.860738255033557, "grad_norm": 0.76953125, "learning_rate": 0.0002805904832228523, "loss": 0.6243, "step": 2052 }, { "epoch": 0.8624161073825504, "grad_norm": 0.78515625, "learning_rate": 0.00028051181838975874, "loss": 0.6233, "step": 2056 }, { "epoch": 0.8640939597315436, "grad_norm": 0.61328125, "learning_rate": 0.00028043300554173896, "loss": 0.7773, "step": 2060 }, { "epoch": 0.8657718120805369, "grad_norm": 0.51171875, "learning_rate": 0.0002803540447681755, "loss": 0.5844, "step": 2064 }, { "epoch": 0.8674496644295302, "grad_norm": 0.48828125, "learning_rate": 0.0002802749361586187, "loss": 0.5786, "step": 2068 }, { "epoch": 0.8691275167785235, "grad_norm": 0.63671875, "learning_rate": 0.00028019567980278645, "loss": 0.4789, "step": 2072 }, { "epoch": 0.8708053691275168, "grad_norm": 0.57421875, "learning_rate": 0.0002801162757905643, "loss": 0.8007, "step": 2076 }, { "epoch": 0.87248322147651, "grad_norm": 0.5703125, "learning_rate": 0.0002800367242120053, "loss": 0.4581, "step": 2080 }, { "epoch": 0.8741610738255033, "grad_norm": 0.53515625, "learning_rate": 0.00027995702515732973, "loss": 0.6566, "step": 2084 }, { "epoch": 0.8758389261744967, "grad_norm": 0.7265625, "learning_rate": 0.00027987717871692515, "loss": 0.7077, "step": 2088 }, { "epoch": 0.87751677852349, "grad_norm": 0.80859375, "learning_rate": 0.00027979718498134646, "loss": 0.6453, "step": 2092 }, { "epoch": 0.8791946308724832, "grad_norm": 0.51953125, "learning_rate": 0.0002797170440413153, "loss": 0.5516, "step": 2096 }, { "epoch": 0.8808724832214765, "grad_norm": 0.48046875, "learning_rate": 0.0002796367559877205, "loss": 0.8374, "step": 2100 }, { "epoch": 0.8825503355704698, "grad_norm": 0.6328125, "learning_rate": 0.0002795563209116176, "loss": 0.6714, "step": 2104 }, { "epoch": 0.8842281879194631, "grad_norm": 0.65625, "learning_rate": 0.000279475738904229, "loss": 0.7046, "step": 2108 }, { "epoch": 0.8859060402684564, "grad_norm": 0.609375, "learning_rate": 0.0002793950100569436, "loss": 0.5875, "step": 2112 }, { "epoch": 0.8875838926174496, "grad_norm": 0.53515625, "learning_rate": 0.000279314134461317, "loss": 0.6593, "step": 2116 }, { "epoch": 0.889261744966443, "grad_norm": 0.63671875, "learning_rate": 0.0002792331122090709, "loss": 0.613, "step": 2120 }, { "epoch": 0.8909395973154363, "grad_norm": 0.58203125, "learning_rate": 0.0002791519433920937, "loss": 0.5797, "step": 2124 }, { "epoch": 0.8926174496644296, "grad_norm": 0.5546875, "learning_rate": 0.00027907062810243993, "loss": 0.6641, "step": 2128 }, { "epoch": 0.8942953020134228, "grad_norm": 0.5703125, "learning_rate": 0.0002789891664323301, "loss": 0.6782, "step": 2132 }, { "epoch": 0.8959731543624161, "grad_norm": 0.61328125, "learning_rate": 0.0002789075584741508, "loss": 0.5882, "step": 2136 }, { "epoch": 0.8976510067114094, "grad_norm": 0.59375, "learning_rate": 0.00027882580432045455, "loss": 0.5488, "step": 2140 }, { "epoch": 0.8993288590604027, "grad_norm": 0.5234375, "learning_rate": 0.0002787439040639597, "loss": 0.7347, "step": 2144 }, { "epoch": 0.9010067114093959, "grad_norm": 0.50390625, "learning_rate": 0.0002786618577975502, "loss": 0.5828, "step": 2148 }, { "epoch": 0.9026845637583892, "grad_norm": 0.73828125, "learning_rate": 0.0002785796656142756, "loss": 0.5313, "step": 2152 }, { "epoch": 0.9043624161073825, "grad_norm": 0.5703125, "learning_rate": 0.00027849732760735113, "loss": 0.622, "step": 2156 }, { "epoch": 0.9060402684563759, "grad_norm": 0.49609375, "learning_rate": 0.0002784148438701571, "loss": 0.6536, "step": 2160 }, { "epoch": 0.9077181208053692, "grad_norm": 0.75, "learning_rate": 0.0002783322144962394, "loss": 0.4975, "step": 2164 }, { "epoch": 0.9093959731543624, "grad_norm": 0.79296875, "learning_rate": 0.0002782494395793089, "loss": 0.7683, "step": 2168 }, { "epoch": 0.9110738255033557, "grad_norm": 0.5546875, "learning_rate": 0.00027816651921324135, "loss": 0.6942, "step": 2172 }, { "epoch": 0.912751677852349, "grad_norm": 0.46484375, "learning_rate": 0.00027808345349207797, "loss": 0.6893, "step": 2176 }, { "epoch": 0.9144295302013423, "grad_norm": 0.98828125, "learning_rate": 0.00027800024251002436, "loss": 0.788, "step": 2180 }, { "epoch": 0.9161073825503355, "grad_norm": 0.58984375, "learning_rate": 0.0002779168863614511, "loss": 0.7453, "step": 2184 }, { "epoch": 0.9177852348993288, "grad_norm": 0.52734375, "learning_rate": 0.0002778333851408933, "loss": 0.679, "step": 2188 }, { "epoch": 0.9194630872483222, "grad_norm": 0.49609375, "learning_rate": 0.0002777497389430507, "loss": 0.7211, "step": 2192 }, { "epoch": 0.9211409395973155, "grad_norm": 0.54296875, "learning_rate": 0.00027766594786278736, "loss": 0.7331, "step": 2196 }, { "epoch": 0.9228187919463087, "grad_norm": 0.5234375, "learning_rate": 0.0002775820119951316, "loss": 0.6562, "step": 2200 }, { "epoch": 0.924496644295302, "grad_norm": 0.54296875, "learning_rate": 0.00027749793143527616, "loss": 0.6116, "step": 2204 }, { "epoch": 0.9261744966442953, "grad_norm": 0.578125, "learning_rate": 0.00027741370627857773, "loss": 0.843, "step": 2208 }, { "epoch": 0.9278523489932886, "grad_norm": 0.609375, "learning_rate": 0.00027732933662055694, "loss": 0.4889, "step": 2212 }, { "epoch": 0.9295302013422819, "grad_norm": 0.59375, "learning_rate": 0.00027724482255689847, "loss": 0.5375, "step": 2216 }, { "epoch": 0.9312080536912751, "grad_norm": 0.451171875, "learning_rate": 0.0002771601641834506, "loss": 0.4773, "step": 2220 }, { "epoch": 0.9328859060402684, "grad_norm": 0.5546875, "learning_rate": 0.00027707536159622545, "loss": 0.5576, "step": 2224 }, { "epoch": 0.9345637583892618, "grad_norm": 0.58203125, "learning_rate": 0.00027699041489139843, "loss": 0.5991, "step": 2228 }, { "epoch": 0.9362416107382551, "grad_norm": 0.578125, "learning_rate": 0.0002769053241653086, "loss": 0.6558, "step": 2232 }, { "epoch": 0.9379194630872483, "grad_norm": 0.5, "learning_rate": 0.00027682008951445845, "loss": 0.6475, "step": 2236 }, { "epoch": 0.9395973154362416, "grad_norm": 0.53515625, "learning_rate": 0.00027673471103551345, "loss": 0.5018, "step": 2240 }, { "epoch": 0.9412751677852349, "grad_norm": 0.55078125, "learning_rate": 0.00027664918882530225, "loss": 0.6126, "step": 2244 }, { "epoch": 0.9429530201342282, "grad_norm": 0.76953125, "learning_rate": 0.00027656352298081665, "loss": 0.8209, "step": 2248 }, { "epoch": 0.9446308724832215, "grad_norm": 0.61328125, "learning_rate": 0.0002764777135992112, "loss": 0.5255, "step": 2252 }, { "epoch": 0.9463087248322147, "grad_norm": 0.546875, "learning_rate": 0.0002763917607778033, "loss": 0.6033, "step": 2256 }, { "epoch": 0.947986577181208, "grad_norm": 0.51171875, "learning_rate": 0.00027630566461407305, "loss": 0.659, "step": 2260 }, { "epoch": 0.9496644295302014, "grad_norm": 0.53125, "learning_rate": 0.000276219425205663, "loss": 0.6277, "step": 2264 }, { "epoch": 0.9513422818791947, "grad_norm": 0.62890625, "learning_rate": 0.0002761330426503783, "loss": 0.5232, "step": 2268 }, { "epoch": 0.9530201342281879, "grad_norm": 0.6015625, "learning_rate": 0.00027604651704618636, "loss": 0.6685, "step": 2272 }, { "epoch": 0.9546979865771812, "grad_norm": 0.46484375, "learning_rate": 0.0002759598484912169, "loss": 0.5359, "step": 2276 }, { "epoch": 0.9563758389261745, "grad_norm": 0.83984375, "learning_rate": 0.00027587303708376156, "loss": 0.6774, "step": 2280 }, { "epoch": 0.9580536912751678, "grad_norm": 0.494140625, "learning_rate": 0.00027578608292227433, "loss": 0.6557, "step": 2284 }, { "epoch": 0.959731543624161, "grad_norm": 0.609375, "learning_rate": 0.0002756989861053708, "loss": 0.7289, "step": 2288 }, { "epoch": 0.9614093959731543, "grad_norm": 0.390625, "learning_rate": 0.00027561174673182843, "loss": 0.694, "step": 2292 }, { "epoch": 0.9630872483221476, "grad_norm": 0.53125, "learning_rate": 0.0002755243649005864, "loss": 0.554, "step": 2296 }, { "epoch": 0.964765100671141, "grad_norm": 0.6015625, "learning_rate": 0.00027543684071074543, "loss": 0.7357, "step": 2300 }, { "epoch": 0.9664429530201343, "grad_norm": 0.447265625, "learning_rate": 0.0002753491742615677, "loss": 0.5375, "step": 2304 }, { "epoch": 0.9681208053691275, "grad_norm": 0.83203125, "learning_rate": 0.0002752613656524768, "loss": 0.4482, "step": 2308 }, { "epoch": 0.9697986577181208, "grad_norm": 0.76171875, "learning_rate": 0.00027517341498305733, "loss": 0.568, "step": 2312 }, { "epoch": 0.9714765100671141, "grad_norm": 0.53125, "learning_rate": 0.00027508532235305516, "loss": 0.7111, "step": 2316 }, { "epoch": 0.9731543624161074, "grad_norm": 0.62890625, "learning_rate": 0.00027499708786237723, "loss": 0.6568, "step": 2320 }, { "epoch": 0.9748322147651006, "grad_norm": 0.55859375, "learning_rate": 0.00027490871161109116, "loss": 0.4877, "step": 2324 }, { "epoch": 0.9765100671140939, "grad_norm": 0.63671875, "learning_rate": 0.00027482019369942553, "loss": 0.6343, "step": 2328 }, { "epoch": 0.9781879194630873, "grad_norm": 0.53515625, "learning_rate": 0.0002747315342277695, "loss": 0.596, "step": 2332 }, { "epoch": 0.9798657718120806, "grad_norm": 0.5546875, "learning_rate": 0.00027464273329667275, "loss": 0.6091, "step": 2336 }, { "epoch": 0.9815436241610739, "grad_norm": 0.55859375, "learning_rate": 0.00027455379100684534, "loss": 0.6872, "step": 2340 }, { "epoch": 0.9832214765100671, "grad_norm": 0.54296875, "learning_rate": 0.0002744647074591579, "loss": 0.5373, "step": 2344 }, { "epoch": 0.9848993288590604, "grad_norm": 0.59765625, "learning_rate": 0.000274375482754641, "loss": 0.7614, "step": 2348 }, { "epoch": 0.9865771812080537, "grad_norm": 0.6015625, "learning_rate": 0.00027428611699448533, "loss": 0.5328, "step": 2352 }, { "epoch": 0.988255033557047, "grad_norm": 0.53125, "learning_rate": 0.0002741966102800417, "loss": 0.62, "step": 2356 }, { "epoch": 0.9899328859060402, "grad_norm": 0.7421875, "learning_rate": 0.0002741069627128206, "loss": 0.685, "step": 2360 }, { "epoch": 0.9916107382550335, "grad_norm": 0.578125, "learning_rate": 0.0002740171743944924, "loss": 0.8223, "step": 2364 }, { "epoch": 0.9932885906040269, "grad_norm": 0.5546875, "learning_rate": 0.000273927245426887, "loss": 0.5277, "step": 2368 }, { "epoch": 0.9949664429530202, "grad_norm": 0.470703125, "learning_rate": 0.00027383717591199393, "loss": 0.4908, "step": 2372 }, { "epoch": 0.9966442953020134, "grad_norm": 0.5859375, "learning_rate": 0.00027374696595196203, "loss": 0.7087, "step": 2376 }, { "epoch": 0.9983221476510067, "grad_norm": 0.61328125, "learning_rate": 0.0002736566156490994, "loss": 0.5272, "step": 2380 }, { "epoch": 1.0, "grad_norm": 0.69140625, "learning_rate": 0.00027356612510587335, "loss": 0.5662, "step": 2384 }, { "epoch": 1.0016778523489933, "grad_norm": 0.498046875, "learning_rate": 0.00027347549442491016, "loss": 0.5614, "step": 2388 }, { "epoch": 1.0033557046979866, "grad_norm": 0.59375, "learning_rate": 0.00027338472370899523, "loss": 0.4517, "step": 2392 }, { "epoch": 1.00503355704698, "grad_norm": 0.62890625, "learning_rate": 0.0002732938130610726, "loss": 0.528, "step": 2396 }, { "epoch": 1.0067114093959733, "grad_norm": 0.68359375, "learning_rate": 0.00027320276258424505, "loss": 0.5864, "step": 2400 }, { "epoch": 1.0083892617449663, "grad_norm": 0.73046875, "learning_rate": 0.0002731115723817739, "loss": 0.5331, "step": 2404 }, { "epoch": 1.0100671140939597, "grad_norm": 0.5703125, "learning_rate": 0.0002730202425570791, "loss": 0.4446, "step": 2408 }, { "epoch": 1.011744966442953, "grad_norm": 0.56640625, "learning_rate": 0.0002729287732137388, "loss": 0.539, "step": 2412 }, { "epoch": 1.0134228187919463, "grad_norm": 0.6015625, "learning_rate": 0.0002728371644554893, "loss": 0.3705, "step": 2416 }, { "epoch": 1.0151006711409396, "grad_norm": 0.546875, "learning_rate": 0.0002727454163862253, "loss": 0.5692, "step": 2420 }, { "epoch": 1.016778523489933, "grad_norm": 0.96484375, "learning_rate": 0.0002726535291099993, "loss": 0.5368, "step": 2424 }, { "epoch": 1.0184563758389262, "grad_norm": 0.51171875, "learning_rate": 0.0002725615027310216, "loss": 0.6211, "step": 2428 }, { "epoch": 1.0201342281879195, "grad_norm": 0.7734375, "learning_rate": 0.00027246933735366037, "loss": 0.4997, "step": 2432 }, { "epoch": 1.0218120805369129, "grad_norm": 0.53515625, "learning_rate": 0.0002723770330824414, "loss": 0.4137, "step": 2436 }, { "epoch": 1.023489932885906, "grad_norm": 0.447265625, "learning_rate": 0.00027228459002204806, "loss": 0.4543, "step": 2440 }, { "epoch": 1.0251677852348993, "grad_norm": 0.58203125, "learning_rate": 0.0002721920082773211, "loss": 0.5435, "step": 2444 }, { "epoch": 1.0268456375838926, "grad_norm": 0.482421875, "learning_rate": 0.0002720992879532584, "loss": 0.5038, "step": 2448 }, { "epoch": 1.028523489932886, "grad_norm": 0.58203125, "learning_rate": 0.0002720064291550152, "loss": 0.4911, "step": 2452 }, { "epoch": 1.0302013422818792, "grad_norm": 0.55078125, "learning_rate": 0.00027191343198790377, "loss": 0.527, "step": 2456 }, { "epoch": 1.0318791946308725, "grad_norm": 0.52734375, "learning_rate": 0.0002718202965573931, "loss": 0.5643, "step": 2460 }, { "epoch": 1.0335570469798658, "grad_norm": 0.486328125, "learning_rate": 0.0002717270229691093, "loss": 0.4541, "step": 2464 }, { "epoch": 1.0352348993288591, "grad_norm": 0.50390625, "learning_rate": 0.00027163361132883485, "loss": 0.4037, "step": 2468 }, { "epoch": 1.0369127516778525, "grad_norm": 0.57421875, "learning_rate": 0.000271540061742509, "loss": 0.499, "step": 2472 }, { "epoch": 1.0385906040268456, "grad_norm": 0.419921875, "learning_rate": 0.00027144637431622743, "loss": 0.4764, "step": 2476 }, { "epoch": 1.0402684563758389, "grad_norm": 0.498046875, "learning_rate": 0.0002713525491562421, "loss": 0.5517, "step": 2480 }, { "epoch": 1.0419463087248322, "grad_norm": 0.5078125, "learning_rate": 0.0002712585863689611, "loss": 0.6204, "step": 2484 }, { "epoch": 1.0436241610738255, "grad_norm": 0.474609375, "learning_rate": 0.0002711644860609488, "loss": 0.5874, "step": 2488 }, { "epoch": 1.0453020134228188, "grad_norm": 0.58203125, "learning_rate": 0.00027107024833892537, "loss": 0.6081, "step": 2492 }, { "epoch": 1.0469798657718121, "grad_norm": 0.486328125, "learning_rate": 0.00027097587330976686, "loss": 0.6469, "step": 2496 }, { "epoch": 1.0486577181208054, "grad_norm": 0.578125, "learning_rate": 0.000270881361080505, "loss": 0.4223, "step": 2500 }, { "epoch": 1.0503355704697988, "grad_norm": 0.5859375, "learning_rate": 0.0002707867117583273, "loss": 0.6648, "step": 2504 }, { "epoch": 1.0520134228187918, "grad_norm": 0.53125, "learning_rate": 0.0002706919254505766, "loss": 0.4592, "step": 2508 }, { "epoch": 1.0536912751677852, "grad_norm": 0.51953125, "learning_rate": 0.0002705970022647511, "loss": 0.4119, "step": 2512 }, { "epoch": 1.0553691275167785, "grad_norm": 0.60546875, "learning_rate": 0.0002705019423085042, "loss": 0.5872, "step": 2516 }, { "epoch": 1.0570469798657718, "grad_norm": 0.5546875, "learning_rate": 0.0002704067456896445, "loss": 0.4361, "step": 2520 }, { "epoch": 1.058724832214765, "grad_norm": 0.478515625, "learning_rate": 0.00027031141251613564, "loss": 0.4477, "step": 2524 }, { "epoch": 1.0604026845637584, "grad_norm": 0.484375, "learning_rate": 0.00027021594289609597, "loss": 0.6473, "step": 2528 }, { "epoch": 1.0620805369127517, "grad_norm": 0.51953125, "learning_rate": 0.0002701203369377986, "loss": 0.5701, "step": 2532 }, { "epoch": 1.063758389261745, "grad_norm": 0.578125, "learning_rate": 0.0002700245947496715, "loss": 0.5334, "step": 2536 }, { "epoch": 1.0654362416107384, "grad_norm": 0.474609375, "learning_rate": 0.00026992871644029685, "loss": 0.6522, "step": 2540 }, { "epoch": 1.0671140939597314, "grad_norm": 0.55078125, "learning_rate": 0.00026983270211841133, "loss": 0.4082, "step": 2544 }, { "epoch": 1.0687919463087248, "grad_norm": 0.63671875, "learning_rate": 0.00026973655189290585, "loss": 0.5156, "step": 2548 }, { "epoch": 1.070469798657718, "grad_norm": 0.4296875, "learning_rate": 0.0002696402658728255, "loss": 0.3344, "step": 2552 }, { "epoch": 1.0721476510067114, "grad_norm": 0.484375, "learning_rate": 0.0002695438441673694, "loss": 0.4751, "step": 2556 }, { "epoch": 1.0738255033557047, "grad_norm": 0.59765625, "learning_rate": 0.0002694472868858904, "loss": 0.504, "step": 2560 }, { "epoch": 1.075503355704698, "grad_norm": 0.59765625, "learning_rate": 0.0002693505941378952, "loss": 0.566, "step": 2564 }, { "epoch": 1.0771812080536913, "grad_norm": 0.5234375, "learning_rate": 0.00026925376603304424, "loss": 0.4705, "step": 2568 }, { "epoch": 1.0788590604026846, "grad_norm": 0.53515625, "learning_rate": 0.00026915680268115125, "loss": 0.5126, "step": 2572 }, { "epoch": 1.0805369127516777, "grad_norm": 0.60546875, "learning_rate": 0.0002690597041921835, "loss": 0.4089, "step": 2576 }, { "epoch": 1.082214765100671, "grad_norm": 0.427734375, "learning_rate": 0.0002689624706762615, "loss": 0.4949, "step": 2580 }, { "epoch": 1.0838926174496644, "grad_norm": 0.51171875, "learning_rate": 0.0002688651022436589, "loss": 0.5881, "step": 2584 }, { "epoch": 1.0855704697986577, "grad_norm": 0.58203125, "learning_rate": 0.00026876759900480225, "loss": 0.594, "step": 2588 }, { "epoch": 1.087248322147651, "grad_norm": 0.470703125, "learning_rate": 0.0002686699610702712, "loss": 0.4245, "step": 2592 }, { "epoch": 1.0889261744966443, "grad_norm": 0.59375, "learning_rate": 0.0002685721885507979, "loss": 0.5787, "step": 2596 }, { "epoch": 1.0906040268456376, "grad_norm": 0.60546875, "learning_rate": 0.0002684742815572674, "loss": 0.4464, "step": 2600 }, { "epoch": 1.092281879194631, "grad_norm": 0.4609375, "learning_rate": 0.00026837624020071703, "loss": 0.516, "step": 2604 }, { "epoch": 1.0939597315436242, "grad_norm": 0.671875, "learning_rate": 0.00026827806459233663, "loss": 0.4896, "step": 2608 }, { "epoch": 1.0956375838926173, "grad_norm": 0.5703125, "learning_rate": 0.00026817975484346823, "loss": 0.6858, "step": 2612 }, { "epoch": 1.0973154362416107, "grad_norm": 0.5625, "learning_rate": 0.0002680813110656061, "loss": 0.5939, "step": 2616 }, { "epoch": 1.098993288590604, "grad_norm": 0.4609375, "learning_rate": 0.00026798273337039636, "loss": 0.5213, "step": 2620 }, { "epoch": 1.1006711409395973, "grad_norm": 0.52734375, "learning_rate": 0.00026788402186963715, "loss": 0.3832, "step": 2624 }, { "epoch": 1.1023489932885906, "grad_norm": 0.6328125, "learning_rate": 0.00026778517667527823, "loss": 0.5883, "step": 2628 }, { "epoch": 1.104026845637584, "grad_norm": 0.443359375, "learning_rate": 0.0002676861978994212, "loss": 0.3528, "step": 2632 }, { "epoch": 1.1057046979865772, "grad_norm": 0.443359375, "learning_rate": 0.00026758708565431883, "loss": 0.6334, "step": 2636 }, { "epoch": 1.1073825503355705, "grad_norm": 0.5703125, "learning_rate": 0.0002674878400523755, "loss": 0.5359, "step": 2640 }, { "epoch": 1.1090604026845639, "grad_norm": 0.57421875, "learning_rate": 0.00026738846120614676, "loss": 0.6335, "step": 2644 }, { "epoch": 1.110738255033557, "grad_norm": 0.56640625, "learning_rate": 0.0002672889492283393, "loss": 0.4251, "step": 2648 }, { "epoch": 1.1124161073825503, "grad_norm": 0.5234375, "learning_rate": 0.0002671893042318108, "loss": 0.6456, "step": 2652 }, { "epoch": 1.1140939597315436, "grad_norm": 0.58203125, "learning_rate": 0.00026708952632956975, "loss": 0.506, "step": 2656 }, { "epoch": 1.1157718120805369, "grad_norm": 0.68359375, "learning_rate": 0.0002669896156347754, "loss": 0.3727, "step": 2660 }, { "epoch": 1.1174496644295302, "grad_norm": 0.6171875, "learning_rate": 0.0002668895722607376, "loss": 0.5001, "step": 2664 }, { "epoch": 1.1191275167785235, "grad_norm": 0.5859375, "learning_rate": 0.0002667893963209166, "loss": 0.4139, "step": 2668 }, { "epoch": 1.1208053691275168, "grad_norm": 0.60546875, "learning_rate": 0.0002666890879289231, "loss": 0.5406, "step": 2672 }, { "epoch": 1.1224832214765101, "grad_norm": 0.48828125, "learning_rate": 0.00026658864719851803, "loss": 0.4127, "step": 2676 }, { "epoch": 1.1241610738255035, "grad_norm": 0.5625, "learning_rate": 0.0002664880742436124, "loss": 0.5057, "step": 2680 }, { "epoch": 1.1258389261744965, "grad_norm": 0.6875, "learning_rate": 0.000266387369178267, "loss": 0.5376, "step": 2684 }, { "epoch": 1.1275167785234899, "grad_norm": 0.5234375, "learning_rate": 0.00026628653211669263, "loss": 0.4288, "step": 2688 }, { "epoch": 1.1291946308724832, "grad_norm": 0.5625, "learning_rate": 0.0002661855631732498, "loss": 0.5704, "step": 2692 }, { "epoch": 1.1308724832214765, "grad_norm": 0.5078125, "learning_rate": 0.0002660844624624484, "loss": 0.3708, "step": 2696 }, { "epoch": 1.1325503355704698, "grad_norm": 0.52734375, "learning_rate": 0.00026598323009894805, "loss": 0.4147, "step": 2700 }, { "epoch": 1.1342281879194631, "grad_norm": 0.498046875, "learning_rate": 0.00026588186619755735, "loss": 0.6966, "step": 2704 }, { "epoch": 1.1359060402684564, "grad_norm": 0.640625, "learning_rate": 0.0002657803708732344, "loss": 0.582, "step": 2708 }, { "epoch": 1.1375838926174497, "grad_norm": 0.482421875, "learning_rate": 0.0002656787442410861, "loss": 0.5736, "step": 2712 }, { "epoch": 1.139261744966443, "grad_norm": 0.451171875, "learning_rate": 0.00026557698641636835, "loss": 0.3956, "step": 2716 }, { "epoch": 1.1409395973154361, "grad_norm": 0.56640625, "learning_rate": 0.0002654750975144859, "loss": 0.4721, "step": 2720 }, { "epoch": 1.1426174496644295, "grad_norm": 0.546875, "learning_rate": 0.0002653730776509921, "loss": 0.4532, "step": 2724 }, { "epoch": 1.1442953020134228, "grad_norm": 0.56640625, "learning_rate": 0.00026527092694158877, "loss": 0.5435, "step": 2728 }, { "epoch": 1.145973154362416, "grad_norm": 0.7109375, "learning_rate": 0.0002651686455021263, "loss": 0.5493, "step": 2732 }, { "epoch": 1.1476510067114094, "grad_norm": 0.54296875, "learning_rate": 0.00026506623344860306, "loss": 0.6484, "step": 2736 }, { "epoch": 1.1493288590604027, "grad_norm": 0.5625, "learning_rate": 0.0002649636908971658, "loss": 0.3896, "step": 2740 }, { "epoch": 1.151006711409396, "grad_norm": 0.462890625, "learning_rate": 0.0002648610179641093, "loss": 0.3727, "step": 2744 }, { "epoch": 1.1526845637583893, "grad_norm": 0.5703125, "learning_rate": 0.00026475821476587585, "loss": 0.406, "step": 2748 }, { "epoch": 1.1543624161073827, "grad_norm": 0.5, "learning_rate": 0.00026465528141905595, "loss": 0.4166, "step": 2752 }, { "epoch": 1.1560402684563758, "grad_norm": 1.21875, "learning_rate": 0.0002645522180403873, "loss": 0.6006, "step": 2756 }, { "epoch": 1.157718120805369, "grad_norm": 0.55859375, "learning_rate": 0.00026444902474675536, "loss": 0.386, "step": 2760 }, { "epoch": 1.1593959731543624, "grad_norm": 0.5703125, "learning_rate": 0.0002643457016551927, "loss": 0.5712, "step": 2764 }, { "epoch": 1.1610738255033557, "grad_norm": 0.5859375, "learning_rate": 0.0002642422488828793, "loss": 0.5379, "step": 2768 }, { "epoch": 1.162751677852349, "grad_norm": 0.51953125, "learning_rate": 0.00026413866654714205, "loss": 0.3931, "step": 2772 }, { "epoch": 1.1644295302013423, "grad_norm": 0.578125, "learning_rate": 0.0002640349547654549, "loss": 0.5241, "step": 2776 }, { "epoch": 1.1661073825503356, "grad_norm": 0.4765625, "learning_rate": 0.00026393111365543854, "loss": 0.6139, "step": 2780 }, { "epoch": 1.167785234899329, "grad_norm": 0.46484375, "learning_rate": 0.0002638271433348603, "loss": 0.6125, "step": 2784 }, { "epoch": 1.1694630872483223, "grad_norm": 0.4140625, "learning_rate": 0.0002637230439216341, "loss": 0.4967, "step": 2788 }, { "epoch": 1.1711409395973154, "grad_norm": 0.54296875, "learning_rate": 0.00026361881553382035, "loss": 0.5429, "step": 2792 }, { "epoch": 1.1728187919463087, "grad_norm": 0.478515625, "learning_rate": 0.00026351445828962555, "loss": 0.4951, "step": 2796 }, { "epoch": 1.174496644295302, "grad_norm": 0.5703125, "learning_rate": 0.00026340997230740244, "loss": 0.5566, "step": 2800 }, { "epoch": 1.1761744966442953, "grad_norm": 0.578125, "learning_rate": 0.0002633053577056498, "loss": 0.4459, "step": 2804 }, { "epoch": 1.1778523489932886, "grad_norm": 0.6015625, "learning_rate": 0.0002632006146030122, "loss": 0.5993, "step": 2808 }, { "epoch": 1.179530201342282, "grad_norm": 0.482421875, "learning_rate": 0.00026309574311827994, "loss": 0.6096, "step": 2812 }, { "epoch": 1.1812080536912752, "grad_norm": 0.404296875, "learning_rate": 0.000262990743370389, "loss": 0.4859, "step": 2816 }, { "epoch": 1.1828859060402686, "grad_norm": 0.5703125, "learning_rate": 0.00026288561547842075, "loss": 0.5298, "step": 2820 }, { "epoch": 1.1845637583892619, "grad_norm": 0.68359375, "learning_rate": 0.00026278035956160196, "loss": 0.4554, "step": 2824 }, { "epoch": 1.186241610738255, "grad_norm": 0.55078125, "learning_rate": 0.0002626749757393045, "loss": 0.4685, "step": 2828 }, { "epoch": 1.1879194630872483, "grad_norm": 0.5859375, "learning_rate": 0.0002625694641310454, "loss": 0.5112, "step": 2832 }, { "epoch": 1.1895973154362416, "grad_norm": 0.482421875, "learning_rate": 0.0002624638248564865, "loss": 0.4167, "step": 2836 }, { "epoch": 1.191275167785235, "grad_norm": 0.58203125, "learning_rate": 0.00026235805803543466, "loss": 0.5819, "step": 2840 }, { "epoch": 1.1929530201342282, "grad_norm": 0.546875, "learning_rate": 0.00026225216378784097, "loss": 0.6191, "step": 2844 }, { "epoch": 1.1946308724832215, "grad_norm": 0.52734375, "learning_rate": 0.00026214614223380144, "loss": 0.68, "step": 2848 }, { "epoch": 1.1963087248322148, "grad_norm": 0.640625, "learning_rate": 0.0002620399934935564, "loss": 0.6027, "step": 2852 }, { "epoch": 1.197986577181208, "grad_norm": 0.55859375, "learning_rate": 0.00026193371768749017, "loss": 0.3831, "step": 2856 }, { "epoch": 1.1996644295302012, "grad_norm": 0.58203125, "learning_rate": 0.0002618273149361314, "loss": 0.656, "step": 2860 }, { "epoch": 1.2013422818791946, "grad_norm": 0.46484375, "learning_rate": 0.0002617207853601526, "loss": 0.4668, "step": 2864 }, { "epoch": 1.2030201342281879, "grad_norm": 0.498046875, "learning_rate": 0.00026161412908037027, "loss": 0.505, "step": 2868 }, { "epoch": 1.2046979865771812, "grad_norm": 0.5, "learning_rate": 0.0002615073462177445, "loss": 0.4745, "step": 2872 }, { "epoch": 1.2063758389261745, "grad_norm": 0.48046875, "learning_rate": 0.0002614004368933788, "loss": 0.347, "step": 2876 }, { "epoch": 1.2080536912751678, "grad_norm": 0.52734375, "learning_rate": 0.00026129340122852036, "loss": 0.5724, "step": 2880 }, { "epoch": 1.2097315436241611, "grad_norm": 0.58203125, "learning_rate": 0.00026118623934455953, "loss": 0.5486, "step": 2884 }, { "epoch": 1.2114093959731544, "grad_norm": 0.482421875, "learning_rate": 0.00026107895136302973, "loss": 0.4814, "step": 2888 }, { "epoch": 1.2130872483221475, "grad_norm": 0.5546875, "learning_rate": 0.0002609715374056076, "loss": 0.4704, "step": 2892 }, { "epoch": 1.2147651006711409, "grad_norm": 0.486328125, "learning_rate": 0.0002608639975941124, "loss": 0.4168, "step": 2896 }, { "epoch": 1.2164429530201342, "grad_norm": 0.6171875, "learning_rate": 0.0002607563320505063, "loss": 0.5237, "step": 2900 }, { "epoch": 1.2181208053691275, "grad_norm": 0.55859375, "learning_rate": 0.00026064854089689405, "loss": 0.4708, "step": 2904 }, { "epoch": 1.2197986577181208, "grad_norm": 0.4765625, "learning_rate": 0.0002605406242555227, "loss": 0.6208, "step": 2908 }, { "epoch": 1.221476510067114, "grad_norm": 0.451171875, "learning_rate": 0.0002604325822487818, "loss": 0.5432, "step": 2912 }, { "epoch": 1.2231543624161074, "grad_norm": 0.419921875, "learning_rate": 0.00026032441499920306, "loss": 0.4868, "step": 2916 }, { "epoch": 1.2248322147651007, "grad_norm": 0.515625, "learning_rate": 0.00026021612262946004, "loss": 0.4702, "step": 2920 }, { "epoch": 1.226510067114094, "grad_norm": 0.474609375, "learning_rate": 0.0002601077052623685, "loss": 0.6609, "step": 2924 }, { "epoch": 1.2281879194630871, "grad_norm": 0.404296875, "learning_rate": 0.0002599991630208857, "loss": 0.4927, "step": 2928 }, { "epoch": 1.2298657718120805, "grad_norm": 0.470703125, "learning_rate": 0.00025989049602811066, "loss": 0.3429, "step": 2932 }, { "epoch": 1.2315436241610738, "grad_norm": 0.53125, "learning_rate": 0.0002597817044072838, "loss": 0.5033, "step": 2936 }, { "epoch": 1.233221476510067, "grad_norm": 0.63671875, "learning_rate": 0.0002596727882817869, "loss": 0.5491, "step": 2940 }, { "epoch": 1.2348993288590604, "grad_norm": 0.6171875, "learning_rate": 0.0002595637477751431, "loss": 0.5145, "step": 2944 }, { "epoch": 1.2365771812080537, "grad_norm": 0.55078125, "learning_rate": 0.00025945458301101636, "loss": 0.6148, "step": 2948 }, { "epoch": 1.238255033557047, "grad_norm": 0.55859375, "learning_rate": 0.0002593452941132117, "loss": 0.6807, "step": 2952 }, { "epoch": 1.2399328859060403, "grad_norm": 0.5703125, "learning_rate": 0.0002592358812056749, "loss": 0.6035, "step": 2956 }, { "epoch": 1.2416107382550337, "grad_norm": 0.5625, "learning_rate": 0.0002591263444124923, "loss": 0.4779, "step": 2960 }, { "epoch": 1.2432885906040267, "grad_norm": 0.65234375, "learning_rate": 0.000259016683857891, "loss": 0.4523, "step": 2964 }, { "epoch": 1.24496644295302, "grad_norm": 0.69921875, "learning_rate": 0.00025890689966623804, "loss": 0.4962, "step": 2968 }, { "epoch": 1.2466442953020134, "grad_norm": 0.61328125, "learning_rate": 0.0002587969919620411, "loss": 0.3631, "step": 2972 }, { "epoch": 1.2483221476510067, "grad_norm": 0.58203125, "learning_rate": 0.0002586869608699476, "loss": 0.6061, "step": 2976 }, { "epoch": 1.25, "grad_norm": 0.5625, "learning_rate": 0.0002585768065147452, "loss": 0.5574, "step": 2980 }, { "epoch": 1.2516778523489933, "grad_norm": 0.462890625, "learning_rate": 0.00025846652902136106, "loss": 0.5519, "step": 2984 }, { "epoch": 1.2533557046979866, "grad_norm": 0.5703125, "learning_rate": 0.00025835612851486217, "loss": 0.641, "step": 2988 }, { "epoch": 1.25503355704698, "grad_norm": 0.48828125, "learning_rate": 0.000258245605120455, "loss": 0.3748, "step": 2992 }, { "epoch": 1.2567114093959733, "grad_norm": 0.578125, "learning_rate": 0.00025813495896348537, "loss": 0.5434, "step": 2996 }, { "epoch": 1.2583892617449663, "grad_norm": 0.498046875, "learning_rate": 0.00025802419016943834, "loss": 0.4467, "step": 3000 }, { "epoch": 1.2600671140939597, "grad_norm": 0.55078125, "learning_rate": 0.0002579132988639381, "loss": 0.5544, "step": 3004 }, { "epoch": 1.261744966442953, "grad_norm": 0.4765625, "learning_rate": 0.0002578022851727477, "loss": 0.4767, "step": 3008 }, { "epoch": 1.2634228187919463, "grad_norm": 0.546875, "learning_rate": 0.00025769114922176894, "loss": 0.6306, "step": 3012 }, { "epoch": 1.2651006711409396, "grad_norm": 0.50390625, "learning_rate": 0.0002575798911370425, "loss": 0.4262, "step": 3016 }, { "epoch": 1.266778523489933, "grad_norm": 0.58984375, "learning_rate": 0.0002574685110447472, "loss": 0.4976, "step": 3020 }, { "epoch": 1.2684563758389262, "grad_norm": 0.62890625, "learning_rate": 0.00025735700907120076, "loss": 0.39, "step": 3024 }, { "epoch": 1.2701342281879195, "grad_norm": 0.5546875, "learning_rate": 0.0002572453853428586, "loss": 0.3318, "step": 3028 }, { "epoch": 1.2718120805369129, "grad_norm": 0.56640625, "learning_rate": 0.0002571336399863146, "loss": 0.445, "step": 3032 }, { "epoch": 1.273489932885906, "grad_norm": 0.52734375, "learning_rate": 0.0002570217731283003, "loss": 0.5707, "step": 3036 }, { "epoch": 1.2751677852348993, "grad_norm": 0.61328125, "learning_rate": 0.0002569097848956852, "loss": 0.4862, "step": 3040 }, { "epoch": 1.2768456375838926, "grad_norm": 0.6328125, "learning_rate": 0.00025679767541547655, "loss": 0.5913, "step": 3044 }, { "epoch": 1.278523489932886, "grad_norm": 0.52734375, "learning_rate": 0.00025668544481481875, "loss": 0.4933, "step": 3048 }, { "epoch": 1.2802013422818792, "grad_norm": 0.470703125, "learning_rate": 0.000256573093220994, "loss": 0.5017, "step": 3052 }, { "epoch": 1.2818791946308725, "grad_norm": 0.453125, "learning_rate": 0.0002564606207614214, "loss": 0.5025, "step": 3056 }, { "epoch": 1.2835570469798658, "grad_norm": 0.484375, "learning_rate": 0.0002563480275636572, "loss": 0.3677, "step": 3060 }, { "epoch": 1.285234899328859, "grad_norm": 0.56640625, "learning_rate": 0.0002562353137553947, "loss": 0.4973, "step": 3064 }, { "epoch": 1.2869127516778525, "grad_norm": 0.5625, "learning_rate": 0.0002561224794644639, "loss": 0.5753, "step": 3068 }, { "epoch": 1.2885906040268456, "grad_norm": 0.5234375, "learning_rate": 0.00025600952481883143, "loss": 0.5331, "step": 3072 }, { "epoch": 1.2902684563758389, "grad_norm": 0.52734375, "learning_rate": 0.00025589644994660035, "loss": 0.5814, "step": 3076 }, { "epoch": 1.2919463087248322, "grad_norm": 0.625, "learning_rate": 0.0002557832549760103, "loss": 0.5233, "step": 3080 }, { "epoch": 1.2936241610738255, "grad_norm": 0.53515625, "learning_rate": 0.00025566994003543684, "loss": 0.4596, "step": 3084 }, { "epoch": 1.2953020134228188, "grad_norm": 0.58984375, "learning_rate": 0.00025555650525339174, "loss": 0.5059, "step": 3088 }, { "epoch": 1.2969798657718121, "grad_norm": 0.5546875, "learning_rate": 0.00025544295075852273, "loss": 0.4915, "step": 3092 }, { "epoch": 1.2986577181208054, "grad_norm": 0.451171875, "learning_rate": 0.0002553292766796132, "loss": 0.4566, "step": 3096 }, { "epoch": 1.3003355704697985, "grad_norm": 0.54296875, "learning_rate": 0.00025521548314558223, "loss": 0.5283, "step": 3100 }, { "epoch": 1.302013422818792, "grad_norm": 0.52734375, "learning_rate": 0.00025510157028548435, "loss": 0.4373, "step": 3104 }, { "epoch": 1.3036912751677852, "grad_norm": 0.55078125, "learning_rate": 0.0002549875382285094, "loss": 0.5551, "step": 3108 }, { "epoch": 1.3053691275167785, "grad_norm": 0.5078125, "learning_rate": 0.00025487338710398247, "loss": 0.4634, "step": 3112 }, { "epoch": 1.3070469798657718, "grad_norm": 0.5703125, "learning_rate": 0.00025475911704136354, "loss": 0.5841, "step": 3116 }, { "epoch": 1.308724832214765, "grad_norm": 0.4921875, "learning_rate": 0.00025464472817024773, "loss": 0.3746, "step": 3120 }, { "epoch": 1.3104026845637584, "grad_norm": 0.65234375, "learning_rate": 0.0002545302206203646, "loss": 0.7312, "step": 3124 }, { "epoch": 1.3120805369127517, "grad_norm": 0.5859375, "learning_rate": 0.0002544155945215785, "loss": 0.5979, "step": 3128 }, { "epoch": 1.313758389261745, "grad_norm": 0.640625, "learning_rate": 0.00025430085000388813, "loss": 0.5668, "step": 3132 }, { "epoch": 1.3154362416107381, "grad_norm": 0.494140625, "learning_rate": 0.00025418598719742676, "loss": 0.4657, "step": 3136 }, { "epoch": 1.3171140939597317, "grad_norm": 0.5625, "learning_rate": 0.00025407100623246135, "loss": 0.4713, "step": 3140 }, { "epoch": 1.3187919463087248, "grad_norm": 0.51171875, "learning_rate": 0.0002539559072393932, "loss": 0.4822, "step": 3144 }, { "epoch": 1.320469798657718, "grad_norm": 0.486328125, "learning_rate": 0.0002538406903487574, "loss": 0.501, "step": 3148 }, { "epoch": 1.3221476510067114, "grad_norm": 0.5546875, "learning_rate": 0.00025372535569122273, "loss": 0.4696, "step": 3152 }, { "epoch": 1.3238255033557047, "grad_norm": 0.5859375, "learning_rate": 0.0002536099033975915, "loss": 0.5214, "step": 3156 }, { "epoch": 1.325503355704698, "grad_norm": 0.546875, "learning_rate": 0.0002534943335987994, "loss": 0.3838, "step": 3160 }, { "epoch": 1.3271812080536913, "grad_norm": 0.88671875, "learning_rate": 0.0002533786464259156, "loss": 0.5118, "step": 3164 }, { "epoch": 1.3288590604026846, "grad_norm": 0.470703125, "learning_rate": 0.00025326284201014203, "loss": 0.3885, "step": 3168 }, { "epoch": 1.3305369127516777, "grad_norm": 0.470703125, "learning_rate": 0.00025314692048281393, "loss": 0.4826, "step": 3172 }, { "epoch": 1.3322147651006713, "grad_norm": 0.76171875, "learning_rate": 0.0002530308819753992, "loss": 0.4998, "step": 3176 }, { "epoch": 1.3338926174496644, "grad_norm": 0.75, "learning_rate": 0.0002529147266194984, "loss": 0.5291, "step": 3180 }, { "epoch": 1.3355704697986577, "grad_norm": 0.49609375, "learning_rate": 0.0002527984545468446, "loss": 0.5075, "step": 3184 }, { "epoch": 1.337248322147651, "grad_norm": 0.53125, "learning_rate": 0.0002526820658893033, "loss": 0.4657, "step": 3188 }, { "epoch": 1.3389261744966443, "grad_norm": 0.66796875, "learning_rate": 0.0002525655607788722, "loss": 0.4945, "step": 3192 }, { "epoch": 1.3406040268456376, "grad_norm": 0.58203125, "learning_rate": 0.00025244893934768097, "loss": 0.4597, "step": 3196 }, { "epoch": 1.342281879194631, "grad_norm": 0.56640625, "learning_rate": 0.00025233220172799143, "loss": 0.6244, "step": 3200 }, { "epoch": 1.3439597315436242, "grad_norm": 0.5703125, "learning_rate": 0.00025221534805219695, "loss": 0.4871, "step": 3204 }, { "epoch": 1.3456375838926173, "grad_norm": 0.474609375, "learning_rate": 0.0002520983784528226, "loss": 0.4474, "step": 3208 }, { "epoch": 1.3473154362416109, "grad_norm": 0.70703125, "learning_rate": 0.000251981293062525, "loss": 0.4203, "step": 3212 }, { "epoch": 1.348993288590604, "grad_norm": 0.58203125, "learning_rate": 0.0002518640920140919, "loss": 0.5212, "step": 3216 }, { "epoch": 1.3506711409395973, "grad_norm": 0.765625, "learning_rate": 0.0002517467754404424, "loss": 0.5098, "step": 3220 }, { "epoch": 1.3523489932885906, "grad_norm": 0.63671875, "learning_rate": 0.0002516293434746265, "loss": 0.5478, "step": 3224 }, { "epoch": 1.354026845637584, "grad_norm": 0.67578125, "learning_rate": 0.0002515117962498252, "loss": 0.4759, "step": 3228 }, { "epoch": 1.3557046979865772, "grad_norm": 0.61328125, "learning_rate": 0.0002513941338993501, "loss": 0.5777, "step": 3232 }, { "epoch": 1.3573825503355705, "grad_norm": 0.53125, "learning_rate": 0.0002512763565566434, "loss": 0.5178, "step": 3236 }, { "epoch": 1.3590604026845639, "grad_norm": 0.5625, "learning_rate": 0.00025115846435527767, "loss": 0.503, "step": 3240 }, { "epoch": 1.360738255033557, "grad_norm": 0.7265625, "learning_rate": 0.00025104045742895583, "loss": 0.5086, "step": 3244 }, { "epoch": 1.3624161073825503, "grad_norm": 0.515625, "learning_rate": 0.0002509223359115108, "loss": 0.6678, "step": 3248 }, { "epoch": 1.3640939597315436, "grad_norm": 0.55078125, "learning_rate": 0.0002508040999369056, "loss": 0.6408, "step": 3252 }, { "epoch": 1.3657718120805369, "grad_norm": 0.60546875, "learning_rate": 0.00025068574963923307, "loss": 0.5954, "step": 3256 }, { "epoch": 1.3674496644295302, "grad_norm": 0.5078125, "learning_rate": 0.0002505672851527154, "loss": 0.5222, "step": 3260 }, { "epoch": 1.3691275167785235, "grad_norm": 0.59375, "learning_rate": 0.00025044870661170465, "loss": 0.4723, "step": 3264 }, { "epoch": 1.3708053691275168, "grad_norm": 0.55078125, "learning_rate": 0.00025033001415068203, "loss": 0.4832, "step": 3268 }, { "epoch": 1.3724832214765101, "grad_norm": 0.6640625, "learning_rate": 0.00025021120790425807, "loss": 0.602, "step": 3272 }, { "epoch": 1.3741610738255035, "grad_norm": 0.5625, "learning_rate": 0.00025009228800717214, "loss": 0.4181, "step": 3276 }, { "epoch": 1.3758389261744965, "grad_norm": 0.5234375, "learning_rate": 0.00024997325459429276, "loss": 0.5125, "step": 3280 }, { "epoch": 1.3775167785234899, "grad_norm": 0.44140625, "learning_rate": 0.00024985410780061705, "loss": 0.4101, "step": 3284 }, { "epoch": 1.3791946308724832, "grad_norm": 0.470703125, "learning_rate": 0.0002497348477612707, "loss": 0.4383, "step": 3288 }, { "epoch": 1.3808724832214765, "grad_norm": 0.48046875, "learning_rate": 0.0002496154746115078, "loss": 0.5289, "step": 3292 }, { "epoch": 1.3825503355704698, "grad_norm": 0.4765625, "learning_rate": 0.0002494959884867109, "loss": 0.4725, "step": 3296 }, { "epoch": 1.3842281879194631, "grad_norm": 0.671875, "learning_rate": 0.0002493763895223905, "loss": 0.6606, "step": 3300 }, { "epoch": 1.3859060402684564, "grad_norm": 0.52734375, "learning_rate": 0.00024925667785418527, "loss": 0.577, "step": 3304 }, { "epoch": 1.3875838926174497, "grad_norm": 0.53125, "learning_rate": 0.0002491368536178614, "loss": 0.5565, "step": 3308 }, { "epoch": 1.389261744966443, "grad_norm": 0.6328125, "learning_rate": 0.00024901691694931294, "loss": 0.4737, "step": 3312 }, { "epoch": 1.3909395973154361, "grad_norm": 0.59765625, "learning_rate": 0.0002488968679845614, "loss": 0.4634, "step": 3316 }, { "epoch": 1.3926174496644295, "grad_norm": 0.478515625, "learning_rate": 0.0002487767068597558, "loss": 0.4693, "step": 3320 }, { "epoch": 1.3942953020134228, "grad_norm": 0.5078125, "learning_rate": 0.00024865643371117205, "loss": 0.4568, "step": 3324 }, { "epoch": 1.395973154362416, "grad_norm": 0.546875, "learning_rate": 0.00024853604867521343, "loss": 0.6128, "step": 3328 }, { "epoch": 1.3976510067114094, "grad_norm": 0.57421875, "learning_rate": 0.00024841555188840985, "loss": 0.5186, "step": 3332 }, { "epoch": 1.3993288590604027, "grad_norm": 0.55859375, "learning_rate": 0.0002482949434874182, "loss": 0.4148, "step": 3336 }, { "epoch": 1.401006711409396, "grad_norm": 0.5703125, "learning_rate": 0.00024817422360902175, "loss": 0.4149, "step": 3340 }, { "epoch": 1.4026845637583891, "grad_norm": 0.57421875, "learning_rate": 0.00024805339239013024, "loss": 0.5939, "step": 3344 }, { "epoch": 1.4043624161073827, "grad_norm": 0.51953125, "learning_rate": 0.00024793244996777975, "loss": 0.4573, "step": 3348 }, { "epoch": 1.4060402684563758, "grad_norm": 0.6640625, "learning_rate": 0.0002478113964791326, "loss": 0.5961, "step": 3352 }, { "epoch": 1.407718120805369, "grad_norm": 0.71484375, "learning_rate": 0.00024769023206147665, "loss": 0.5965, "step": 3356 }, { "epoch": 1.4093959731543624, "grad_norm": 0.53515625, "learning_rate": 0.000247568956852226, "loss": 0.652, "step": 3360 }, { "epoch": 1.4110738255033557, "grad_norm": 0.578125, "learning_rate": 0.00024744757098892024, "loss": 0.4899, "step": 3364 }, { "epoch": 1.412751677852349, "grad_norm": 0.5703125, "learning_rate": 0.00024732607460922437, "loss": 0.4882, "step": 3368 }, { "epoch": 1.4144295302013423, "grad_norm": 0.53515625, "learning_rate": 0.00024720446785092884, "loss": 0.4658, "step": 3372 }, { "epoch": 1.4161073825503356, "grad_norm": 0.6328125, "learning_rate": 0.0002470827508519492, "loss": 0.5341, "step": 3376 }, { "epoch": 1.4177852348993287, "grad_norm": 0.51171875, "learning_rate": 0.00024696092375032624, "loss": 0.5026, "step": 3380 }, { "epoch": 1.4194630872483223, "grad_norm": 0.49609375, "learning_rate": 0.0002468389866842253, "loss": 0.428, "step": 3384 }, { "epoch": 1.4211409395973154, "grad_norm": 0.58203125, "learning_rate": 0.00024671693979193656, "loss": 0.6631, "step": 3388 }, { "epoch": 1.4228187919463087, "grad_norm": 0.53515625, "learning_rate": 0.00024659478321187485, "loss": 0.5971, "step": 3392 }, { "epoch": 1.424496644295302, "grad_norm": 0.578125, "learning_rate": 0.0002464725170825793, "loss": 0.4316, "step": 3396 }, { "epoch": 1.4261744966442953, "grad_norm": 0.51171875, "learning_rate": 0.00024635014154271337, "loss": 0.528, "step": 3400 }, { "epoch": 1.4278523489932886, "grad_norm": 0.7421875, "learning_rate": 0.00024622765673106445, "loss": 0.5132, "step": 3404 }, { "epoch": 1.429530201342282, "grad_norm": 0.5859375, "learning_rate": 0.00024610506278654397, "loss": 0.4848, "step": 3408 }, { "epoch": 1.4312080536912752, "grad_norm": 0.75390625, "learning_rate": 0.00024598235984818717, "loss": 0.5323, "step": 3412 }, { "epoch": 1.4328859060402683, "grad_norm": 0.6171875, "learning_rate": 0.00024585954805515277, "loss": 0.6784, "step": 3416 }, { "epoch": 1.4345637583892619, "grad_norm": 0.62109375, "learning_rate": 0.000245736627546723, "loss": 0.5989, "step": 3420 }, { "epoch": 1.436241610738255, "grad_norm": 0.48828125, "learning_rate": 0.0002456135984623034, "loss": 0.5897, "step": 3424 }, { "epoch": 1.4379194630872483, "grad_norm": 0.57421875, "learning_rate": 0.00024549046094142274, "loss": 0.3977, "step": 3428 }, { "epoch": 1.4395973154362416, "grad_norm": 0.54296875, "learning_rate": 0.0002453672151237326, "loss": 0.4185, "step": 3432 }, { "epoch": 1.441275167785235, "grad_norm": 0.6484375, "learning_rate": 0.0002452438611490074, "loss": 0.5641, "step": 3436 }, { "epoch": 1.4429530201342282, "grad_norm": 0.55859375, "learning_rate": 0.0002451203991571444, "loss": 0.5503, "step": 3440 }, { "epoch": 1.4446308724832215, "grad_norm": 0.4765625, "learning_rate": 0.0002449968292881631, "loss": 0.513, "step": 3444 }, { "epoch": 1.4463087248322148, "grad_norm": 0.51171875, "learning_rate": 0.00024487315168220553, "loss": 0.5724, "step": 3448 }, { "epoch": 1.447986577181208, "grad_norm": 0.466796875, "learning_rate": 0.0002447493664795359, "loss": 0.4854, "step": 3452 }, { "epoch": 1.4496644295302015, "grad_norm": 0.51171875, "learning_rate": 0.0002446254738205404, "loss": 0.5001, "step": 3456 }, { "epoch": 1.4513422818791946, "grad_norm": 0.5234375, "learning_rate": 0.0002445014738457271, "loss": 0.3674, "step": 3460 }, { "epoch": 1.4530201342281879, "grad_norm": 0.54296875, "learning_rate": 0.00024437736669572575, "loss": 0.4891, "step": 3464 }, { "epoch": 1.4546979865771812, "grad_norm": 0.466796875, "learning_rate": 0.0002442531525112876, "loss": 0.432, "step": 3468 }, { "epoch": 1.4563758389261745, "grad_norm": 0.59765625, "learning_rate": 0.00024412883143328548, "loss": 0.4987, "step": 3472 }, { "epoch": 1.4580536912751678, "grad_norm": 0.5625, "learning_rate": 0.00024400440360271324, "loss": 0.3825, "step": 3476 }, { "epoch": 1.4597315436241611, "grad_norm": 0.50390625, "learning_rate": 0.00024387986916068596, "loss": 0.6343, "step": 3480 }, { "epoch": 1.4614093959731544, "grad_norm": 0.70703125, "learning_rate": 0.0002437552282484395, "loss": 0.5255, "step": 3484 }, { "epoch": 1.4630872483221475, "grad_norm": 0.51953125, "learning_rate": 0.00024363048100733048, "loss": 0.5286, "step": 3488 }, { "epoch": 1.464765100671141, "grad_norm": 0.515625, "learning_rate": 0.0002435056275788363, "loss": 0.4504, "step": 3492 }, { "epoch": 1.4664429530201342, "grad_norm": 0.71875, "learning_rate": 0.00024338066810455453, "loss": 0.6087, "step": 3496 }, { "epoch": 1.4681208053691275, "grad_norm": 0.57421875, "learning_rate": 0.00024325560272620315, "loss": 0.4728, "step": 3500 }, { "epoch": 1.4697986577181208, "grad_norm": 0.6796875, "learning_rate": 0.00024313043158562023, "loss": 0.6116, "step": 3504 }, { "epoch": 1.471476510067114, "grad_norm": 0.5859375, "learning_rate": 0.0002430051548247637, "loss": 0.5536, "step": 3508 }, { "epoch": 1.4731543624161074, "grad_norm": 0.56640625, "learning_rate": 0.0002428797725857115, "loss": 0.5504, "step": 3512 }, { "epoch": 1.4748322147651007, "grad_norm": 0.51171875, "learning_rate": 0.00024275428501066085, "loss": 0.4927, "step": 3516 }, { "epoch": 1.476510067114094, "grad_norm": 1.0703125, "learning_rate": 0.00024262869224192876, "loss": 0.4392, "step": 3520 }, { "epoch": 1.4781879194630871, "grad_norm": 0.55859375, "learning_rate": 0.00024250299442195138, "loss": 0.513, "step": 3524 }, { "epoch": 1.4798657718120805, "grad_norm": 0.546875, "learning_rate": 0.00024237719169328402, "loss": 0.5226, "step": 3528 }, { "epoch": 1.4815436241610738, "grad_norm": 0.546875, "learning_rate": 0.00024225128419860098, "loss": 0.5829, "step": 3532 }, { "epoch": 1.483221476510067, "grad_norm": 0.470703125, "learning_rate": 0.00024212527208069537, "loss": 0.5313, "step": 3536 }, { "epoch": 1.4848993288590604, "grad_norm": 0.5625, "learning_rate": 0.00024199915548247893, "loss": 0.5382, "step": 3540 }, { "epoch": 1.4865771812080537, "grad_norm": 0.51171875, "learning_rate": 0.00024187293454698195, "loss": 0.4024, "step": 3544 }, { "epoch": 1.488255033557047, "grad_norm": 0.478515625, "learning_rate": 0.00024174660941735297, "loss": 0.3868, "step": 3548 }, { "epoch": 1.4899328859060403, "grad_norm": 0.515625, "learning_rate": 0.00024162018023685876, "loss": 0.5765, "step": 3552 }, { "epoch": 1.4916107382550337, "grad_norm": 0.6328125, "learning_rate": 0.00024149364714888414, "loss": 0.5279, "step": 3556 }, { "epoch": 1.4932885906040267, "grad_norm": 0.5234375, "learning_rate": 0.00024136701029693166, "loss": 0.5752, "step": 3560 }, { "epoch": 1.49496644295302, "grad_norm": 0.466796875, "learning_rate": 0.0002412402698246216, "loss": 0.4976, "step": 3564 }, { "epoch": 1.4966442953020134, "grad_norm": 0.609375, "learning_rate": 0.0002411134258756918, "loss": 0.4269, "step": 3568 }, { "epoch": 1.4983221476510067, "grad_norm": 0.62109375, "learning_rate": 0.00024098647859399736, "loss": 0.6749, "step": 3572 }, { "epoch": 1.5, "grad_norm": 0.73828125, "learning_rate": 0.0002408594281235107, "loss": 0.4544, "step": 3576 }, { "epoch": 1.5016778523489933, "grad_norm": 0.59375, "learning_rate": 0.00024073227460832112, "loss": 0.5184, "step": 3580 }, { "epoch": 1.5033557046979866, "grad_norm": 0.6171875, "learning_rate": 0.0002406050181926349, "loss": 0.6094, "step": 3584 }, { "epoch": 1.5050335570469797, "grad_norm": 0.66015625, "learning_rate": 0.00024047765902077492, "loss": 0.4112, "step": 3588 }, { "epoch": 1.5067114093959733, "grad_norm": 0.70703125, "learning_rate": 0.00024035019723718068, "loss": 0.6192, "step": 3592 }, { "epoch": 1.5083892617449663, "grad_norm": 0.6953125, "learning_rate": 0.00024022263298640806, "loss": 0.7145, "step": 3596 }, { "epoch": 1.5100671140939599, "grad_norm": 0.51171875, "learning_rate": 0.00024009496641312912, "loss": 0.5474, "step": 3600 }, { "epoch": 1.511744966442953, "grad_norm": 0.48828125, "learning_rate": 0.00023996719766213194, "loss": 0.4226, "step": 3604 }, { "epoch": 1.5134228187919463, "grad_norm": 0.76953125, "learning_rate": 0.0002398393268783205, "loss": 0.3876, "step": 3608 }, { "epoch": 1.5151006711409396, "grad_norm": 0.6015625, "learning_rate": 0.00023971135420671448, "loss": 0.4131, "step": 3612 }, { "epoch": 1.516778523489933, "grad_norm": 0.50390625, "learning_rate": 0.0002395832797924492, "loss": 0.417, "step": 3616 }, { "epoch": 1.5184563758389262, "grad_norm": 0.57421875, "learning_rate": 0.0002394551037807752, "loss": 0.492, "step": 3620 }, { "epoch": 1.5201342281879193, "grad_norm": 0.5859375, "learning_rate": 0.00023932682631705842, "loss": 0.5419, "step": 3624 }, { "epoch": 1.5218120805369129, "grad_norm": 0.53125, "learning_rate": 0.00023919844754677973, "loss": 0.566, "step": 3628 }, { "epoch": 1.523489932885906, "grad_norm": 0.609375, "learning_rate": 0.00023906996761553496, "loss": 0.375, "step": 3632 }, { "epoch": 1.5251677852348995, "grad_norm": 0.50390625, "learning_rate": 0.00023894138666903468, "loss": 0.4264, "step": 3636 }, { "epoch": 1.5268456375838926, "grad_norm": 0.58984375, "learning_rate": 0.00023881270485310395, "loss": 0.4413, "step": 3640 }, { "epoch": 1.528523489932886, "grad_norm": 0.52734375, "learning_rate": 0.00023868392231368227, "loss": 0.6576, "step": 3644 }, { "epoch": 1.5302013422818792, "grad_norm": 0.6484375, "learning_rate": 0.00023855503919682344, "loss": 0.3061, "step": 3648 }, { "epoch": 1.5318791946308725, "grad_norm": 0.5390625, "learning_rate": 0.0002384260556486952, "loss": 0.403, "step": 3652 }, { "epoch": 1.5335570469798658, "grad_norm": 0.4765625, "learning_rate": 0.00023829697181557922, "loss": 0.4704, "step": 3656 }, { "epoch": 1.535234899328859, "grad_norm": 0.71484375, "learning_rate": 0.00023816778784387094, "loss": 0.5499, "step": 3660 }, { "epoch": 1.5369127516778525, "grad_norm": 0.53125, "learning_rate": 0.00023803850388007947, "loss": 0.4735, "step": 3664 }, { "epoch": 1.5385906040268456, "grad_norm": 0.6328125, "learning_rate": 0.00023790912007082702, "loss": 0.3983, "step": 3668 }, { "epoch": 1.540268456375839, "grad_norm": 0.6875, "learning_rate": 0.0002377796365628494, "loss": 0.4329, "step": 3672 }, { "epoch": 1.5419463087248322, "grad_norm": 0.62890625, "learning_rate": 0.00023765005350299525, "loss": 0.5901, "step": 3676 }, { "epoch": 1.5436241610738255, "grad_norm": 0.51953125, "learning_rate": 0.00023752037103822617, "loss": 0.4801, "step": 3680 }, { "epoch": 1.5453020134228188, "grad_norm": 0.5703125, "learning_rate": 0.00023739058931561654, "loss": 0.4827, "step": 3684 }, { "epoch": 1.5469798657718121, "grad_norm": 0.63671875, "learning_rate": 0.00023726070848235328, "loss": 0.5377, "step": 3688 }, { "epoch": 1.5486577181208054, "grad_norm": 0.578125, "learning_rate": 0.0002371307286857356, "loss": 0.4717, "step": 3692 }, { "epoch": 1.5503355704697985, "grad_norm": 0.51171875, "learning_rate": 0.0002370006500731752, "loss": 0.5291, "step": 3696 }, { "epoch": 1.552013422818792, "grad_norm": 0.515625, "learning_rate": 0.00023687047279219567, "loss": 0.4488, "step": 3700 }, { "epoch": 1.5536912751677852, "grad_norm": 0.61328125, "learning_rate": 0.00023674019699043252, "loss": 0.4304, "step": 3704 }, { "epoch": 1.5553691275167785, "grad_norm": 0.6015625, "learning_rate": 0.0002366098228156329, "loss": 0.5579, "step": 3708 }, { "epoch": 1.5570469798657718, "grad_norm": 0.5703125, "learning_rate": 0.0002364793504156558, "loss": 0.5693, "step": 3712 }, { "epoch": 1.558724832214765, "grad_norm": 0.50390625, "learning_rate": 0.00023634877993847133, "loss": 0.476, "step": 3716 }, { "epoch": 1.5604026845637584, "grad_norm": 0.55859375, "learning_rate": 0.00023621811153216104, "loss": 0.5084, "step": 3720 }, { "epoch": 1.5620805369127517, "grad_norm": 0.69140625, "learning_rate": 0.00023608734534491733, "loss": 0.5346, "step": 3724 }, { "epoch": 1.563758389261745, "grad_norm": 0.578125, "learning_rate": 0.00023595648152504366, "loss": 0.4944, "step": 3728 }, { "epoch": 1.5654362416107381, "grad_norm": 0.59375, "learning_rate": 0.0002358255202209542, "loss": 0.5323, "step": 3732 }, { "epoch": 1.5671140939597317, "grad_norm": 0.53515625, "learning_rate": 0.0002356944615811736, "loss": 0.445, "step": 3736 }, { "epoch": 1.5687919463087248, "grad_norm": 0.470703125, "learning_rate": 0.00023556330575433696, "loss": 0.5445, "step": 3740 }, { "epoch": 1.570469798657718, "grad_norm": 0.6484375, "learning_rate": 0.00023543205288918957, "loss": 0.4587, "step": 3744 }, { "epoch": 1.5721476510067114, "grad_norm": 0.5546875, "learning_rate": 0.0002353007031345868, "loss": 0.5247, "step": 3748 }, { "epoch": 1.5738255033557047, "grad_norm": 0.734375, "learning_rate": 0.00023516925663949383, "loss": 0.6116, "step": 3752 }, { "epoch": 1.575503355704698, "grad_norm": 0.484375, "learning_rate": 0.00023503771355298575, "loss": 0.36, "step": 3756 }, { "epoch": 1.5771812080536913, "grad_norm": 0.435546875, "learning_rate": 0.00023490607402424691, "loss": 0.3965, "step": 3760 }, { "epoch": 1.5788590604026846, "grad_norm": 0.5625, "learning_rate": 0.00023477433820257127, "loss": 0.5312, "step": 3764 }, { "epoch": 1.5805369127516777, "grad_norm": 0.5625, "learning_rate": 0.00023464250623736192, "loss": 0.4279, "step": 3768 }, { "epoch": 1.5822147651006713, "grad_norm": 0.55859375, "learning_rate": 0.00023451057827813095, "loss": 0.4593, "step": 3772 }, { "epoch": 1.5838926174496644, "grad_norm": 0.5703125, "learning_rate": 0.0002343785544744993, "loss": 0.5191, "step": 3776 }, { "epoch": 1.5855704697986577, "grad_norm": 0.60546875, "learning_rate": 0.0002342464349761968, "loss": 0.5816, "step": 3780 }, { "epoch": 1.587248322147651, "grad_norm": 0.6328125, "learning_rate": 0.0002341142199330615, "loss": 0.5066, "step": 3784 }, { "epoch": 1.5889261744966443, "grad_norm": 0.6328125, "learning_rate": 0.00023398190949504004, "loss": 0.436, "step": 3788 }, { "epoch": 1.5906040268456376, "grad_norm": 0.609375, "learning_rate": 0.0002338495038121872, "loss": 0.3622, "step": 3792 }, { "epoch": 1.5922818791946307, "grad_norm": 0.71484375, "learning_rate": 0.00023371700303466574, "loss": 0.4383, "step": 3796 }, { "epoch": 1.5939597315436242, "grad_norm": 0.54296875, "learning_rate": 0.00023358440731274625, "loss": 0.4881, "step": 3800 }, { "epoch": 1.5956375838926173, "grad_norm": 0.54296875, "learning_rate": 0.00023345171679680713, "loss": 0.4693, "step": 3804 }, { "epoch": 1.5973154362416109, "grad_norm": 0.470703125, "learning_rate": 0.00023331893163733414, "loss": 0.3743, "step": 3808 }, { "epoch": 1.598993288590604, "grad_norm": 0.58984375, "learning_rate": 0.00023318605198492038, "loss": 0.5592, "step": 3812 }, { "epoch": 1.6006711409395973, "grad_norm": 0.8359375, "learning_rate": 0.00023305307799026623, "loss": 0.5544, "step": 3816 }, { "epoch": 1.6023489932885906, "grad_norm": 0.73828125, "learning_rate": 0.00023292000980417897, "loss": 0.6613, "step": 3820 }, { "epoch": 1.604026845637584, "grad_norm": 0.482421875, "learning_rate": 0.00023278684757757276, "loss": 0.4431, "step": 3824 }, { "epoch": 1.6057046979865772, "grad_norm": 0.427734375, "learning_rate": 0.00023265359146146835, "loss": 0.5021, "step": 3828 }, { "epoch": 1.6073825503355703, "grad_norm": 0.55078125, "learning_rate": 0.00023252024160699304, "loss": 0.4475, "step": 3832 }, { "epoch": 1.6090604026845639, "grad_norm": 0.5078125, "learning_rate": 0.00023238679816538034, "loss": 0.4918, "step": 3836 }, { "epoch": 1.610738255033557, "grad_norm": 0.625, "learning_rate": 0.00023225326128797, "loss": 0.5082, "step": 3840 }, { "epoch": 1.6124161073825505, "grad_norm": 0.63671875, "learning_rate": 0.00023211963112620775, "loss": 0.503, "step": 3844 }, { "epoch": 1.6140939597315436, "grad_norm": 0.57421875, "learning_rate": 0.00023198590783164496, "loss": 0.4799, "step": 3848 }, { "epoch": 1.6157718120805369, "grad_norm": 0.427734375, "learning_rate": 0.00023185209155593883, "loss": 0.4281, "step": 3852 }, { "epoch": 1.6174496644295302, "grad_norm": 0.65625, "learning_rate": 0.00023171818245085185, "loss": 0.6759, "step": 3856 }, { "epoch": 1.6191275167785235, "grad_norm": 0.60546875, "learning_rate": 0.0002315841806682519, "loss": 0.5164, "step": 3860 }, { "epoch": 1.6208053691275168, "grad_norm": 0.490234375, "learning_rate": 0.00023145008636011187, "loss": 0.4527, "step": 3864 }, { "epoch": 1.62248322147651, "grad_norm": 0.56640625, "learning_rate": 0.00023131589967850962, "loss": 0.5335, "step": 3868 }, { "epoch": 1.6241610738255035, "grad_norm": 0.50390625, "learning_rate": 0.00023118162077562777, "loss": 0.479, "step": 3872 }, { "epoch": 1.6258389261744965, "grad_norm": 0.48828125, "learning_rate": 0.00023104724980375357, "loss": 0.4194, "step": 3876 }, { "epoch": 1.62751677852349, "grad_norm": 0.5546875, "learning_rate": 0.00023091278691527868, "loss": 0.4474, "step": 3880 }, { "epoch": 1.6291946308724832, "grad_norm": 0.5859375, "learning_rate": 0.00023077823226269893, "loss": 0.5023, "step": 3884 }, { "epoch": 1.6308724832214765, "grad_norm": 0.609375, "learning_rate": 0.00023064358599861432, "loss": 0.5248, "step": 3888 }, { "epoch": 1.6325503355704698, "grad_norm": 0.53515625, "learning_rate": 0.00023050884827572861, "loss": 0.406, "step": 3892 }, { "epoch": 1.6342281879194631, "grad_norm": 0.57421875, "learning_rate": 0.00023037401924684946, "loss": 0.6574, "step": 3896 }, { "epoch": 1.6359060402684564, "grad_norm": 0.5546875, "learning_rate": 0.000230239099064888, "loss": 0.4413, "step": 3900 }, { "epoch": 1.6375838926174495, "grad_norm": 0.4296875, "learning_rate": 0.00023010408788285864, "loss": 0.4538, "step": 3904 }, { "epoch": 1.639261744966443, "grad_norm": 0.478515625, "learning_rate": 0.00022996898585387915, "loss": 0.3937, "step": 3908 }, { "epoch": 1.6409395973154361, "grad_norm": 0.46484375, "learning_rate": 0.00022983379313117028, "loss": 0.3731, "step": 3912 }, { "epoch": 1.6426174496644297, "grad_norm": 0.55859375, "learning_rate": 0.0002296985098680556, "loss": 0.4868, "step": 3916 }, { "epoch": 1.6442953020134228, "grad_norm": 0.5, "learning_rate": 0.00022956313621796135, "loss": 0.5267, "step": 3920 }, { "epoch": 1.645973154362416, "grad_norm": 0.6015625, "learning_rate": 0.00022942767233441634, "loss": 0.5659, "step": 3924 }, { "epoch": 1.6476510067114094, "grad_norm": 0.6484375, "learning_rate": 0.00022929211837105176, "loss": 0.5864, "step": 3928 }, { "epoch": 1.6493288590604027, "grad_norm": 0.64453125, "learning_rate": 0.00022915647448160076, "loss": 0.5689, "step": 3932 }, { "epoch": 1.651006711409396, "grad_norm": 0.43359375, "learning_rate": 0.00022902074081989872, "loss": 0.3348, "step": 3936 }, { "epoch": 1.6526845637583891, "grad_norm": 0.5, "learning_rate": 0.00022888491753988264, "loss": 0.425, "step": 3940 }, { "epoch": 1.6543624161073827, "grad_norm": 0.609375, "learning_rate": 0.00022874900479559132, "loss": 0.4395, "step": 3944 }, { "epoch": 1.6560402684563758, "grad_norm": 0.69921875, "learning_rate": 0.00022861300274116484, "loss": 0.4854, "step": 3948 }, { "epoch": 1.6577181208053693, "grad_norm": 0.48046875, "learning_rate": 0.0002284769115308447, "loss": 0.454, "step": 3952 }, { "epoch": 1.6593959731543624, "grad_norm": 0.6640625, "learning_rate": 0.00022834073131897353, "loss": 0.5152, "step": 3956 }, { "epoch": 1.6610738255033557, "grad_norm": 0.65234375, "learning_rate": 0.00022820446225999474, "loss": 0.526, "step": 3960 }, { "epoch": 1.662751677852349, "grad_norm": 0.486328125, "learning_rate": 0.00022806810450845273, "loss": 0.5056, "step": 3964 }, { "epoch": 1.6644295302013423, "grad_norm": 0.578125, "learning_rate": 0.00022793165821899223, "loss": 0.7066, "step": 3968 }, { "epoch": 1.6661073825503356, "grad_norm": 0.52734375, "learning_rate": 0.00022779512354635862, "loss": 0.4905, "step": 3972 }, { "epoch": 1.6677852348993287, "grad_norm": 0.5703125, "learning_rate": 0.00022765850064539742, "loss": 0.4531, "step": 3976 }, { "epoch": 1.6694630872483223, "grad_norm": 0.45703125, "learning_rate": 0.0002275217896710541, "loss": 0.4455, "step": 3980 }, { "epoch": 1.6711409395973154, "grad_norm": 0.5390625, "learning_rate": 0.0002273849907783743, "loss": 0.4138, "step": 3984 }, { "epoch": 1.6728187919463087, "grad_norm": 0.55859375, "learning_rate": 0.00022724810412250293, "loss": 0.5325, "step": 3988 }, { "epoch": 1.674496644295302, "grad_norm": 0.484375, "learning_rate": 0.00022711112985868492, "loss": 0.412, "step": 3992 }, { "epoch": 1.6761744966442953, "grad_norm": 0.625, "learning_rate": 0.00022697406814226425, "loss": 0.5579, "step": 3996 }, { "epoch": 1.6778523489932886, "grad_norm": 0.51171875, "learning_rate": 0.00022683691912868411, "loss": 0.4497, "step": 4000 }, { "epoch": 1.679530201342282, "grad_norm": 0.42578125, "learning_rate": 0.0002266996829734868, "loss": 0.556, "step": 4004 }, { "epoch": 1.6812080536912752, "grad_norm": 0.765625, "learning_rate": 0.00022656235983231344, "loss": 0.5172, "step": 4008 }, { "epoch": 1.6828859060402683, "grad_norm": 0.47265625, "learning_rate": 0.0002264249498609036, "loss": 0.4466, "step": 4012 }, { "epoch": 1.6845637583892619, "grad_norm": 0.46875, "learning_rate": 0.00022628745321509567, "loss": 0.5438, "step": 4016 }, { "epoch": 1.686241610738255, "grad_norm": 0.71484375, "learning_rate": 0.00022614987005082596, "loss": 0.5737, "step": 4020 }, { "epoch": 1.6879194630872483, "grad_norm": 0.546875, "learning_rate": 0.00022601220052412925, "loss": 0.5506, "step": 4024 }, { "epoch": 1.6895973154362416, "grad_norm": 0.53515625, "learning_rate": 0.00022587444479113804, "loss": 0.6613, "step": 4028 }, { "epoch": 1.691275167785235, "grad_norm": 0.54296875, "learning_rate": 0.0002257366030080826, "loss": 0.475, "step": 4032 }, { "epoch": 1.6929530201342282, "grad_norm": 0.53125, "learning_rate": 0.00022559867533129092, "loss": 0.6255, "step": 4036 }, { "epoch": 1.6946308724832215, "grad_norm": 0.423828125, "learning_rate": 0.00022546066191718835, "loss": 0.3814, "step": 4040 }, { "epoch": 1.6963087248322148, "grad_norm": 0.6484375, "learning_rate": 0.0002253225629222974, "loss": 0.4607, "step": 4044 }, { "epoch": 1.697986577181208, "grad_norm": 0.5078125, "learning_rate": 0.00022518437850323778, "loss": 0.5456, "step": 4048 }, { "epoch": 1.6996644295302015, "grad_norm": 0.66796875, "learning_rate": 0.0002250461088167259, "loss": 0.5702, "step": 4052 }, { "epoch": 1.7013422818791946, "grad_norm": 0.48828125, "learning_rate": 0.00022490775401957504, "loss": 0.556, "step": 4056 }, { "epoch": 1.7030201342281879, "grad_norm": 0.5859375, "learning_rate": 0.00022476931426869496, "loss": 0.5298, "step": 4060 }, { "epoch": 1.7046979865771812, "grad_norm": 0.49609375, "learning_rate": 0.00022463078972109168, "loss": 0.5186, "step": 4064 }, { "epoch": 1.7063758389261745, "grad_norm": 0.416015625, "learning_rate": 0.00022449218053386747, "loss": 0.4619, "step": 4068 }, { "epoch": 1.7080536912751678, "grad_norm": 0.42578125, "learning_rate": 0.00022435348686422062, "loss": 0.3636, "step": 4072 }, { "epoch": 1.709731543624161, "grad_norm": 0.5, "learning_rate": 0.0002242147088694451, "loss": 0.4912, "step": 4076 }, { "epoch": 1.7114093959731544, "grad_norm": 0.423828125, "learning_rate": 0.00022407584670693063, "loss": 0.4437, "step": 4080 }, { "epoch": 1.7130872483221475, "grad_norm": 0.64453125, "learning_rate": 0.00022393690053416245, "loss": 0.5208, "step": 4084 }, { "epoch": 1.714765100671141, "grad_norm": 0.578125, "learning_rate": 0.0002237978705087208, "loss": 0.4311, "step": 4088 }, { "epoch": 1.7164429530201342, "grad_norm": 0.46875, "learning_rate": 0.00022365875678828138, "loss": 0.4618, "step": 4092 }, { "epoch": 1.7181208053691275, "grad_norm": 0.5390625, "learning_rate": 0.00022351955953061453, "loss": 0.5298, "step": 4096 }, { "epoch": 1.7197986577181208, "grad_norm": 0.50390625, "learning_rate": 0.0002233802788935855, "loss": 0.4769, "step": 4100 }, { "epoch": 1.721476510067114, "grad_norm": 0.60546875, "learning_rate": 0.000223240915035154, "loss": 0.4335, "step": 4104 }, { "epoch": 1.7231543624161074, "grad_norm": 0.59765625, "learning_rate": 0.0002231014681133741, "loss": 0.5497, "step": 4108 }, { "epoch": 1.7248322147651005, "grad_norm": 0.56640625, "learning_rate": 0.00022296193828639415, "loss": 0.6183, "step": 4112 }, { "epoch": 1.726510067114094, "grad_norm": 0.62890625, "learning_rate": 0.00022282232571245656, "loss": 0.4518, "step": 4116 }, { "epoch": 1.7281879194630871, "grad_norm": 0.6015625, "learning_rate": 0.0002226826305498975, "loss": 0.4448, "step": 4120 }, { "epoch": 1.7298657718120807, "grad_norm": 0.51171875, "learning_rate": 0.00022254285295714683, "loss": 0.4705, "step": 4124 }, { "epoch": 1.7315436241610738, "grad_norm": 0.46875, "learning_rate": 0.00022240299309272786, "loss": 0.4241, "step": 4128 }, { "epoch": 1.733221476510067, "grad_norm": 0.609375, "learning_rate": 0.00022226305111525726, "loss": 0.4936, "step": 4132 }, { "epoch": 1.7348993288590604, "grad_norm": 0.55078125, "learning_rate": 0.00022212302718344485, "loss": 0.5194, "step": 4136 }, { "epoch": 1.7365771812080537, "grad_norm": 0.51171875, "learning_rate": 0.00022198292145609328, "loss": 0.3134, "step": 4140 }, { "epoch": 1.738255033557047, "grad_norm": 0.73828125, "learning_rate": 0.00022184273409209813, "loss": 0.6054, "step": 4144 }, { "epoch": 1.7399328859060401, "grad_norm": 0.421875, "learning_rate": 0.00022170246525044733, "loss": 0.4806, "step": 4148 }, { "epoch": 1.7416107382550337, "grad_norm": 0.5859375, "learning_rate": 0.0002215621150902215, "loss": 0.5868, "step": 4152 }, { "epoch": 1.7432885906040267, "grad_norm": 0.5078125, "learning_rate": 0.00022142168377059325, "loss": 0.5169, "step": 4156 }, { "epoch": 1.7449664429530203, "grad_norm": 0.62890625, "learning_rate": 0.00022128117145082737, "loss": 0.6067, "step": 4160 }, { "epoch": 1.7466442953020134, "grad_norm": 0.6796875, "learning_rate": 0.00022114057829028042, "loss": 0.638, "step": 4164 }, { "epoch": 1.7483221476510067, "grad_norm": 0.5078125, "learning_rate": 0.0002209999044484008, "loss": 0.477, "step": 4168 }, { "epoch": 1.75, "grad_norm": 0.59765625, "learning_rate": 0.00022085915008472815, "loss": 0.4752, "step": 4172 }, { "epoch": 1.7516778523489933, "grad_norm": 0.5390625, "learning_rate": 0.00022071831535889366, "loss": 0.6064, "step": 4176 }, { "epoch": 1.7533557046979866, "grad_norm": 0.76953125, "learning_rate": 0.0002205774004306196, "loss": 0.5688, "step": 4180 }, { "epoch": 1.7550335570469797, "grad_norm": 0.6015625, "learning_rate": 0.00022043640545971915, "loss": 0.581, "step": 4184 }, { "epoch": 1.7567114093959733, "grad_norm": 0.5078125, "learning_rate": 0.00022029533060609636, "loss": 0.5297, "step": 4188 }, { "epoch": 1.7583892617449663, "grad_norm": 0.65625, "learning_rate": 0.00022015417602974573, "loss": 0.6405, "step": 4192 }, { "epoch": 1.7600671140939599, "grad_norm": 0.5390625, "learning_rate": 0.00022001294189075225, "loss": 0.6019, "step": 4196 }, { "epoch": 1.761744966442953, "grad_norm": 0.3828125, "learning_rate": 0.00021987162834929123, "loss": 0.4544, "step": 4200 }, { "epoch": 1.7634228187919463, "grad_norm": 0.71875, "learning_rate": 0.0002197302355656279, "loss": 0.7441, "step": 4204 }, { "epoch": 1.7651006711409396, "grad_norm": 0.4375, "learning_rate": 0.0002195887637001174, "loss": 0.5025, "step": 4208 }, { "epoch": 1.766778523489933, "grad_norm": 0.60546875, "learning_rate": 0.00021944721291320465, "loss": 0.4081, "step": 4212 }, { "epoch": 1.7684563758389262, "grad_norm": 0.51953125, "learning_rate": 0.00021930558336542388, "loss": 0.5282, "step": 4216 }, { "epoch": 1.7701342281879193, "grad_norm": 0.62109375, "learning_rate": 0.00021916387521739886, "loss": 0.5104, "step": 4220 }, { "epoch": 1.7718120805369129, "grad_norm": 0.52734375, "learning_rate": 0.00021902208862984235, "loss": 0.5055, "step": 4224 }, { "epoch": 1.773489932885906, "grad_norm": 0.57421875, "learning_rate": 0.0002188802237635561, "loss": 0.4425, "step": 4228 }, { "epoch": 1.7751677852348995, "grad_norm": 0.59375, "learning_rate": 0.00021873828077943072, "loss": 0.5321, "step": 4232 }, { "epoch": 1.7768456375838926, "grad_norm": 0.66796875, "learning_rate": 0.0002185962598384453, "loss": 0.4372, "step": 4236 }, { "epoch": 1.778523489932886, "grad_norm": 0.61328125, "learning_rate": 0.0002184541611016674, "loss": 0.3959, "step": 4240 }, { "epoch": 1.7802013422818792, "grad_norm": 0.50390625, "learning_rate": 0.0002183119847302528, "loss": 0.2845, "step": 4244 }, { "epoch": 1.7818791946308725, "grad_norm": 0.62890625, "learning_rate": 0.00021816973088544536, "loss": 0.511, "step": 4248 }, { "epoch": 1.7835570469798658, "grad_norm": 0.439453125, "learning_rate": 0.00021802739972857671, "loss": 0.5291, "step": 4252 }, { "epoch": 1.785234899328859, "grad_norm": 0.578125, "learning_rate": 0.00021788499142106623, "loss": 0.4543, "step": 4256 }, { "epoch": 1.7869127516778525, "grad_norm": 0.65625, "learning_rate": 0.00021774250612442087, "loss": 0.4043, "step": 4260 }, { "epoch": 1.7885906040268456, "grad_norm": 0.498046875, "learning_rate": 0.00021759994400023477, "loss": 0.4576, "step": 4264 }, { "epoch": 1.790268456375839, "grad_norm": 0.65234375, "learning_rate": 0.00021745730521018918, "loss": 0.4762, "step": 4268 }, { "epoch": 1.7919463087248322, "grad_norm": 0.6328125, "learning_rate": 0.00021731458991605242, "loss": 0.5246, "step": 4272 }, { "epoch": 1.7936241610738255, "grad_norm": 0.515625, "learning_rate": 0.00021717179827967955, "loss": 0.5178, "step": 4276 }, { "epoch": 1.7953020134228188, "grad_norm": 0.50390625, "learning_rate": 0.00021702893046301208, "loss": 0.508, "step": 4280 }, { "epoch": 1.7969798657718121, "grad_norm": 0.6484375, "learning_rate": 0.00021688598662807814, "loss": 0.5413, "step": 4284 }, { "epoch": 1.7986577181208054, "grad_norm": 0.63671875, "learning_rate": 0.0002167429669369918, "loss": 0.3597, "step": 4288 }, { "epoch": 1.8003355704697985, "grad_norm": 0.5390625, "learning_rate": 0.00021659987155195343, "loss": 0.5642, "step": 4292 }, { "epoch": 1.802013422818792, "grad_norm": 0.65234375, "learning_rate": 0.00021645670063524905, "loss": 0.4935, "step": 4296 }, { "epoch": 1.8036912751677852, "grad_norm": 0.55078125, "learning_rate": 0.00021631345434925046, "loss": 0.4889, "step": 4300 }, { "epoch": 1.8053691275167785, "grad_norm": 0.5390625, "learning_rate": 0.00021617013285641482, "loss": 0.5035, "step": 4304 }, { "epoch": 1.8070469798657718, "grad_norm": 0.5859375, "learning_rate": 0.00021602673631928475, "loss": 0.3912, "step": 4308 }, { "epoch": 1.808724832214765, "grad_norm": 0.54296875, "learning_rate": 0.0002158832649004878, "loss": 0.4925, "step": 4312 }, { "epoch": 1.8104026845637584, "grad_norm": 0.640625, "learning_rate": 0.00021573971876273656, "loss": 0.4948, "step": 4316 }, { "epoch": 1.8120805369127517, "grad_norm": 0.515625, "learning_rate": 0.00021559609806882831, "loss": 0.4422, "step": 4320 }, { "epoch": 1.813758389261745, "grad_norm": 0.54296875, "learning_rate": 0.0002154524029816449, "loss": 0.5595, "step": 4324 }, { "epoch": 1.8154362416107381, "grad_norm": 0.4609375, "learning_rate": 0.00021530863366415258, "loss": 0.4027, "step": 4328 }, { "epoch": 1.8171140939597317, "grad_norm": 0.51953125, "learning_rate": 0.0002151647902794017, "loss": 0.4171, "step": 4332 }, { "epoch": 1.8187919463087248, "grad_norm": 0.60546875, "learning_rate": 0.00021502087299052675, "loss": 0.4492, "step": 4336 }, { "epoch": 1.820469798657718, "grad_norm": 0.45703125, "learning_rate": 0.00021487688196074583, "loss": 0.4064, "step": 4340 }, { "epoch": 1.8221476510067114, "grad_norm": 0.7265625, "learning_rate": 0.00021473281735336091, "loss": 0.6201, "step": 4344 }, { "epoch": 1.8238255033557047, "grad_norm": 0.451171875, "learning_rate": 0.00021458867933175726, "loss": 0.4547, "step": 4348 }, { "epoch": 1.825503355704698, "grad_norm": 0.70703125, "learning_rate": 0.0002144444680594034, "loss": 0.586, "step": 4352 }, { "epoch": 1.8271812080536913, "grad_norm": 0.55078125, "learning_rate": 0.00021430018369985096, "loss": 0.3949, "step": 4356 }, { "epoch": 1.8288590604026846, "grad_norm": 0.50390625, "learning_rate": 0.00021415582641673452, "loss": 0.4313, "step": 4360 }, { "epoch": 1.8305369127516777, "grad_norm": 0.6015625, "learning_rate": 0.00021401139637377124, "loss": 0.5732, "step": 4364 }, { "epoch": 1.8322147651006713, "grad_norm": 0.486328125, "learning_rate": 0.00021386689373476087, "loss": 0.5632, "step": 4368 }, { "epoch": 1.8338926174496644, "grad_norm": 0.546875, "learning_rate": 0.00021372231866358555, "loss": 0.3843, "step": 4372 }, { "epoch": 1.8355704697986577, "grad_norm": 0.89453125, "learning_rate": 0.00021357767132420942, "loss": 0.5858, "step": 4376 }, { "epoch": 1.837248322147651, "grad_norm": 0.65625, "learning_rate": 0.0002134329518806787, "loss": 0.6213, "step": 4380 }, { "epoch": 1.8389261744966443, "grad_norm": 0.46875, "learning_rate": 0.0002132881604971213, "loss": 0.5251, "step": 4384 }, { "epoch": 1.8406040268456376, "grad_norm": 0.5625, "learning_rate": 0.0002131432973377468, "loss": 0.4842, "step": 4388 }, { "epoch": 1.8422818791946307, "grad_norm": 0.6640625, "learning_rate": 0.00021299836256684617, "loss": 0.5452, "step": 4392 }, { "epoch": 1.8439597315436242, "grad_norm": 0.5625, "learning_rate": 0.0002128533563487915, "loss": 0.4334, "step": 4396 }, { "epoch": 1.8456375838926173, "grad_norm": 0.52734375, "learning_rate": 0.00021270827884803606, "loss": 0.5084, "step": 4400 }, { "epoch": 1.8473154362416109, "grad_norm": 0.5546875, "learning_rate": 0.00021256313022911382, "loss": 0.4007, "step": 4404 }, { "epoch": 1.848993288590604, "grad_norm": 0.5546875, "learning_rate": 0.0002124179106566395, "loss": 0.3599, "step": 4408 }, { "epoch": 1.8506711409395973, "grad_norm": 0.58203125, "learning_rate": 0.0002122726202953082, "loss": 0.6829, "step": 4412 }, { "epoch": 1.8523489932885906, "grad_norm": 0.5546875, "learning_rate": 0.00021212725930989547, "loss": 0.3589, "step": 4416 }, { "epoch": 1.854026845637584, "grad_norm": 0.625, "learning_rate": 0.0002119818278652567, "loss": 0.4376, "step": 4420 }, { "epoch": 1.8557046979865772, "grad_norm": 0.462890625, "learning_rate": 0.00021183632612632744, "loss": 0.5166, "step": 4424 }, { "epoch": 1.8573825503355703, "grad_norm": 0.55859375, "learning_rate": 0.00021169075425812284, "loss": 0.4784, "step": 4428 }, { "epoch": 1.8590604026845639, "grad_norm": 0.6328125, "learning_rate": 0.00021154511242573755, "loss": 0.4797, "step": 4432 }, { "epoch": 1.860738255033557, "grad_norm": 0.51953125, "learning_rate": 0.00021139940079434568, "loss": 0.5271, "step": 4436 }, { "epoch": 1.8624161073825505, "grad_norm": 0.5546875, "learning_rate": 0.00021125361952920038, "loss": 0.4168, "step": 4440 }, { "epoch": 1.8640939597315436, "grad_norm": 0.5390625, "learning_rate": 0.00021110776879563388, "loss": 0.5385, "step": 4444 }, { "epoch": 1.8657718120805369, "grad_norm": 0.5625, "learning_rate": 0.00021096184875905708, "loss": 0.4093, "step": 4448 }, { "epoch": 1.8674496644295302, "grad_norm": 0.48828125, "learning_rate": 0.0002108158595849596, "loss": 0.3512, "step": 4452 }, { "epoch": 1.8691275167785235, "grad_norm": 0.44921875, "learning_rate": 0.00021066980143890935, "loss": 0.378, "step": 4456 }, { "epoch": 1.8708053691275168, "grad_norm": 0.8515625, "learning_rate": 0.00021052367448655258, "loss": 0.6073, "step": 4460 }, { "epoch": 1.87248322147651, "grad_norm": 0.6328125, "learning_rate": 0.00021037747889361347, "loss": 0.4326, "step": 4464 }, { "epoch": 1.8741610738255035, "grad_norm": 0.5234375, "learning_rate": 0.00021023121482589412, "loss": 0.4513, "step": 4468 }, { "epoch": 1.8758389261744965, "grad_norm": 0.54296875, "learning_rate": 0.00021008488244927425, "loss": 0.5755, "step": 4472 }, { "epoch": 1.87751677852349, "grad_norm": 0.48828125, "learning_rate": 0.0002099384819297111, "loss": 0.4752, "step": 4476 }, { "epoch": 1.8791946308724832, "grad_norm": 0.5078125, "learning_rate": 0.00020979201343323907, "loss": 0.443, "step": 4480 }, { "epoch": 1.8808724832214765, "grad_norm": 0.56640625, "learning_rate": 0.00020964547712596979, "loss": 0.3393, "step": 4484 }, { "epoch": 1.8825503355704698, "grad_norm": 0.515625, "learning_rate": 0.00020949887317409175, "loss": 0.4577, "step": 4488 }, { "epoch": 1.8842281879194631, "grad_norm": 0.50390625, "learning_rate": 0.0002093522017438701, "loss": 0.513, "step": 4492 }, { "epoch": 1.8859060402684564, "grad_norm": 0.53125, "learning_rate": 0.00020920546300164658, "loss": 0.4592, "step": 4496 }, { "epoch": 1.8875838926174495, "grad_norm": 0.52734375, "learning_rate": 0.00020905865711383934, "loss": 0.6192, "step": 4500 }, { "epoch": 1.889261744966443, "grad_norm": 0.5625, "learning_rate": 0.00020891178424694242, "loss": 0.5337, "step": 4504 }, { "epoch": 1.8909395973154361, "grad_norm": 0.5234375, "learning_rate": 0.00020876484456752614, "loss": 0.5276, "step": 4508 }, { "epoch": 1.8926174496644297, "grad_norm": 0.470703125, "learning_rate": 0.0002086178382422364, "loss": 0.5645, "step": 4512 }, { "epoch": 1.8942953020134228, "grad_norm": 0.486328125, "learning_rate": 0.0002084707654377947, "loss": 0.4891, "step": 4516 }, { "epoch": 1.895973154362416, "grad_norm": 0.453125, "learning_rate": 0.0002083236263209981, "loss": 0.479, "step": 4520 }, { "epoch": 1.8976510067114094, "grad_norm": 0.5859375, "learning_rate": 0.00020817642105871857, "loss": 0.4341, "step": 4524 }, { "epoch": 1.8993288590604027, "grad_norm": 0.49609375, "learning_rate": 0.00020802914981790332, "loss": 0.3891, "step": 4528 }, { "epoch": 1.901006711409396, "grad_norm": 0.498046875, "learning_rate": 0.00020788181276557436, "loss": 0.4094, "step": 4532 }, { "epoch": 1.9026845637583891, "grad_norm": 0.51171875, "learning_rate": 0.00020773441006882823, "loss": 0.3939, "step": 4536 }, { "epoch": 1.9043624161073827, "grad_norm": 0.5625, "learning_rate": 0.00020758694189483608, "loss": 0.4212, "step": 4540 }, { "epoch": 1.9060402684563758, "grad_norm": 0.51171875, "learning_rate": 0.0002074394084108432, "loss": 0.4294, "step": 4544 }, { "epoch": 1.9077181208053693, "grad_norm": 0.65234375, "learning_rate": 0.00020729180978416888, "loss": 0.4997, "step": 4548 }, { "epoch": 1.9093959731543624, "grad_norm": 0.384765625, "learning_rate": 0.0002071441461822066, "loss": 0.4848, "step": 4552 }, { "epoch": 1.9110738255033557, "grad_norm": 0.53515625, "learning_rate": 0.00020699641777242308, "loss": 0.4425, "step": 4556 }, { "epoch": 1.912751677852349, "grad_norm": 0.5078125, "learning_rate": 0.00020684862472235895, "loss": 0.4492, "step": 4560 }, { "epoch": 1.9144295302013423, "grad_norm": 0.5546875, "learning_rate": 0.00020670076719962795, "loss": 0.5043, "step": 4564 }, { "epoch": 1.9161073825503356, "grad_norm": 0.453125, "learning_rate": 0.00020655284537191688, "loss": 0.4197, "step": 4568 }, { "epoch": 1.9177852348993287, "grad_norm": 0.53515625, "learning_rate": 0.0002064048594069856, "loss": 0.412, "step": 4572 }, { "epoch": 1.9194630872483223, "grad_norm": 0.70703125, "learning_rate": 0.00020625680947266672, "loss": 0.4999, "step": 4576 }, { "epoch": 1.9211409395973154, "grad_norm": 0.5703125, "learning_rate": 0.00020610869573686524, "loss": 0.6804, "step": 4580 }, { "epoch": 1.9228187919463087, "grad_norm": 0.56640625, "learning_rate": 0.0002059605183675587, "loss": 0.4828, "step": 4584 }, { "epoch": 1.924496644295302, "grad_norm": 0.59375, "learning_rate": 0.00020581227753279664, "loss": 0.5228, "step": 4588 }, { "epoch": 1.9261744966442953, "grad_norm": 0.47265625, "learning_rate": 0.00020566397340070078, "loss": 0.4269, "step": 4592 }, { "epoch": 1.9278523489932886, "grad_norm": 0.63671875, "learning_rate": 0.00020551560613946444, "loss": 0.4973, "step": 4596 }, { "epoch": 1.929530201342282, "grad_norm": 0.54296875, "learning_rate": 0.0002053671759173526, "loss": 0.4928, "step": 4600 }, { "epoch": 1.9312080536912752, "grad_norm": 0.59765625, "learning_rate": 0.0002052186829027017, "loss": 0.5533, "step": 4604 }, { "epoch": 1.9328859060402683, "grad_norm": 0.546875, "learning_rate": 0.00020507012726391928, "loss": 0.423, "step": 4608 }, { "epoch": 1.9345637583892619, "grad_norm": 0.56640625, "learning_rate": 0.00020492150916948397, "loss": 0.4657, "step": 4612 }, { "epoch": 1.936241610738255, "grad_norm": 0.58984375, "learning_rate": 0.00020477282878794532, "loss": 0.5397, "step": 4616 }, { "epoch": 1.9379194630872483, "grad_norm": 0.494140625, "learning_rate": 0.00020462408628792332, "loss": 0.5182, "step": 4620 }, { "epoch": 1.9395973154362416, "grad_norm": 0.5625, "learning_rate": 0.00020447528183810857, "loss": 0.5173, "step": 4624 }, { "epoch": 1.941275167785235, "grad_norm": 0.5234375, "learning_rate": 0.0002043264156072619, "loss": 0.3352, "step": 4628 }, { "epoch": 1.9429530201342282, "grad_norm": 0.60546875, "learning_rate": 0.00020417748776421408, "loss": 0.4949, "step": 4632 }, { "epoch": 1.9446308724832215, "grad_norm": 0.408203125, "learning_rate": 0.00020402849847786598, "loss": 0.4904, "step": 4636 }, { "epoch": 1.9463087248322148, "grad_norm": 0.58984375, "learning_rate": 0.00020387944791718795, "loss": 0.5267, "step": 4640 }, { "epoch": 1.947986577181208, "grad_norm": 0.51953125, "learning_rate": 0.00020373033625121996, "loss": 0.6703, "step": 4644 }, { "epoch": 1.9496644295302015, "grad_norm": 0.328125, "learning_rate": 0.00020358116364907125, "loss": 0.4166, "step": 4648 }, { "epoch": 1.9513422818791946, "grad_norm": 0.54296875, "learning_rate": 0.00020343193027992006, "loss": 0.513, "step": 4652 }, { "epoch": 1.9530201342281879, "grad_norm": 0.56640625, "learning_rate": 0.0002032826363130137, "loss": 0.4994, "step": 4656 }, { "epoch": 1.9546979865771812, "grad_norm": 0.56640625, "learning_rate": 0.00020313328191766815, "loss": 0.5255, "step": 4660 }, { "epoch": 1.9563758389261745, "grad_norm": 0.578125, "learning_rate": 0.0002029838672632679, "loss": 0.4096, "step": 4664 }, { "epoch": 1.9580536912751678, "grad_norm": 0.53125, "learning_rate": 0.0002028343925192658, "loss": 0.5064, "step": 4668 }, { "epoch": 1.959731543624161, "grad_norm": 0.703125, "learning_rate": 0.00020268485785518287, "loss": 0.5973, "step": 4672 }, { "epoch": 1.9614093959731544, "grad_norm": 0.5078125, "learning_rate": 0.00020253526344060798, "loss": 0.6148, "step": 4676 }, { "epoch": 1.9630872483221475, "grad_norm": 0.5390625, "learning_rate": 0.00020238560944519794, "loss": 0.493, "step": 4680 }, { "epoch": 1.964765100671141, "grad_norm": 0.5, "learning_rate": 0.00020223589603867698, "loss": 0.4042, "step": 4684 }, { "epoch": 1.9664429530201342, "grad_norm": 0.65234375, "learning_rate": 0.00020208612339083678, "loss": 0.4162, "step": 4688 }, { "epoch": 1.9681208053691275, "grad_norm": 0.53515625, "learning_rate": 0.00020193629167153623, "loss": 0.4693, "step": 4692 }, { "epoch": 1.9697986577181208, "grad_norm": 0.5703125, "learning_rate": 0.00020178640105070103, "loss": 0.4774, "step": 4696 }, { "epoch": 1.971476510067114, "grad_norm": 0.7265625, "learning_rate": 0.000201636451698324, "loss": 0.6057, "step": 4700 }, { "epoch": 1.9731543624161074, "grad_norm": 0.380859375, "learning_rate": 0.00020148644378446432, "loss": 0.38, "step": 4704 }, { "epoch": 1.9748322147651005, "grad_norm": 0.51953125, "learning_rate": 0.00020133637747924763, "loss": 0.5664, "step": 4708 }, { "epoch": 1.976510067114094, "grad_norm": 0.61328125, "learning_rate": 0.00020118625295286583, "loss": 0.424, "step": 4712 }, { "epoch": 1.9781879194630871, "grad_norm": 0.578125, "learning_rate": 0.0002010360703755769, "loss": 0.549, "step": 4716 }, { "epoch": 1.9798657718120807, "grad_norm": 0.455078125, "learning_rate": 0.0002008858299177045, "loss": 0.39, "step": 4720 }, { "epoch": 1.9815436241610738, "grad_norm": 0.5859375, "learning_rate": 0.00020073553174963815, "loss": 0.386, "step": 4724 }, { "epoch": 1.983221476510067, "grad_norm": 0.498046875, "learning_rate": 0.0002005851760418326, "loss": 0.6946, "step": 4728 }, { "epoch": 1.9848993288590604, "grad_norm": 0.546875, "learning_rate": 0.000200434762964808, "loss": 0.6034, "step": 4732 }, { "epoch": 1.9865771812080537, "grad_norm": 0.60546875, "learning_rate": 0.00020028429268914946, "loss": 0.529, "step": 4736 }, { "epoch": 1.988255033557047, "grad_norm": 0.625, "learning_rate": 0.00020013376538550708, "loss": 0.5584, "step": 4740 }, { "epoch": 1.9899328859060401, "grad_norm": 0.462890625, "learning_rate": 0.0001999831812245956, "loss": 0.4823, "step": 4744 }, { "epoch": 1.9916107382550337, "grad_norm": 0.46484375, "learning_rate": 0.00019983254037719413, "loss": 0.5077, "step": 4748 }, { "epoch": 1.9932885906040267, "grad_norm": 0.54296875, "learning_rate": 0.0001996818430141462, "loss": 0.4551, "step": 4752 }, { "epoch": 1.9949664429530203, "grad_norm": 0.6640625, "learning_rate": 0.00019953108930635937, "loss": 0.4814, "step": 4756 }, { "epoch": 1.9966442953020134, "grad_norm": 0.57421875, "learning_rate": 0.00019938027942480516, "loss": 0.3257, "step": 4760 }, { "epoch": 1.9983221476510067, "grad_norm": 0.455078125, "learning_rate": 0.00019922941354051874, "loss": 0.4984, "step": 4764 }, { "epoch": 2.0, "grad_norm": 0.6328125, "learning_rate": 0.0001990784918245988, "loss": 0.4056, "step": 4768 }, { "epoch": 2.001677852348993, "grad_norm": 0.373046875, "learning_rate": 0.00019892751444820737, "loss": 0.2791, "step": 4772 }, { "epoch": 2.0033557046979866, "grad_norm": 0.4765625, "learning_rate": 0.00019877648158256965, "loss": 0.3686, "step": 4776 }, { "epoch": 2.0050335570469797, "grad_norm": 0.46875, "learning_rate": 0.0001986253933989736, "loss": 0.3055, "step": 4780 }, { "epoch": 2.0067114093959733, "grad_norm": 0.53125, "learning_rate": 0.00019847425006877008, "loss": 0.3713, "step": 4784 }, { "epoch": 2.0083892617449663, "grad_norm": 0.53125, "learning_rate": 0.00019832305176337256, "loss": 0.3755, "step": 4788 }, { "epoch": 2.01006711409396, "grad_norm": 0.34765625, "learning_rate": 0.00019817179865425658, "loss": 0.3769, "step": 4792 }, { "epoch": 2.011744966442953, "grad_norm": 0.52734375, "learning_rate": 0.00019802049091296008, "loss": 0.5192, "step": 4796 }, { "epoch": 2.0134228187919465, "grad_norm": 0.578125, "learning_rate": 0.00019786912871108285, "loss": 0.2641, "step": 4800 }, { "epoch": 2.0151006711409396, "grad_norm": 0.474609375, "learning_rate": 0.00019771771222028652, "loss": 0.3044, "step": 4804 }, { "epoch": 2.0167785234899327, "grad_norm": 0.55859375, "learning_rate": 0.00019756624161229423, "loss": 0.2614, "step": 4808 }, { "epoch": 2.0184563758389262, "grad_norm": 0.515625, "learning_rate": 0.0001974147170588904, "loss": 0.3519, "step": 4812 }, { "epoch": 2.0201342281879193, "grad_norm": 0.5078125, "learning_rate": 0.00019726313873192087, "loss": 0.3157, "step": 4816 }, { "epoch": 2.021812080536913, "grad_norm": 0.5546875, "learning_rate": 0.00019711150680329232, "loss": 0.2888, "step": 4820 }, { "epoch": 2.023489932885906, "grad_norm": 0.48828125, "learning_rate": 0.00019695982144497215, "loss": 0.3318, "step": 4824 }, { "epoch": 2.0251677852348995, "grad_norm": 0.53515625, "learning_rate": 0.0001968080828289885, "loss": 0.2986, "step": 4828 }, { "epoch": 2.0268456375838926, "grad_norm": 0.482421875, "learning_rate": 0.0001966562911274298, "loss": 0.2018, "step": 4832 }, { "epoch": 2.028523489932886, "grad_norm": 0.349609375, "learning_rate": 0.00019650444651244476, "loss": 0.26, "step": 4836 }, { "epoch": 2.030201342281879, "grad_norm": 0.5625, "learning_rate": 0.0001963525491562421, "loss": 0.337, "step": 4840 }, { "epoch": 2.0318791946308723, "grad_norm": 0.53125, "learning_rate": 0.00019620059923109028, "loss": 0.2168, "step": 4844 }, { "epoch": 2.033557046979866, "grad_norm": 0.498046875, "learning_rate": 0.00019604859690931743, "loss": 0.3588, "step": 4848 }, { "epoch": 2.035234899328859, "grad_norm": 0.54296875, "learning_rate": 0.00019589654236331117, "loss": 0.3051, "step": 4852 }, { "epoch": 2.0369127516778525, "grad_norm": 0.416015625, "learning_rate": 0.00019574443576551813, "loss": 0.3502, "step": 4856 }, { "epoch": 2.0385906040268456, "grad_norm": 0.62890625, "learning_rate": 0.00019559227728844423, "loss": 0.3462, "step": 4860 }, { "epoch": 2.040268456375839, "grad_norm": 0.53515625, "learning_rate": 0.00019544006710465415, "loss": 0.4165, "step": 4864 }, { "epoch": 2.041946308724832, "grad_norm": 0.53125, "learning_rate": 0.00019528780538677107, "loss": 0.4001, "step": 4868 }, { "epoch": 2.0436241610738257, "grad_norm": 0.51953125, "learning_rate": 0.00019513549230747683, "loss": 0.2988, "step": 4872 }, { "epoch": 2.045302013422819, "grad_norm": 0.54296875, "learning_rate": 0.0001949831280395113, "loss": 0.3333, "step": 4876 }, { "epoch": 2.046979865771812, "grad_norm": 0.5390625, "learning_rate": 0.00019483071275567252, "loss": 0.42, "step": 4880 }, { "epoch": 2.0486577181208054, "grad_norm": 0.51953125, "learning_rate": 0.00019467824662881654, "loss": 0.2445, "step": 4884 }, { "epoch": 2.0503355704697985, "grad_norm": 0.46875, "learning_rate": 0.00019452572983185673, "loss": 0.5153, "step": 4888 }, { "epoch": 2.052013422818792, "grad_norm": 0.5859375, "learning_rate": 0.00019437316253776422, "loss": 0.3658, "step": 4892 }, { "epoch": 2.053691275167785, "grad_norm": 0.44921875, "learning_rate": 0.0001942205449195673, "loss": 0.3623, "step": 4896 }, { "epoch": 2.0553691275167787, "grad_norm": 0.388671875, "learning_rate": 0.0001940678771503513, "loss": 0.4514, "step": 4900 }, { "epoch": 2.057046979865772, "grad_norm": 0.51171875, "learning_rate": 0.00019391515940325843, "loss": 0.3856, "step": 4904 }, { "epoch": 2.0587248322147653, "grad_norm": 0.75, "learning_rate": 0.00019376239185148767, "loss": 0.4237, "step": 4908 }, { "epoch": 2.0604026845637584, "grad_norm": 0.423828125, "learning_rate": 0.0001936095746682944, "loss": 0.3115, "step": 4912 }, { "epoch": 2.0620805369127515, "grad_norm": 0.4453125, "learning_rate": 0.0001934567080269903, "loss": 0.3624, "step": 4916 }, { "epoch": 2.063758389261745, "grad_norm": 0.5234375, "learning_rate": 0.00019330379210094313, "loss": 0.3633, "step": 4920 }, { "epoch": 2.065436241610738, "grad_norm": 0.451171875, "learning_rate": 0.00019315082706357665, "loss": 0.2783, "step": 4924 }, { "epoch": 2.0671140939597317, "grad_norm": 0.498046875, "learning_rate": 0.00019299781308837016, "loss": 0.3568, "step": 4928 }, { "epoch": 2.0687919463087248, "grad_norm": 0.57421875, "learning_rate": 0.00019284475034885858, "loss": 0.2982, "step": 4932 }, { "epoch": 2.0704697986577183, "grad_norm": 0.59375, "learning_rate": 0.0001926916390186321, "loss": 0.2902, "step": 4936 }, { "epoch": 2.0721476510067114, "grad_norm": 0.4609375, "learning_rate": 0.00019253847927133593, "loss": 0.2284, "step": 4940 }, { "epoch": 2.073825503355705, "grad_norm": 0.5078125, "learning_rate": 0.00019238527128067034, "loss": 0.3508, "step": 4944 }, { "epoch": 2.075503355704698, "grad_norm": 0.35546875, "learning_rate": 0.00019223201522039027, "loss": 0.3114, "step": 4948 }, { "epoch": 2.077181208053691, "grad_norm": 0.4921875, "learning_rate": 0.000192078711264305, "loss": 0.3435, "step": 4952 }, { "epoch": 2.0788590604026846, "grad_norm": 0.6640625, "learning_rate": 0.00019192535958627844, "loss": 0.2736, "step": 4956 }, { "epoch": 2.0805369127516777, "grad_norm": 0.5078125, "learning_rate": 0.00019177196036022838, "loss": 0.3174, "step": 4960 }, { "epoch": 2.0822147651006713, "grad_norm": 0.51171875, "learning_rate": 0.00019161851376012663, "loss": 0.3621, "step": 4964 }, { "epoch": 2.0838926174496644, "grad_norm": 0.60546875, "learning_rate": 0.00019146501995999873, "loss": 0.326, "step": 4968 }, { "epoch": 2.085570469798658, "grad_norm": 0.37109375, "learning_rate": 0.00019131147913392367, "loss": 0.2677, "step": 4972 }, { "epoch": 2.087248322147651, "grad_norm": 0.5625, "learning_rate": 0.0001911578914560339, "loss": 0.3106, "step": 4976 }, { "epoch": 2.088926174496644, "grad_norm": 0.51171875, "learning_rate": 0.0001910042571005149, "loss": 0.3112, "step": 4980 }, { "epoch": 2.0906040268456376, "grad_norm": 0.48046875, "learning_rate": 0.0001908505762416052, "loss": 0.2405, "step": 4984 }, { "epoch": 2.0922818791946307, "grad_norm": 0.58203125, "learning_rate": 0.00019069684905359584, "loss": 0.3924, "step": 4988 }, { "epoch": 2.0939597315436242, "grad_norm": 0.51953125, "learning_rate": 0.0001905430757108307, "loss": 0.4061, "step": 4992 }, { "epoch": 2.0956375838926173, "grad_norm": 0.484375, "learning_rate": 0.00019038925638770583, "loss": 0.2961, "step": 4996 }, { "epoch": 2.097315436241611, "grad_norm": 0.4921875, "learning_rate": 0.00019023539125866942, "loss": 0.3272, "step": 5000 }, { "epoch": 2.098993288590604, "grad_norm": 0.474609375, "learning_rate": 0.00019008148049822168, "loss": 0.4633, "step": 5004 }, { "epoch": 2.1006711409395975, "grad_norm": 0.5859375, "learning_rate": 0.00018992752428091455, "loss": 0.3473, "step": 5008 }, { "epoch": 2.1023489932885906, "grad_norm": 0.625, "learning_rate": 0.0001897735227813515, "loss": 0.4137, "step": 5012 }, { "epoch": 2.1040268456375837, "grad_norm": 0.48046875, "learning_rate": 0.0001896194761741874, "loss": 0.3519, "step": 5016 }, { "epoch": 2.1057046979865772, "grad_norm": 0.61328125, "learning_rate": 0.00018946538463412814, "loss": 0.6012, "step": 5020 }, { "epoch": 2.1073825503355703, "grad_norm": 0.50390625, "learning_rate": 0.00018931124833593084, "loss": 0.3197, "step": 5024 }, { "epoch": 2.109060402684564, "grad_norm": 0.51171875, "learning_rate": 0.00018915706745440308, "loss": 0.3037, "step": 5028 }, { "epoch": 2.110738255033557, "grad_norm": 0.5390625, "learning_rate": 0.00018900284216440325, "loss": 0.2634, "step": 5032 }, { "epoch": 2.1124161073825505, "grad_norm": 0.546875, "learning_rate": 0.00018884857264083984, "loss": 0.4427, "step": 5036 }, { "epoch": 2.1140939597315436, "grad_norm": 0.486328125, "learning_rate": 0.00018869425905867175, "loss": 0.3346, "step": 5040 }, { "epoch": 2.115771812080537, "grad_norm": 0.57421875, "learning_rate": 0.0001885399015929078, "loss": 0.4476, "step": 5044 }, { "epoch": 2.11744966442953, "grad_norm": 0.5546875, "learning_rate": 0.00018838550041860643, "loss": 0.4035, "step": 5048 }, { "epoch": 2.1191275167785233, "grad_norm": 0.44921875, "learning_rate": 0.00018823105571087582, "loss": 0.2567, "step": 5052 }, { "epoch": 2.120805369127517, "grad_norm": 0.490234375, "learning_rate": 0.0001880765676448735, "loss": 0.4139, "step": 5056 }, { "epoch": 2.12248322147651, "grad_norm": 0.578125, "learning_rate": 0.00018792203639580603, "loss": 0.3638, "step": 5060 }, { "epoch": 2.1241610738255035, "grad_norm": 0.55859375, "learning_rate": 0.0001877674621389291, "loss": 0.3847, "step": 5064 }, { "epoch": 2.1258389261744965, "grad_norm": 0.515625, "learning_rate": 0.00018761284504954708, "loss": 0.3381, "step": 5068 }, { "epoch": 2.12751677852349, "grad_norm": 0.515625, "learning_rate": 0.000187458185303013, "loss": 0.31, "step": 5072 }, { "epoch": 2.129194630872483, "grad_norm": 0.515625, "learning_rate": 0.00018730348307472824, "loss": 0.4449, "step": 5076 }, { "epoch": 2.1308724832214767, "grad_norm": 0.50390625, "learning_rate": 0.00018714873854014225, "loss": 0.3364, "step": 5080 }, { "epoch": 2.13255033557047, "grad_norm": 0.498046875, "learning_rate": 0.00018699395187475265, "loss": 0.2797, "step": 5084 }, { "epoch": 2.134228187919463, "grad_norm": 0.5234375, "learning_rate": 0.00018683912325410468, "loss": 0.2883, "step": 5088 }, { "epoch": 2.1359060402684564, "grad_norm": 0.57421875, "learning_rate": 0.00018668425285379127, "loss": 0.256, "step": 5092 }, { "epoch": 2.1375838926174495, "grad_norm": 0.51171875, "learning_rate": 0.00018652934084945267, "loss": 0.4092, "step": 5096 }, { "epoch": 2.139261744966443, "grad_norm": 0.63671875, "learning_rate": 0.00018637438741677631, "loss": 0.3676, "step": 5100 }, { "epoch": 2.140939597315436, "grad_norm": 0.6953125, "learning_rate": 0.0001862193927314967, "loss": 0.4648, "step": 5104 }, { "epoch": 2.1426174496644297, "grad_norm": 0.39453125, "learning_rate": 0.00018606435696939501, "loss": 0.2864, "step": 5108 }, { "epoch": 2.1442953020134228, "grad_norm": 0.57421875, "learning_rate": 0.00018590928030629903, "loss": 0.3711, "step": 5112 }, { "epoch": 2.1459731543624163, "grad_norm": 0.390625, "learning_rate": 0.00018575416291808297, "loss": 0.2486, "step": 5116 }, { "epoch": 2.1476510067114094, "grad_norm": 0.66015625, "learning_rate": 0.00018559900498066724, "loss": 0.2859, "step": 5120 }, { "epoch": 2.1493288590604025, "grad_norm": 0.5234375, "learning_rate": 0.00018544380667001814, "loss": 0.3679, "step": 5124 }, { "epoch": 2.151006711409396, "grad_norm": 0.46484375, "learning_rate": 0.00018528856816214787, "loss": 0.3888, "step": 5128 }, { "epoch": 2.152684563758389, "grad_norm": 0.3515625, "learning_rate": 0.00018513328963311424, "loss": 0.2705, "step": 5132 }, { "epoch": 2.1543624161073827, "grad_norm": 0.73046875, "learning_rate": 0.00018497797125902022, "loss": 0.3243, "step": 5136 }, { "epoch": 2.1560402684563758, "grad_norm": 0.54296875, "learning_rate": 0.00018482261321601433, "loss": 0.3515, "step": 5140 }, { "epoch": 2.1577181208053693, "grad_norm": 0.578125, "learning_rate": 0.0001846672156802897, "loss": 0.4485, "step": 5144 }, { "epoch": 2.1593959731543624, "grad_norm": 0.60546875, "learning_rate": 0.0001845117788280845, "loss": 0.2976, "step": 5148 }, { "epoch": 2.1610738255033555, "grad_norm": 0.515625, "learning_rate": 0.0001843563028356815, "loss": 0.3295, "step": 5152 }, { "epoch": 2.162751677852349, "grad_norm": 0.58984375, "learning_rate": 0.00018420078787940764, "loss": 0.3624, "step": 5156 }, { "epoch": 2.164429530201342, "grad_norm": 0.609375, "learning_rate": 0.00018404523413563425, "loss": 0.2818, "step": 5160 }, { "epoch": 2.1661073825503356, "grad_norm": 0.59765625, "learning_rate": 0.00018388964178077662, "loss": 0.2967, "step": 5164 }, { "epoch": 2.1677852348993287, "grad_norm": 0.58203125, "learning_rate": 0.00018373401099129374, "loss": 0.5096, "step": 5168 }, { "epoch": 2.1694630872483223, "grad_norm": 0.466796875, "learning_rate": 0.00018357834194368823, "loss": 0.3049, "step": 5172 }, { "epoch": 2.1711409395973154, "grad_norm": 0.59375, "learning_rate": 0.00018342263481450617, "loss": 0.4255, "step": 5176 }, { "epoch": 2.172818791946309, "grad_norm": 0.51953125, "learning_rate": 0.00018326688978033678, "loss": 0.2723, "step": 5180 }, { "epoch": 2.174496644295302, "grad_norm": 0.51953125, "learning_rate": 0.00018311110701781222, "loss": 0.4863, "step": 5184 }, { "epoch": 2.176174496644295, "grad_norm": 0.546875, "learning_rate": 0.00018295528670360747, "loss": 0.4002, "step": 5188 }, { "epoch": 2.1778523489932886, "grad_norm": 0.5859375, "learning_rate": 0.0001827994290144402, "loss": 0.4429, "step": 5192 }, { "epoch": 2.1795302013422817, "grad_norm": 0.474609375, "learning_rate": 0.00018264353412707026, "loss": 0.3377, "step": 5196 }, { "epoch": 2.1812080536912752, "grad_norm": 0.4921875, "learning_rate": 0.0001824876022182998, "loss": 0.4838, "step": 5200 }, { "epoch": 2.1828859060402683, "grad_norm": 0.36328125, "learning_rate": 0.00018233163346497304, "loss": 0.3429, "step": 5204 }, { "epoch": 2.184563758389262, "grad_norm": 0.59765625, "learning_rate": 0.00018217562804397587, "loss": 0.3521, "step": 5208 }, { "epoch": 2.186241610738255, "grad_norm": 0.478515625, "learning_rate": 0.00018201958613223572, "loss": 0.3518, "step": 5212 }, { "epoch": 2.1879194630872485, "grad_norm": 0.56640625, "learning_rate": 0.00018186350790672166, "loss": 0.3204, "step": 5216 }, { "epoch": 2.1895973154362416, "grad_norm": 0.50390625, "learning_rate": 0.00018170739354444364, "loss": 0.4312, "step": 5220 }, { "epoch": 2.1912751677852347, "grad_norm": 0.58984375, "learning_rate": 0.00018155124322245277, "loss": 0.3391, "step": 5224 }, { "epoch": 2.192953020134228, "grad_norm": 0.609375, "learning_rate": 0.00018139505711784078, "loss": 0.4298, "step": 5228 }, { "epoch": 2.1946308724832213, "grad_norm": 0.6328125, "learning_rate": 0.00018123883540774023, "loss": 0.3877, "step": 5232 }, { "epoch": 2.196308724832215, "grad_norm": 0.7578125, "learning_rate": 0.0001810825782693239, "loss": 0.4095, "step": 5236 }, { "epoch": 2.197986577181208, "grad_norm": 0.53125, "learning_rate": 0.0001809262858798047, "loss": 0.3233, "step": 5240 }, { "epoch": 2.1996644295302015, "grad_norm": 0.546875, "learning_rate": 0.00018076995841643565, "loss": 0.3374, "step": 5244 }, { "epoch": 2.2013422818791946, "grad_norm": 0.57421875, "learning_rate": 0.00018061359605650956, "loss": 0.3436, "step": 5248 }, { "epoch": 2.203020134228188, "grad_norm": 0.55859375, "learning_rate": 0.00018045719897735857, "loss": 0.3513, "step": 5252 }, { "epoch": 2.204697986577181, "grad_norm": 0.5625, "learning_rate": 0.00018030076735635453, "loss": 0.311, "step": 5256 }, { "epoch": 2.2063758389261743, "grad_norm": 0.4453125, "learning_rate": 0.00018014430137090824, "loss": 0.45, "step": 5260 }, { "epoch": 2.208053691275168, "grad_norm": 0.61328125, "learning_rate": 0.00017998780119846952, "loss": 0.3649, "step": 5264 }, { "epoch": 2.209731543624161, "grad_norm": 0.59765625, "learning_rate": 0.0001798312670165271, "loss": 0.2823, "step": 5268 }, { "epoch": 2.2114093959731544, "grad_norm": 0.50390625, "learning_rate": 0.000179674699002608, "loss": 0.2457, "step": 5272 }, { "epoch": 2.2130872483221475, "grad_norm": 0.58984375, "learning_rate": 0.00017951809733427783, "loss": 0.3783, "step": 5276 }, { "epoch": 2.214765100671141, "grad_norm": 0.5625, "learning_rate": 0.00017936146218914037, "loss": 0.4519, "step": 5280 }, { "epoch": 2.216442953020134, "grad_norm": 0.43359375, "learning_rate": 0.00017920479374483725, "loss": 0.2612, "step": 5284 }, { "epoch": 2.2181208053691277, "grad_norm": 0.59375, "learning_rate": 0.00017904809217904793, "loss": 0.3702, "step": 5288 }, { "epoch": 2.219798657718121, "grad_norm": 0.53515625, "learning_rate": 0.00017889135766948944, "loss": 0.3235, "step": 5292 }, { "epoch": 2.221476510067114, "grad_norm": 0.400390625, "learning_rate": 0.00017873459039391607, "loss": 0.3067, "step": 5296 }, { "epoch": 2.2231543624161074, "grad_norm": 0.37890625, "learning_rate": 0.00017857779053011952, "loss": 0.3411, "step": 5300 }, { "epoch": 2.2248322147651005, "grad_norm": 0.59375, "learning_rate": 0.00017842095825592816, "loss": 0.4263, "step": 5304 }, { "epoch": 2.226510067114094, "grad_norm": 0.58203125, "learning_rate": 0.00017826409374920726, "loss": 0.3356, "step": 5308 }, { "epoch": 2.228187919463087, "grad_norm": 0.59375, "learning_rate": 0.0001781071971878587, "loss": 0.4039, "step": 5312 }, { "epoch": 2.2298657718120807, "grad_norm": 0.66796875, "learning_rate": 0.00017795026874982054, "loss": 0.3998, "step": 5316 }, { "epoch": 2.2315436241610738, "grad_norm": 0.546875, "learning_rate": 0.00017779330861306713, "loss": 0.3366, "step": 5320 }, { "epoch": 2.2332214765100673, "grad_norm": 0.59375, "learning_rate": 0.00017763631695560876, "loss": 0.2722, "step": 5324 }, { "epoch": 2.2348993288590604, "grad_norm": 0.482421875, "learning_rate": 0.00017747929395549143, "loss": 0.3728, "step": 5328 }, { "epoch": 2.2365771812080535, "grad_norm": 0.58984375, "learning_rate": 0.0001773222397907967, "loss": 0.4272, "step": 5332 }, { "epoch": 2.238255033557047, "grad_norm": 0.5, "learning_rate": 0.00017716515463964139, "loss": 0.27, "step": 5336 }, { "epoch": 2.23993288590604, "grad_norm": 0.69921875, "learning_rate": 0.00017700803868017764, "loss": 0.2795, "step": 5340 }, { "epoch": 2.2416107382550337, "grad_norm": 0.60546875, "learning_rate": 0.00017685089209059247, "loss": 0.3944, "step": 5344 }, { "epoch": 2.2432885906040267, "grad_norm": 0.7109375, "learning_rate": 0.00017669371504910745, "loss": 0.3447, "step": 5348 }, { "epoch": 2.2449664429530203, "grad_norm": 0.5234375, "learning_rate": 0.000176536507733979, "loss": 0.2796, "step": 5352 }, { "epoch": 2.2466442953020134, "grad_norm": 0.56640625, "learning_rate": 0.00017637927032349755, "loss": 0.4342, "step": 5356 }, { "epoch": 2.248322147651007, "grad_norm": 0.66796875, "learning_rate": 0.00017622200299598792, "loss": 0.4424, "step": 5360 }, { "epoch": 2.25, "grad_norm": 0.53125, "learning_rate": 0.0001760647059298088, "loss": 0.3115, "step": 5364 }, { "epoch": 2.251677852348993, "grad_norm": 0.59375, "learning_rate": 0.0001759073793033525, "loss": 0.307, "step": 5368 }, { "epoch": 2.2533557046979866, "grad_norm": 0.5625, "learning_rate": 0.00017575002329504492, "loss": 0.3741, "step": 5372 }, { "epoch": 2.2550335570469797, "grad_norm": 0.625, "learning_rate": 0.00017559263808334527, "loss": 0.4381, "step": 5376 }, { "epoch": 2.2567114093959733, "grad_norm": 0.55859375, "learning_rate": 0.00017543522384674595, "loss": 0.3759, "step": 5380 }, { "epoch": 2.2583892617449663, "grad_norm": 0.55859375, "learning_rate": 0.0001752777807637722, "loss": 0.3867, "step": 5384 }, { "epoch": 2.26006711409396, "grad_norm": 0.58984375, "learning_rate": 0.000175120309012982, "loss": 0.4489, "step": 5388 }, { "epoch": 2.261744966442953, "grad_norm": 0.453125, "learning_rate": 0.0001749628087729658, "loss": 0.3015, "step": 5392 }, { "epoch": 2.2634228187919465, "grad_norm": 0.470703125, "learning_rate": 0.0001748052802223465, "loss": 0.2638, "step": 5396 }, { "epoch": 2.2651006711409396, "grad_norm": 0.48046875, "learning_rate": 0.0001746477235397789, "loss": 0.2855, "step": 5400 }, { "epoch": 2.2667785234899327, "grad_norm": 0.51171875, "learning_rate": 0.00017449013890394986, "loss": 0.2603, "step": 5404 }, { "epoch": 2.2684563758389262, "grad_norm": 0.5625, "learning_rate": 0.00017433252649357793, "loss": 0.3312, "step": 5408 }, { "epoch": 2.2701342281879193, "grad_norm": 0.400390625, "learning_rate": 0.00017417488648741302, "loss": 0.2859, "step": 5412 }, { "epoch": 2.271812080536913, "grad_norm": 0.609375, "learning_rate": 0.0001740172190642365, "loss": 0.3996, "step": 5416 }, { "epoch": 2.273489932885906, "grad_norm": 0.53125, "learning_rate": 0.0001738595244028608, "loss": 0.3496, "step": 5420 }, { "epoch": 2.2751677852348995, "grad_norm": 0.515625, "learning_rate": 0.0001737018026821292, "loss": 0.354, "step": 5424 }, { "epoch": 2.2768456375838926, "grad_norm": 0.5859375, "learning_rate": 0.00017354405408091567, "loss": 0.3059, "step": 5428 }, { "epoch": 2.278523489932886, "grad_norm": 0.546875, "learning_rate": 0.00017338627877812467, "loss": 0.4665, "step": 5432 }, { "epoch": 2.280201342281879, "grad_norm": 0.76171875, "learning_rate": 0.00017322847695269095, "loss": 0.5214, "step": 5436 }, { "epoch": 2.2818791946308723, "grad_norm": 0.65234375, "learning_rate": 0.0001730706487835794, "loss": 0.3766, "step": 5440 }, { "epoch": 2.283557046979866, "grad_norm": 0.498046875, "learning_rate": 0.00017291279444978466, "loss": 0.3108, "step": 5444 }, { "epoch": 2.285234899328859, "grad_norm": 0.578125, "learning_rate": 0.00017275491413033114, "loss": 0.384, "step": 5448 }, { "epoch": 2.2869127516778525, "grad_norm": 0.66015625, "learning_rate": 0.00017259700800427267, "loss": 0.3421, "step": 5452 }, { "epoch": 2.2885906040268456, "grad_norm": 0.7421875, "learning_rate": 0.0001724390762506924, "loss": 0.3467, "step": 5456 }, { "epoch": 2.290268456375839, "grad_norm": 0.498046875, "learning_rate": 0.0001722811190487025, "loss": 0.3601, "step": 5460 }, { "epoch": 2.291946308724832, "grad_norm": 0.515625, "learning_rate": 0.000172123136577444, "loss": 0.443, "step": 5464 }, { "epoch": 2.2936241610738257, "grad_norm": 0.546875, "learning_rate": 0.00017196512901608666, "loss": 0.4186, "step": 5468 }, { "epoch": 2.295302013422819, "grad_norm": 0.478515625, "learning_rate": 0.0001718070965438286, "loss": 0.2993, "step": 5472 }, { "epoch": 2.296979865771812, "grad_norm": 0.6640625, "learning_rate": 0.0001716490393398962, "loss": 0.4752, "step": 5476 }, { "epoch": 2.2986577181208054, "grad_norm": 0.53125, "learning_rate": 0.00017149095758354395, "loss": 0.4293, "step": 5480 }, { "epoch": 2.3003355704697985, "grad_norm": 0.71875, "learning_rate": 0.00017133285145405421, "loss": 0.3998, "step": 5484 }, { "epoch": 2.302013422818792, "grad_norm": 0.62109375, "learning_rate": 0.00017117472113073681, "loss": 0.3982, "step": 5488 }, { "epoch": 2.303691275167785, "grad_norm": 0.57421875, "learning_rate": 0.00017101656679292923, "loss": 0.3738, "step": 5492 }, { "epoch": 2.3053691275167787, "grad_norm": 0.47265625, "learning_rate": 0.00017085838861999602, "loss": 0.4652, "step": 5496 }, { "epoch": 2.307046979865772, "grad_norm": 0.55859375, "learning_rate": 0.00017070018679132885, "loss": 0.4264, "step": 5500 }, { "epoch": 2.3087248322147653, "grad_norm": 0.63671875, "learning_rate": 0.00017054196148634623, "loss": 0.4204, "step": 5504 }, { "epoch": 2.3104026845637584, "grad_norm": 0.58203125, "learning_rate": 0.00017038371288449327, "loss": 0.2886, "step": 5508 }, { "epoch": 2.3120805369127515, "grad_norm": 0.4609375, "learning_rate": 0.00017022544116524147, "loss": 0.3807, "step": 5512 }, { "epoch": 2.313758389261745, "grad_norm": 0.5625, "learning_rate": 0.00017006714650808855, "loss": 0.4193, "step": 5516 }, { "epoch": 2.315436241610738, "grad_norm": 0.58984375, "learning_rate": 0.00016990882909255828, "loss": 0.2077, "step": 5520 }, { "epoch": 2.3171140939597317, "grad_norm": 0.466796875, "learning_rate": 0.0001697504890982003, "loss": 0.3763, "step": 5524 }, { "epoch": 2.3187919463087248, "grad_norm": 0.5078125, "learning_rate": 0.00016959212670458965, "loss": 0.2827, "step": 5528 }, { "epoch": 2.3204697986577183, "grad_norm": 0.60546875, "learning_rate": 0.00016943374209132706, "loss": 0.401, "step": 5532 }, { "epoch": 2.3221476510067114, "grad_norm": 0.52734375, "learning_rate": 0.0001692753354380382, "loss": 0.4052, "step": 5536 }, { "epoch": 2.323825503355705, "grad_norm": 0.56640625, "learning_rate": 0.00016911690692437384, "loss": 0.4704, "step": 5540 }, { "epoch": 2.325503355704698, "grad_norm": 0.58203125, "learning_rate": 0.00016895845673000964, "loss": 0.242, "step": 5544 }, { "epoch": 2.327181208053691, "grad_norm": 0.6875, "learning_rate": 0.00016879998503464561, "loss": 0.4102, "step": 5548 }, { "epoch": 2.3288590604026846, "grad_norm": 0.6484375, "learning_rate": 0.00016864149201800644, "loss": 0.3602, "step": 5552 }, { "epoch": 2.3305369127516777, "grad_norm": 0.484375, "learning_rate": 0.00016848297785984075, "loss": 0.345, "step": 5556 }, { "epoch": 2.3322147651006713, "grad_norm": 0.5703125, "learning_rate": 0.00016832444273992127, "loss": 0.3208, "step": 5560 }, { "epoch": 2.3338926174496644, "grad_norm": 0.54296875, "learning_rate": 0.00016816588683804447, "loss": 0.4222, "step": 5564 }, { "epoch": 2.335570469798658, "grad_norm": 0.55859375, "learning_rate": 0.0001680073103340304, "loss": 0.3743, "step": 5568 }, { "epoch": 2.337248322147651, "grad_norm": 0.5078125, "learning_rate": 0.00016784871340772247, "loss": 0.3043, "step": 5572 }, { "epoch": 2.3389261744966445, "grad_norm": 0.51953125, "learning_rate": 0.00016769009623898715, "loss": 0.3649, "step": 5576 }, { "epoch": 2.3406040268456376, "grad_norm": 0.54296875, "learning_rate": 0.00016753145900771409, "loss": 0.3799, "step": 5580 }, { "epoch": 2.3422818791946307, "grad_norm": 0.69921875, "learning_rate": 0.00016737280189381542, "loss": 0.3295, "step": 5584 }, { "epoch": 2.3439597315436242, "grad_norm": 0.62109375, "learning_rate": 0.00016721412507722617, "loss": 0.3604, "step": 5588 }, { "epoch": 2.3456375838926173, "grad_norm": 0.52734375, "learning_rate": 0.00016705542873790327, "loss": 0.3121, "step": 5592 }, { "epoch": 2.347315436241611, "grad_norm": 0.6484375, "learning_rate": 0.00016689671305582623, "loss": 0.3879, "step": 5596 }, { "epoch": 2.348993288590604, "grad_norm": 0.51171875, "learning_rate": 0.0001667379782109962, "loss": 0.2487, "step": 5600 }, { "epoch": 2.3506711409395975, "grad_norm": 0.359375, "learning_rate": 0.00016657922438343615, "loss": 0.3312, "step": 5604 }, { "epoch": 2.3523489932885906, "grad_norm": 0.5234375, "learning_rate": 0.0001664204517531906, "loss": 0.4999, "step": 5608 }, { "epoch": 2.354026845637584, "grad_norm": 0.54296875, "learning_rate": 0.00016626166050032543, "loss": 0.3055, "step": 5612 }, { "epoch": 2.3557046979865772, "grad_norm": 0.54296875, "learning_rate": 0.00016610285080492754, "loss": 0.3941, "step": 5616 }, { "epoch": 2.3573825503355703, "grad_norm": 0.62890625, "learning_rate": 0.0001659440228471048, "loss": 0.3844, "step": 5620 }, { "epoch": 2.359060402684564, "grad_norm": 0.60546875, "learning_rate": 0.00016578517680698583, "loss": 0.3719, "step": 5624 }, { "epoch": 2.360738255033557, "grad_norm": 0.5625, "learning_rate": 0.00016562631286471964, "loss": 0.3864, "step": 5628 }, { "epoch": 2.3624161073825505, "grad_norm": 0.47265625, "learning_rate": 0.0001654674312004757, "loss": 0.4221, "step": 5632 }, { "epoch": 2.3640939597315436, "grad_norm": 0.56640625, "learning_rate": 0.00016530853199444345, "loss": 0.3111, "step": 5636 }, { "epoch": 2.365771812080537, "grad_norm": 0.53125, "learning_rate": 0.0001651496154268323, "loss": 0.3642, "step": 5640 }, { "epoch": 2.36744966442953, "grad_norm": 0.5234375, "learning_rate": 0.00016499068167787133, "loss": 0.4385, "step": 5644 }, { "epoch": 2.3691275167785237, "grad_norm": 0.49609375, "learning_rate": 0.00016483173092780908, "loss": 0.2452, "step": 5648 }, { "epoch": 2.370805369127517, "grad_norm": 0.439453125, "learning_rate": 0.00016467276335691336, "loss": 0.2804, "step": 5652 }, { "epoch": 2.37248322147651, "grad_norm": 0.56640625, "learning_rate": 0.00016451377914547115, "loss": 0.3306, "step": 5656 }, { "epoch": 2.3741610738255035, "grad_norm": 0.671875, "learning_rate": 0.00016435477847378816, "loss": 0.4852, "step": 5660 }, { "epoch": 2.3758389261744965, "grad_norm": 0.5859375, "learning_rate": 0.00016419576152218896, "loss": 0.3523, "step": 5664 }, { "epoch": 2.37751677852349, "grad_norm": 0.484375, "learning_rate": 0.00016403672847101628, "loss": 0.4007, "step": 5668 }, { "epoch": 2.379194630872483, "grad_norm": 0.51171875, "learning_rate": 0.00016387767950063153, "loss": 0.2751, "step": 5672 }, { "epoch": 2.3808724832214763, "grad_norm": 0.62109375, "learning_rate": 0.0001637186147914138, "loss": 0.3832, "step": 5676 }, { "epoch": 2.38255033557047, "grad_norm": 0.55078125, "learning_rate": 0.0001635595345237602, "loss": 0.4116, "step": 5680 }, { "epoch": 2.384228187919463, "grad_norm": 0.5, "learning_rate": 0.00016340043887808547, "loss": 0.2975, "step": 5684 }, { "epoch": 2.3859060402684564, "grad_norm": 0.6171875, "learning_rate": 0.00016324132803482178, "loss": 0.4898, "step": 5688 }, { "epoch": 2.3875838926174495, "grad_norm": 0.51171875, "learning_rate": 0.0001630822021744185, "loss": 0.2692, "step": 5692 }, { "epoch": 2.389261744966443, "grad_norm": 0.53125, "learning_rate": 0.0001629230614773421, "loss": 0.3446, "step": 5696 }, { "epoch": 2.390939597315436, "grad_norm": 0.578125, "learning_rate": 0.0001627639061240758, "loss": 0.3106, "step": 5700 }, { "epoch": 2.3926174496644297, "grad_norm": 0.54296875, "learning_rate": 0.00016260473629511952, "loss": 0.3943, "step": 5704 }, { "epoch": 2.3942953020134228, "grad_norm": 0.6484375, "learning_rate": 0.0001624455521709896, "loss": 0.4069, "step": 5708 }, { "epoch": 2.395973154362416, "grad_norm": 0.72265625, "learning_rate": 0.0001622863539322184, "loss": 0.4554, "step": 5712 }, { "epoch": 2.3976510067114094, "grad_norm": 0.62109375, "learning_rate": 0.0001621271417593546, "loss": 0.3619, "step": 5716 }, { "epoch": 2.3993288590604025, "grad_norm": 0.546875, "learning_rate": 0.00016196791583296245, "loss": 0.3658, "step": 5720 }, { "epoch": 2.401006711409396, "grad_norm": 0.5078125, "learning_rate": 0.00016180867633362189, "loss": 0.3407, "step": 5724 }, { "epoch": 2.402684563758389, "grad_norm": 0.48828125, "learning_rate": 0.0001616494234419282, "loss": 0.3028, "step": 5728 }, { "epoch": 2.4043624161073827, "grad_norm": 0.3984375, "learning_rate": 0.00016149015733849195, "loss": 0.3047, "step": 5732 }, { "epoch": 2.4060402684563758, "grad_norm": 0.44921875, "learning_rate": 0.00016133087820393853, "loss": 0.3802, "step": 5736 }, { "epoch": 2.4077181208053693, "grad_norm": 0.5625, "learning_rate": 0.00016117158621890832, "loss": 0.3049, "step": 5740 }, { "epoch": 2.4093959731543624, "grad_norm": 0.9609375, "learning_rate": 0.00016101228156405604, "loss": 0.4142, "step": 5744 }, { "epoch": 2.4110738255033555, "grad_norm": 0.4765625, "learning_rate": 0.00016085296442005096, "loss": 0.2772, "step": 5748 }, { "epoch": 2.412751677852349, "grad_norm": 0.578125, "learning_rate": 0.00016069363496757646, "loss": 0.3654, "step": 5752 }, { "epoch": 2.414429530201342, "grad_norm": 0.5859375, "learning_rate": 0.0001605342933873298, "loss": 0.3719, "step": 5756 }, { "epoch": 2.4161073825503356, "grad_norm": 0.70703125, "learning_rate": 0.0001603749398600222, "loss": 0.3868, "step": 5760 }, { "epoch": 2.4177852348993287, "grad_norm": 0.62109375, "learning_rate": 0.00016021557456637816, "loss": 0.3576, "step": 5764 }, { "epoch": 2.4194630872483223, "grad_norm": 0.59375, "learning_rate": 0.0001600561976871357, "loss": 0.4184, "step": 5768 }, { "epoch": 2.4211409395973154, "grad_norm": 0.62890625, "learning_rate": 0.00015989680940304603, "loss": 0.286, "step": 5772 }, { "epoch": 2.422818791946309, "grad_norm": 0.55078125, "learning_rate": 0.00015973740989487303, "loss": 0.4222, "step": 5776 }, { "epoch": 2.424496644295302, "grad_norm": 0.56640625, "learning_rate": 0.0001595779993433936, "loss": 0.3452, "step": 5780 }, { "epoch": 2.426174496644295, "grad_norm": 0.5234375, "learning_rate": 0.000159418577929397, "loss": 0.3595, "step": 5784 }, { "epoch": 2.4278523489932886, "grad_norm": 0.5, "learning_rate": 0.0001592591458336849, "loss": 0.4358, "step": 5788 }, { "epoch": 2.4295302013422817, "grad_norm": 0.640625, "learning_rate": 0.00015909970323707095, "loss": 0.4009, "step": 5792 }, { "epoch": 2.4312080536912752, "grad_norm": 0.578125, "learning_rate": 0.0001589402503203809, "loss": 0.3784, "step": 5796 }, { "epoch": 2.4328859060402683, "grad_norm": 0.76953125, "learning_rate": 0.00015878078726445206, "loss": 0.36, "step": 5800 }, { "epoch": 2.434563758389262, "grad_norm": 0.5390625, "learning_rate": 0.00015862131425013328, "loss": 0.3444, "step": 5804 }, { "epoch": 2.436241610738255, "grad_norm": 0.66015625, "learning_rate": 0.00015846183145828472, "loss": 0.4293, "step": 5808 }, { "epoch": 2.4379194630872485, "grad_norm": 0.40234375, "learning_rate": 0.0001583023390697776, "loss": 0.3279, "step": 5812 }, { "epoch": 2.4395973154362416, "grad_norm": 0.578125, "learning_rate": 0.000158142837265494, "loss": 0.276, "step": 5816 }, { "epoch": 2.4412751677852347, "grad_norm": 0.59765625, "learning_rate": 0.0001579833262263268, "loss": 0.3932, "step": 5820 }, { "epoch": 2.442953020134228, "grad_norm": 0.435546875, "learning_rate": 0.00015782380613317922, "loss": 0.2845, "step": 5824 }, { "epoch": 2.4446308724832213, "grad_norm": 0.5390625, "learning_rate": 0.00015766427716696477, "loss": 0.3362, "step": 5828 }, { "epoch": 2.446308724832215, "grad_norm": 0.578125, "learning_rate": 0.00015750473950860712, "loss": 0.3862, "step": 5832 }, { "epoch": 2.447986577181208, "grad_norm": 0.60546875, "learning_rate": 0.00015734519333903966, "loss": 0.3964, "step": 5836 }, { "epoch": 2.4496644295302015, "grad_norm": 0.66015625, "learning_rate": 0.00015718563883920553, "loss": 0.3869, "step": 5840 }, { "epoch": 2.4513422818791946, "grad_norm": 0.43359375, "learning_rate": 0.00015702607619005735, "loss": 0.2976, "step": 5844 }, { "epoch": 2.453020134228188, "grad_norm": 0.361328125, "learning_rate": 0.00015686650557255683, "loss": 0.3162, "step": 5848 }, { "epoch": 2.454697986577181, "grad_norm": 0.8203125, "learning_rate": 0.00015670692716767483, "loss": 0.301, "step": 5852 }, { "epoch": 2.4563758389261743, "grad_norm": 0.625, "learning_rate": 0.00015654734115639105, "loss": 0.248, "step": 5856 }, { "epoch": 2.458053691275168, "grad_norm": 0.5234375, "learning_rate": 0.00015638774771969377, "loss": 0.2852, "step": 5860 }, { "epoch": 2.459731543624161, "grad_norm": 0.5234375, "learning_rate": 0.0001562281470385797, "loss": 0.3906, "step": 5864 }, { "epoch": 2.4614093959731544, "grad_norm": 0.4765625, "learning_rate": 0.0001560685392940538, "loss": 0.3431, "step": 5868 }, { "epoch": 2.4630872483221475, "grad_norm": 0.5078125, "learning_rate": 0.00015590892466712898, "loss": 0.3743, "step": 5872 }, { "epoch": 2.464765100671141, "grad_norm": 0.52734375, "learning_rate": 0.000155749303338826, "loss": 0.4745, "step": 5876 }, { "epoch": 2.466442953020134, "grad_norm": 0.625, "learning_rate": 0.00015558967549017319, "loss": 0.4065, "step": 5880 }, { "epoch": 2.4681208053691277, "grad_norm": 0.64453125, "learning_rate": 0.00015543004130220633, "loss": 0.274, "step": 5884 }, { "epoch": 2.469798657718121, "grad_norm": 0.48046875, "learning_rate": 0.00015527040095596843, "loss": 0.3541, "step": 5888 }, { "epoch": 2.471476510067114, "grad_norm": 0.44921875, "learning_rate": 0.00015511075463250923, "loss": 0.3771, "step": 5892 }, { "epoch": 2.4731543624161074, "grad_norm": 0.53515625, "learning_rate": 0.00015495110251288556, "loss": 0.4159, "step": 5896 }, { "epoch": 2.4748322147651005, "grad_norm": 0.61328125, "learning_rate": 0.00015479144477816066, "loss": 0.3691, "step": 5900 }, { "epoch": 2.476510067114094, "grad_norm": 0.609375, "learning_rate": 0.00015463178160940412, "loss": 0.3923, "step": 5904 }, { "epoch": 2.478187919463087, "grad_norm": 0.474609375, "learning_rate": 0.00015447211318769178, "loss": 0.3222, "step": 5908 }, { "epoch": 2.4798657718120807, "grad_norm": 0.6875, "learning_rate": 0.0001543124396941055, "loss": 0.3946, "step": 5912 }, { "epoch": 2.4815436241610738, "grad_norm": 0.416015625, "learning_rate": 0.00015415276130973255, "loss": 0.406, "step": 5916 }, { "epoch": 2.4832214765100673, "grad_norm": 0.455078125, "learning_rate": 0.0001539930782156662, "loss": 0.1882, "step": 5920 }, { "epoch": 2.4848993288590604, "grad_norm": 0.5703125, "learning_rate": 0.00015383339059300475, "loss": 0.3814, "step": 5924 }, { "epoch": 2.4865771812080535, "grad_norm": 0.55078125, "learning_rate": 0.0001536736986228518, "loss": 0.2611, "step": 5928 }, { "epoch": 2.488255033557047, "grad_norm": 0.59765625, "learning_rate": 0.00015351400248631578, "loss": 0.3716, "step": 5932 }, { "epoch": 2.48993288590604, "grad_norm": 0.5390625, "learning_rate": 0.00015335430236450986, "loss": 0.2996, "step": 5936 }, { "epoch": 2.4916107382550337, "grad_norm": 0.515625, "learning_rate": 0.00015319459843855178, "loss": 0.3028, "step": 5940 }, { "epoch": 2.4932885906040267, "grad_norm": 0.6328125, "learning_rate": 0.00015303489088956356, "loss": 0.3818, "step": 5944 }, { "epoch": 2.4949664429530203, "grad_norm": 0.66796875, "learning_rate": 0.00015287517989867133, "loss": 0.3369, "step": 5948 }, { "epoch": 2.4966442953020134, "grad_norm": 0.546875, "learning_rate": 0.00015271546564700513, "loss": 0.3567, "step": 5952 }, { "epoch": 2.498322147651007, "grad_norm": 0.46875, "learning_rate": 0.00015255574831569868, "loss": 0.3977, "step": 5956 }, { "epoch": 2.5, "grad_norm": 0.5078125, "learning_rate": 0.0001523960280858892, "loss": 0.4096, "step": 5960 }, { "epoch": 2.501677852348993, "grad_norm": 0.515625, "learning_rate": 0.00015223630513871728, "loss": 0.2851, "step": 5964 }, { "epoch": 2.5033557046979866, "grad_norm": 0.58984375, "learning_rate": 0.0001520765796553264, "loss": 0.2805, "step": 5968 }, { "epoch": 2.5050335570469797, "grad_norm": 0.48828125, "learning_rate": 0.00015191685181686315, "loss": 0.3703, "step": 5972 }, { "epoch": 2.5067114093959733, "grad_norm": 0.6640625, "learning_rate": 0.0001517571218044766, "loss": 0.3662, "step": 5976 }, { "epoch": 2.5083892617449663, "grad_norm": 0.54296875, "learning_rate": 0.00015159738979931837, "loss": 0.4309, "step": 5980 }, { "epoch": 2.51006711409396, "grad_norm": 0.65625, "learning_rate": 0.00015143765598254235, "loss": 0.3627, "step": 5984 }, { "epoch": 2.511744966442953, "grad_norm": 0.5703125, "learning_rate": 0.00015127792053530449, "loss": 0.4155, "step": 5988 }, { "epoch": 2.5134228187919465, "grad_norm": 0.51171875, "learning_rate": 0.00015111818363876248, "loss": 0.3456, "step": 5992 }, { "epoch": 2.5151006711409396, "grad_norm": 0.578125, "learning_rate": 0.0001509584454740758, "loss": 0.3378, "step": 5996 }, { "epoch": 2.5167785234899327, "grad_norm": 0.56640625, "learning_rate": 0.00015079870622240524, "loss": 0.3889, "step": 6000 }, { "epoch": 2.5184563758389262, "grad_norm": 0.474609375, "learning_rate": 0.00015063896606491305, "loss": 0.3076, "step": 6004 }, { "epoch": 2.5201342281879193, "grad_norm": 0.5703125, "learning_rate": 0.00015047922518276213, "loss": 0.3327, "step": 6008 }, { "epoch": 2.521812080536913, "grad_norm": 0.65234375, "learning_rate": 0.00015031948375711655, "loss": 0.3553, "step": 6012 }, { "epoch": 2.523489932885906, "grad_norm": 0.51953125, "learning_rate": 0.00015015974196914088, "loss": 0.2557, "step": 6016 }, { "epoch": 2.5251677852348995, "grad_norm": 0.4765625, "learning_rate": 0.00015, "loss": 0.3627, "step": 6020 }, { "epoch": 2.5268456375838926, "grad_norm": 0.5703125, "learning_rate": 0.00014984025803085912, "loss": 0.3147, "step": 6024 }, { "epoch": 2.528523489932886, "grad_norm": 0.458984375, "learning_rate": 0.00014968051624288342, "loss": 0.3009, "step": 6028 }, { "epoch": 2.530201342281879, "grad_norm": 0.6328125, "learning_rate": 0.00014952077481723787, "loss": 0.322, "step": 6032 }, { "epoch": 2.5318791946308723, "grad_norm": 0.54296875, "learning_rate": 0.00014936103393508698, "loss": 0.2764, "step": 6036 }, { "epoch": 2.533557046979866, "grad_norm": 0.5546875, "learning_rate": 0.00014920129377759474, "loss": 0.318, "step": 6040 }, { "epoch": 2.535234899328859, "grad_norm": 0.6875, "learning_rate": 0.00014904155452592424, "loss": 0.3617, "step": 6044 }, { "epoch": 2.5369127516778525, "grad_norm": 0.474609375, "learning_rate": 0.00014888181636123752, "loss": 0.4151, "step": 6048 }, { "epoch": 2.5385906040268456, "grad_norm": 0.48046875, "learning_rate": 0.00014872207946469554, "loss": 0.2989, "step": 6052 }, { "epoch": 2.540268456375839, "grad_norm": 0.71875, "learning_rate": 0.00014856234401745765, "loss": 0.3782, "step": 6056 }, { "epoch": 2.541946308724832, "grad_norm": 0.5859375, "learning_rate": 0.00014840261020068166, "loss": 0.3383, "step": 6060 }, { "epoch": 2.5436241610738257, "grad_norm": 0.6875, "learning_rate": 0.00014824287819552336, "loss": 0.4203, "step": 6064 }, { "epoch": 2.545302013422819, "grad_norm": 0.57421875, "learning_rate": 0.00014808314818313682, "loss": 0.4255, "step": 6068 }, { "epoch": 2.546979865771812, "grad_norm": 0.5078125, "learning_rate": 0.00014792342034467356, "loss": 0.4595, "step": 6072 }, { "epoch": 2.5486577181208054, "grad_norm": 0.51171875, "learning_rate": 0.0001477636948612827, "loss": 0.2584, "step": 6076 }, { "epoch": 2.5503355704697985, "grad_norm": 0.5703125, "learning_rate": 0.00014760397191411077, "loss": 0.3641, "step": 6080 }, { "epoch": 2.552013422818792, "grad_norm": 0.69140625, "learning_rate": 0.0001474442516843013, "loss": 0.357, "step": 6084 }, { "epoch": 2.553691275167785, "grad_norm": 0.515625, "learning_rate": 0.00014728453435299484, "loss": 0.3764, "step": 6088 }, { "epoch": 2.5553691275167782, "grad_norm": 0.447265625, "learning_rate": 0.00014712482010132867, "loss": 0.2868, "step": 6092 }, { "epoch": 2.557046979865772, "grad_norm": 0.609375, "learning_rate": 0.0001469651091104364, "loss": 0.3255, "step": 6096 }, { "epoch": 2.5587248322147653, "grad_norm": 0.462890625, "learning_rate": 0.00014680540156144823, "loss": 0.3416, "step": 6100 }, { "epoch": 2.5604026845637584, "grad_norm": 0.6171875, "learning_rate": 0.0001466456976354901, "loss": 0.3408, "step": 6104 }, { "epoch": 2.5620805369127515, "grad_norm": 0.5546875, "learning_rate": 0.0001464859975136842, "loss": 0.3532, "step": 6108 }, { "epoch": 2.563758389261745, "grad_norm": 0.63671875, "learning_rate": 0.00014632630137714818, "loss": 0.4569, "step": 6112 }, { "epoch": 2.565436241610738, "grad_norm": 0.75390625, "learning_rate": 0.00014616660940699523, "loss": 0.3752, "step": 6116 }, { "epoch": 2.5671140939597317, "grad_norm": 0.48046875, "learning_rate": 0.00014600692178433377, "loss": 0.348, "step": 6120 }, { "epoch": 2.5687919463087248, "grad_norm": 0.640625, "learning_rate": 0.00014584723869026745, "loss": 0.4321, "step": 6124 }, { "epoch": 2.570469798657718, "grad_norm": 0.671875, "learning_rate": 0.00014568756030589457, "loss": 0.462, "step": 6128 }, { "epoch": 2.5721476510067114, "grad_norm": 0.62109375, "learning_rate": 0.0001455278868123082, "loss": 0.3857, "step": 6132 }, { "epoch": 2.573825503355705, "grad_norm": 0.51171875, "learning_rate": 0.00014536821839059589, "loss": 0.3663, "step": 6136 }, { "epoch": 2.575503355704698, "grad_norm": 0.6171875, "learning_rate": 0.00014520855522183937, "loss": 0.4191, "step": 6140 }, { "epoch": 2.577181208053691, "grad_norm": 0.6171875, "learning_rate": 0.00014504889748711444, "loss": 0.3557, "step": 6144 }, { "epoch": 2.5788590604026846, "grad_norm": 0.43359375, "learning_rate": 0.00014488924536749072, "loss": 0.2226, "step": 6148 }, { "epoch": 2.5805369127516777, "grad_norm": 0.625, "learning_rate": 0.00014472959904403157, "loss": 0.3682, "step": 6152 }, { "epoch": 2.5822147651006713, "grad_norm": 0.515625, "learning_rate": 0.0001445699586977936, "loss": 0.3603, "step": 6156 }, { "epoch": 2.5838926174496644, "grad_norm": 0.625, "learning_rate": 0.00014441032450982676, "loss": 0.4521, "step": 6160 }, { "epoch": 2.5855704697986575, "grad_norm": 0.78515625, "learning_rate": 0.00014425069666117402, "loss": 0.4528, "step": 6164 }, { "epoch": 2.587248322147651, "grad_norm": 0.56640625, "learning_rate": 0.00014409107533287102, "loss": 0.4057, "step": 6168 }, { "epoch": 2.5889261744966445, "grad_norm": 0.44921875, "learning_rate": 0.00014393146070594619, "loss": 0.2504, "step": 6172 }, { "epoch": 2.5906040268456376, "grad_norm": 0.55078125, "learning_rate": 0.0001437718529614203, "loss": 0.4155, "step": 6176 }, { "epoch": 2.5922818791946307, "grad_norm": 0.73828125, "learning_rate": 0.0001436122522803062, "loss": 0.3445, "step": 6180 }, { "epoch": 2.5939597315436242, "grad_norm": 0.6328125, "learning_rate": 0.00014345265884360892, "loss": 0.2492, "step": 6184 }, { "epoch": 2.5956375838926173, "grad_norm": 0.5625, "learning_rate": 0.00014329307283232517, "loss": 0.3363, "step": 6188 }, { "epoch": 2.597315436241611, "grad_norm": 0.458984375, "learning_rate": 0.00014313349442744317, "loss": 0.2806, "step": 6192 }, { "epoch": 2.598993288590604, "grad_norm": 0.58984375, "learning_rate": 0.00014297392380994265, "loss": 0.2702, "step": 6196 }, { "epoch": 2.600671140939597, "grad_norm": 0.6640625, "learning_rate": 0.00014281436116079447, "loss": 0.4343, "step": 6200 }, { "epoch": 2.6023489932885906, "grad_norm": 0.52734375, "learning_rate": 0.00014265480666096034, "loss": 0.2412, "step": 6204 }, { "epoch": 2.604026845637584, "grad_norm": 0.482421875, "learning_rate": 0.00014249526049139288, "loss": 0.2626, "step": 6208 }, { "epoch": 2.6057046979865772, "grad_norm": 0.66015625, "learning_rate": 0.00014233572283303523, "loss": 0.3317, "step": 6212 }, { "epoch": 2.6073825503355703, "grad_norm": 0.59765625, "learning_rate": 0.0001421761938668208, "loss": 0.3376, "step": 6216 }, { "epoch": 2.609060402684564, "grad_norm": 0.69140625, "learning_rate": 0.00014201667377367324, "loss": 0.4597, "step": 6220 }, { "epoch": 2.610738255033557, "grad_norm": 0.60546875, "learning_rate": 0.000141857162734506, "loss": 0.4688, "step": 6224 }, { "epoch": 2.6124161073825505, "grad_norm": 0.53125, "learning_rate": 0.00014169766093022237, "loss": 0.3268, "step": 6228 }, { "epoch": 2.6140939597315436, "grad_norm": 0.484375, "learning_rate": 0.00014153816854171525, "loss": 0.351, "step": 6232 }, { "epoch": 2.6157718120805367, "grad_norm": 0.640625, "learning_rate": 0.00014137868574986667, "loss": 0.5052, "step": 6236 }, { "epoch": 2.61744966442953, "grad_norm": 1.2890625, "learning_rate": 0.00014121921273554792, "loss": 0.3704, "step": 6240 }, { "epoch": 2.6191275167785237, "grad_norm": 0.53125, "learning_rate": 0.00014105974967961905, "loss": 0.2445, "step": 6244 }, { "epoch": 2.620805369127517, "grad_norm": 0.67578125, "learning_rate": 0.000140900296762929, "loss": 0.385, "step": 6248 }, { "epoch": 2.62248322147651, "grad_norm": 0.50390625, "learning_rate": 0.0001407408541663151, "loss": 0.4199, "step": 6252 }, { "epoch": 2.6241610738255035, "grad_norm": 0.5703125, "learning_rate": 0.000140581422070603, "loss": 0.2518, "step": 6256 }, { "epoch": 2.6258389261744965, "grad_norm": 0.65625, "learning_rate": 0.0001404220006566064, "loss": 0.304, "step": 6260 }, { "epoch": 2.62751677852349, "grad_norm": 0.62890625, "learning_rate": 0.00014026259010512697, "loss": 0.2195, "step": 6264 }, { "epoch": 2.629194630872483, "grad_norm": 0.5703125, "learning_rate": 0.00014010319059695397, "loss": 0.3464, "step": 6268 }, { "epoch": 2.6308724832214763, "grad_norm": 0.59375, "learning_rate": 0.00013994380231286427, "loss": 0.3912, "step": 6272 }, { "epoch": 2.63255033557047, "grad_norm": 0.5546875, "learning_rate": 0.00013978442543362182, "loss": 0.3073, "step": 6276 }, { "epoch": 2.6342281879194633, "grad_norm": 0.640625, "learning_rate": 0.0001396250601399778, "loss": 0.3774, "step": 6280 }, { "epoch": 2.6359060402684564, "grad_norm": 0.43359375, "learning_rate": 0.00013946570661267017, "loss": 0.3678, "step": 6284 }, { "epoch": 2.6375838926174495, "grad_norm": 0.66796875, "learning_rate": 0.00013930636503242354, "loss": 0.3685, "step": 6288 }, { "epoch": 2.639261744966443, "grad_norm": 0.55078125, "learning_rate": 0.000139147035579949, "loss": 0.3078, "step": 6292 }, { "epoch": 2.640939597315436, "grad_norm": 0.69921875, "learning_rate": 0.00013898771843594399, "loss": 0.3876, "step": 6296 }, { "epoch": 2.6426174496644297, "grad_norm": 0.66015625, "learning_rate": 0.0001388284137810917, "loss": 0.328, "step": 6300 }, { "epoch": 2.6442953020134228, "grad_norm": 0.59375, "learning_rate": 0.00013866912179606145, "loss": 0.3516, "step": 6304 }, { "epoch": 2.645973154362416, "grad_norm": 0.71484375, "learning_rate": 0.00013850984266150807, "loss": 0.2466, "step": 6308 }, { "epoch": 2.6476510067114094, "grad_norm": 0.59375, "learning_rate": 0.00013835057655807177, "loss": 0.378, "step": 6312 }, { "epoch": 2.649328859060403, "grad_norm": 0.392578125, "learning_rate": 0.0001381913236663781, "loss": 0.386, "step": 6316 }, { "epoch": 2.651006711409396, "grad_norm": 0.578125, "learning_rate": 0.0001380320841670375, "loss": 0.38, "step": 6320 }, { "epoch": 2.652684563758389, "grad_norm": 0.51171875, "learning_rate": 0.0001378728582406454, "loss": 0.3441, "step": 6324 }, { "epoch": 2.6543624161073827, "grad_norm": 0.60546875, "learning_rate": 0.00013771364606778158, "loss": 0.4694, "step": 6328 }, { "epoch": 2.6560402684563758, "grad_norm": 0.5, "learning_rate": 0.0001375544478290104, "loss": 0.3019, "step": 6332 }, { "epoch": 2.6577181208053693, "grad_norm": 0.61328125, "learning_rate": 0.00013739526370488045, "loss": 0.3773, "step": 6336 }, { "epoch": 2.6593959731543624, "grad_norm": 0.6484375, "learning_rate": 0.00013723609387592418, "loss": 0.5262, "step": 6340 }, { "epoch": 2.6610738255033555, "grad_norm": 0.66796875, "learning_rate": 0.0001370769385226579, "loss": 0.4213, "step": 6344 }, { "epoch": 2.662751677852349, "grad_norm": 0.64453125, "learning_rate": 0.0001369177978255815, "loss": 0.3972, "step": 6348 }, { "epoch": 2.6644295302013425, "grad_norm": 0.53515625, "learning_rate": 0.00013675867196517822, "loss": 0.4008, "step": 6352 }, { "epoch": 2.6661073825503356, "grad_norm": 0.6171875, "learning_rate": 0.0001365995611219145, "loss": 0.384, "step": 6356 }, { "epoch": 2.6677852348993287, "grad_norm": 0.5546875, "learning_rate": 0.0001364404654762398, "loss": 0.357, "step": 6360 }, { "epoch": 2.6694630872483223, "grad_norm": 0.486328125, "learning_rate": 0.0001362813852085862, "loss": 0.3526, "step": 6364 }, { "epoch": 2.6711409395973154, "grad_norm": 0.490234375, "learning_rate": 0.00013612232049936844, "loss": 0.3192, "step": 6368 }, { "epoch": 2.672818791946309, "grad_norm": 0.60546875, "learning_rate": 0.0001359632715289837, "loss": 0.3403, "step": 6372 }, { "epoch": 2.674496644295302, "grad_norm": 0.54296875, "learning_rate": 0.00013580423847781107, "loss": 0.3787, "step": 6376 }, { "epoch": 2.676174496644295, "grad_norm": 0.578125, "learning_rate": 0.00013564522152621186, "loss": 0.4453, "step": 6380 }, { "epoch": 2.6778523489932886, "grad_norm": 0.421875, "learning_rate": 0.0001354862208545289, "loss": 0.2845, "step": 6384 }, { "epoch": 2.679530201342282, "grad_norm": 0.59765625, "learning_rate": 0.00013532723664308664, "loss": 0.2798, "step": 6388 }, { "epoch": 2.6812080536912752, "grad_norm": 0.53515625, "learning_rate": 0.00013516826907219098, "loss": 0.2555, "step": 6392 }, { "epoch": 2.6828859060402683, "grad_norm": 0.58984375, "learning_rate": 0.00013500931832212862, "loss": 0.3284, "step": 6396 }, { "epoch": 2.684563758389262, "grad_norm": 0.58984375, "learning_rate": 0.00013485038457316766, "loss": 0.3948, "step": 6400 }, { "epoch": 2.686241610738255, "grad_norm": 0.5234375, "learning_rate": 0.0001346914680055565, "loss": 0.3009, "step": 6404 }, { "epoch": 2.6879194630872485, "grad_norm": 0.59375, "learning_rate": 0.00013453256879952425, "loss": 0.2744, "step": 6408 }, { "epoch": 2.6895973154362416, "grad_norm": 0.48046875, "learning_rate": 0.00013437368713528034, "loss": 0.2833, "step": 6412 }, { "epoch": 2.6912751677852347, "grad_norm": 0.43359375, "learning_rate": 0.00013421482319301415, "loss": 0.271, "step": 6416 }, { "epoch": 2.692953020134228, "grad_norm": 0.6328125, "learning_rate": 0.0001340559771528952, "loss": 0.3364, "step": 6420 }, { "epoch": 2.6946308724832218, "grad_norm": 0.52734375, "learning_rate": 0.00013389714919507246, "loss": 0.2904, "step": 6424 }, { "epoch": 2.696308724832215, "grad_norm": 0.7265625, "learning_rate": 0.00013373833949967455, "loss": 0.4181, "step": 6428 }, { "epoch": 2.697986577181208, "grad_norm": 0.53515625, "learning_rate": 0.00013357954824680938, "loss": 0.2977, "step": 6432 }, { "epoch": 2.6996644295302015, "grad_norm": 0.5703125, "learning_rate": 0.00013342077561656385, "loss": 0.188, "step": 6436 }, { "epoch": 2.7013422818791946, "grad_norm": 0.53125, "learning_rate": 0.00013326202178900377, "loss": 0.413, "step": 6440 }, { "epoch": 2.703020134228188, "grad_norm": 0.431640625, "learning_rate": 0.00013310328694417375, "loss": 0.3733, "step": 6444 }, { "epoch": 2.704697986577181, "grad_norm": 0.59375, "learning_rate": 0.0001329445712620967, "loss": 0.3726, "step": 6448 }, { "epoch": 2.7063758389261743, "grad_norm": 0.55859375, "learning_rate": 0.00013278587492277383, "loss": 0.321, "step": 6452 }, { "epoch": 2.708053691275168, "grad_norm": 0.53125, "learning_rate": 0.00013262719810618455, "loss": 0.3922, "step": 6456 }, { "epoch": 2.709731543624161, "grad_norm": 0.5078125, "learning_rate": 0.00013246854099228591, "loss": 0.335, "step": 6460 }, { "epoch": 2.7114093959731544, "grad_norm": 0.546875, "learning_rate": 0.00013230990376101283, "loss": 0.3398, "step": 6464 }, { "epoch": 2.7130872483221475, "grad_norm": 0.453125, "learning_rate": 0.00013215128659227758, "loss": 0.2664, "step": 6468 }, { "epoch": 2.714765100671141, "grad_norm": 0.439453125, "learning_rate": 0.0001319926896659696, "loss": 0.3221, "step": 6472 }, { "epoch": 2.716442953020134, "grad_norm": 0.5078125, "learning_rate": 0.0001318341131619555, "loss": 0.4603, "step": 6476 }, { "epoch": 2.7181208053691277, "grad_norm": 0.546875, "learning_rate": 0.00013167555726007865, "loss": 0.3556, "step": 6480 }, { "epoch": 2.719798657718121, "grad_norm": 0.73046875, "learning_rate": 0.00013151702214015922, "loss": 0.4094, "step": 6484 }, { "epoch": 2.721476510067114, "grad_norm": 0.4765625, "learning_rate": 0.00013135850798199353, "loss": 0.3238, "step": 6488 }, { "epoch": 2.7231543624161074, "grad_norm": 0.59375, "learning_rate": 0.00013120001496535433, "loss": 0.3087, "step": 6492 }, { "epoch": 2.7248322147651005, "grad_norm": 0.69921875, "learning_rate": 0.0001310415432699904, "loss": 0.4705, "step": 6496 }, { "epoch": 2.726510067114094, "grad_norm": 0.6484375, "learning_rate": 0.00013088309307562613, "loss": 0.3113, "step": 6500 }, { "epoch": 2.728187919463087, "grad_norm": 0.578125, "learning_rate": 0.0001307246645619618, "loss": 0.3402, "step": 6504 }, { "epoch": 2.7298657718120807, "grad_norm": 0.515625, "learning_rate": 0.00013056625790867294, "loss": 0.2471, "step": 6508 }, { "epoch": 2.7315436241610738, "grad_norm": 0.490234375, "learning_rate": 0.00013040787329541032, "loss": 0.2157, "step": 6512 }, { "epoch": 2.7332214765100673, "grad_norm": 0.58203125, "learning_rate": 0.00013024951090179969, "loss": 0.3021, "step": 6516 }, { "epoch": 2.7348993288590604, "grad_norm": 0.68359375, "learning_rate": 0.0001300911709074417, "loss": 0.3479, "step": 6520 }, { "epoch": 2.7365771812080535, "grad_norm": 0.609375, "learning_rate": 0.00012993285349191145, "loss": 0.498, "step": 6524 }, { "epoch": 2.738255033557047, "grad_norm": 0.58203125, "learning_rate": 0.00012977455883475853, "loss": 0.4077, "step": 6528 }, { "epoch": 2.73993288590604, "grad_norm": 0.60546875, "learning_rate": 0.00012961628711550676, "loss": 0.5201, "step": 6532 }, { "epoch": 2.7416107382550337, "grad_norm": 0.53515625, "learning_rate": 0.00012945803851365377, "loss": 0.4058, "step": 6536 }, { "epoch": 2.7432885906040267, "grad_norm": 0.62109375, "learning_rate": 0.00012929981320867117, "loss": 0.3473, "step": 6540 }, { "epoch": 2.7449664429530203, "grad_norm": 0.6640625, "learning_rate": 0.000129141611380004, "loss": 0.5105, "step": 6544 }, { "epoch": 2.7466442953020134, "grad_norm": 0.5625, "learning_rate": 0.0001289834332070708, "loss": 0.2857, "step": 6548 }, { "epoch": 2.748322147651007, "grad_norm": 0.6171875, "learning_rate": 0.0001288252788692632, "loss": 0.3778, "step": 6552 }, { "epoch": 2.75, "grad_norm": 0.52734375, "learning_rate": 0.00012866714854594584, "loss": 0.3877, "step": 6556 }, { "epoch": 2.751677852348993, "grad_norm": 0.546875, "learning_rate": 0.00012850904241645602, "loss": 0.4025, "step": 6560 }, { "epoch": 2.7533557046979866, "grad_norm": 0.5859375, "learning_rate": 0.00012835096066010377, "loss": 0.3119, "step": 6564 }, { "epoch": 2.7550335570469797, "grad_norm": 0.65234375, "learning_rate": 0.00012819290345617135, "loss": 0.3965, "step": 6568 }, { "epoch": 2.7567114093959733, "grad_norm": 0.5703125, "learning_rate": 0.0001280348709839133, "loss": 0.394, "step": 6572 }, { "epoch": 2.7583892617449663, "grad_norm": 0.58984375, "learning_rate": 0.00012787686342255594, "loss": 0.3106, "step": 6576 }, { "epoch": 2.76006711409396, "grad_norm": 0.6015625, "learning_rate": 0.00012771888095129748, "loss": 0.264, "step": 6580 }, { "epoch": 2.761744966442953, "grad_norm": 0.5546875, "learning_rate": 0.00012756092374930757, "loss": 0.442, "step": 6584 }, { "epoch": 2.7634228187919465, "grad_norm": 0.64453125, "learning_rate": 0.0001274029919957273, "loss": 0.3472, "step": 6588 }, { "epoch": 2.7651006711409396, "grad_norm": 0.70703125, "learning_rate": 0.00012724508586966888, "loss": 0.3097, "step": 6592 }, { "epoch": 2.7667785234899327, "grad_norm": 0.53515625, "learning_rate": 0.00012708720555021534, "loss": 0.2386, "step": 6596 }, { "epoch": 2.7684563758389262, "grad_norm": 0.50390625, "learning_rate": 0.0001269293512164206, "loss": 0.3387, "step": 6600 }, { "epoch": 2.7701342281879193, "grad_norm": 0.5390625, "learning_rate": 0.00012677152304730905, "loss": 0.4947, "step": 6604 }, { "epoch": 2.771812080536913, "grad_norm": 0.5078125, "learning_rate": 0.00012661372122187533, "loss": 0.2412, "step": 6608 }, { "epoch": 2.773489932885906, "grad_norm": 0.59375, "learning_rate": 0.00012645594591908433, "loss": 0.4554, "step": 6612 }, { "epoch": 2.7751677852348995, "grad_norm": 0.5625, "learning_rate": 0.00012629819731787084, "loss": 0.3132, "step": 6616 }, { "epoch": 2.7768456375838926, "grad_norm": 0.65234375, "learning_rate": 0.0001261404755971392, "loss": 0.3101, "step": 6620 }, { "epoch": 2.778523489932886, "grad_norm": 0.578125, "learning_rate": 0.0001259827809357635, "loss": 0.3054, "step": 6624 }, { "epoch": 2.780201342281879, "grad_norm": 0.62109375, "learning_rate": 0.000125825113512587, "loss": 0.2757, "step": 6628 }, { "epoch": 2.7818791946308723, "grad_norm": 0.48828125, "learning_rate": 0.00012566747350642213, "loss": 0.3381, "step": 6632 }, { "epoch": 2.783557046979866, "grad_norm": 0.62890625, "learning_rate": 0.00012550986109605014, "loss": 0.4488, "step": 6636 }, { "epoch": 2.785234899328859, "grad_norm": 0.71484375, "learning_rate": 0.0001253522764602211, "loss": 0.3532, "step": 6640 }, { "epoch": 2.7869127516778525, "grad_norm": 0.53125, "learning_rate": 0.0001251947197776535, "loss": 0.3188, "step": 6644 }, { "epoch": 2.7885906040268456, "grad_norm": 0.5625, "learning_rate": 0.00012503719122703414, "loss": 0.2282, "step": 6648 }, { "epoch": 2.790268456375839, "grad_norm": 0.62890625, "learning_rate": 0.00012487969098701795, "loss": 0.4063, "step": 6652 }, { "epoch": 2.791946308724832, "grad_norm": 0.51171875, "learning_rate": 0.00012472221923622777, "loss": 0.345, "step": 6656 }, { "epoch": 2.7936241610738257, "grad_norm": 0.435546875, "learning_rate": 0.00012456477615325402, "loss": 0.2269, "step": 6660 }, { "epoch": 2.795302013422819, "grad_norm": 0.5546875, "learning_rate": 0.0001244073619166547, "loss": 0.3583, "step": 6664 }, { "epoch": 2.796979865771812, "grad_norm": 0.515625, "learning_rate": 0.0001242499767049551, "loss": 0.4246, "step": 6668 }, { "epoch": 2.7986577181208054, "grad_norm": 0.5703125, "learning_rate": 0.0001240926206966475, "loss": 0.3198, "step": 6672 }, { "epoch": 2.8003355704697985, "grad_norm": 0.5703125, "learning_rate": 0.00012393529407019117, "loss": 0.3224, "step": 6676 }, { "epoch": 2.802013422818792, "grad_norm": 0.66015625, "learning_rate": 0.00012377799700401205, "loss": 0.3512, "step": 6680 }, { "epoch": 2.803691275167785, "grad_norm": 0.609375, "learning_rate": 0.00012362072967650245, "loss": 0.4377, "step": 6684 }, { "epoch": 2.8053691275167782, "grad_norm": 0.58203125, "learning_rate": 0.000123463492266021, "loss": 0.3407, "step": 6688 }, { "epoch": 2.807046979865772, "grad_norm": 0.5390625, "learning_rate": 0.00012330628495089253, "loss": 0.3595, "step": 6692 }, { "epoch": 2.8087248322147653, "grad_norm": 0.49609375, "learning_rate": 0.00012314910790940756, "loss": 0.3503, "step": 6696 }, { "epoch": 2.8104026845637584, "grad_norm": 0.56640625, "learning_rate": 0.00012299196131982236, "loss": 0.327, "step": 6700 }, { "epoch": 2.8120805369127515, "grad_norm": 0.578125, "learning_rate": 0.00012283484536035862, "loss": 0.3309, "step": 6704 }, { "epoch": 2.813758389261745, "grad_norm": 0.53515625, "learning_rate": 0.00012267776020920333, "loss": 0.3195, "step": 6708 }, { "epoch": 2.815436241610738, "grad_norm": 0.6015625, "learning_rate": 0.0001225207060445086, "loss": 0.3602, "step": 6712 }, { "epoch": 2.8171140939597317, "grad_norm": 0.546875, "learning_rate": 0.00012236368304439124, "loss": 0.4011, "step": 6716 }, { "epoch": 2.8187919463087248, "grad_norm": 0.65625, "learning_rate": 0.00012220669138693287, "loss": 0.3811, "step": 6720 }, { "epoch": 2.820469798657718, "grad_norm": 0.439453125, "learning_rate": 0.00012204973125017949, "loss": 0.2586, "step": 6724 }, { "epoch": 2.8221476510067114, "grad_norm": 0.7734375, "learning_rate": 0.00012189280281214126, "loss": 0.4392, "step": 6728 }, { "epoch": 2.823825503355705, "grad_norm": 0.5390625, "learning_rate": 0.0001217359062507927, "loss": 0.307, "step": 6732 }, { "epoch": 2.825503355704698, "grad_norm": 0.59765625, "learning_rate": 0.0001215790417440718, "loss": 0.364, "step": 6736 }, { "epoch": 2.827181208053691, "grad_norm": 0.64453125, "learning_rate": 0.00012142220946988046, "loss": 0.3041, "step": 6740 }, { "epoch": 2.8288590604026846, "grad_norm": 0.62109375, "learning_rate": 0.00012126540960608387, "loss": 0.2997, "step": 6744 }, { "epoch": 2.8305369127516777, "grad_norm": 0.58203125, "learning_rate": 0.00012110864233051056, "loss": 0.4443, "step": 6748 }, { "epoch": 2.8322147651006713, "grad_norm": 0.5546875, "learning_rate": 0.00012095190782095208, "loss": 0.3672, "step": 6752 }, { "epoch": 2.8338926174496644, "grad_norm": 0.55859375, "learning_rate": 0.00012079520625516274, "loss": 0.252, "step": 6756 }, { "epoch": 2.8355704697986575, "grad_norm": 0.4375, "learning_rate": 0.00012063853781085961, "loss": 0.2513, "step": 6760 }, { "epoch": 2.837248322147651, "grad_norm": 0.470703125, "learning_rate": 0.00012048190266572216, "loss": 0.2899, "step": 6764 }, { "epoch": 2.8389261744966445, "grad_norm": 0.65625, "learning_rate": 0.000120325300997392, "loss": 0.6058, "step": 6768 }, { "epoch": 2.8406040268456376, "grad_norm": 0.63671875, "learning_rate": 0.00012016873298347291, "loss": 0.3881, "step": 6772 }, { "epoch": 2.8422818791946307, "grad_norm": 0.51953125, "learning_rate": 0.00012001219880153044, "loss": 0.2946, "step": 6776 }, { "epoch": 2.8439597315436242, "grad_norm": 0.470703125, "learning_rate": 0.00011985569862909176, "loss": 0.3322, "step": 6780 }, { "epoch": 2.8456375838926173, "grad_norm": 0.53125, "learning_rate": 0.00011969923264364546, "loss": 0.3061, "step": 6784 }, { "epoch": 2.847315436241611, "grad_norm": 0.6875, "learning_rate": 0.00011954280102264141, "loss": 0.3184, "step": 6788 }, { "epoch": 2.848993288590604, "grad_norm": 0.609375, "learning_rate": 0.00011938640394349046, "loss": 0.3025, "step": 6792 }, { "epoch": 2.850671140939597, "grad_norm": 0.578125, "learning_rate": 0.0001192300415835643, "loss": 0.4207, "step": 6796 }, { "epoch": 2.8523489932885906, "grad_norm": 0.451171875, "learning_rate": 0.0001190737141201953, "loss": 0.2993, "step": 6800 }, { "epoch": 2.854026845637584, "grad_norm": 0.61328125, "learning_rate": 0.00011891742173067611, "loss": 0.4175, "step": 6804 }, { "epoch": 2.8557046979865772, "grad_norm": 0.484375, "learning_rate": 0.0001187611645922598, "loss": 0.299, "step": 6808 }, { "epoch": 2.8573825503355703, "grad_norm": 0.53515625, "learning_rate": 0.00011860494288215915, "loss": 0.1935, "step": 6812 }, { "epoch": 2.859060402684564, "grad_norm": 0.60546875, "learning_rate": 0.00011844875677754725, "loss": 0.3201, "step": 6816 }, { "epoch": 2.860738255033557, "grad_norm": 0.58984375, "learning_rate": 0.00011829260645555634, "loss": 0.4791, "step": 6820 }, { "epoch": 2.8624161073825505, "grad_norm": 0.578125, "learning_rate": 0.0001181364920932783, "loss": 0.4746, "step": 6824 }, { "epoch": 2.8640939597315436, "grad_norm": 0.59765625, "learning_rate": 0.00011798041386776422, "loss": 0.3355, "step": 6828 }, { "epoch": 2.8657718120805367, "grad_norm": 0.7890625, "learning_rate": 0.00011782437195602413, "loss": 0.2954, "step": 6832 }, { "epoch": 2.86744966442953, "grad_norm": 0.5703125, "learning_rate": 0.00011766836653502694, "loss": 0.413, "step": 6836 }, { "epoch": 2.8691275167785237, "grad_norm": 0.5390625, "learning_rate": 0.0001175123977817002, "loss": 0.2379, "step": 6840 }, { "epoch": 2.870805369127517, "grad_norm": 0.431640625, "learning_rate": 0.00011735646587292976, "loss": 0.3397, "step": 6844 }, { "epoch": 2.87248322147651, "grad_norm": 0.69921875, "learning_rate": 0.00011720057098555981, "loss": 0.309, "step": 6848 }, { "epoch": 2.8741610738255035, "grad_norm": 0.5859375, "learning_rate": 0.0001170447132963925, "loss": 0.2664, "step": 6852 }, { "epoch": 2.8758389261744965, "grad_norm": 0.53125, "learning_rate": 0.00011688889298218776, "loss": 0.3199, "step": 6856 }, { "epoch": 2.87751677852349, "grad_norm": 0.5703125, "learning_rate": 0.00011673311021966323, "loss": 0.483, "step": 6860 }, { "epoch": 2.879194630872483, "grad_norm": 0.75, "learning_rate": 0.0001165773651854938, "loss": 0.3795, "step": 6864 }, { "epoch": 2.8808724832214763, "grad_norm": 0.6171875, "learning_rate": 0.00011642165805631176, "loss": 0.3673, "step": 6868 }, { "epoch": 2.88255033557047, "grad_norm": 0.671875, "learning_rate": 0.00011626598900870631, "loss": 0.3064, "step": 6872 }, { "epoch": 2.8842281879194633, "grad_norm": 0.54296875, "learning_rate": 0.0001161103582192234, "loss": 0.4231, "step": 6876 }, { "epoch": 2.8859060402684564, "grad_norm": 0.365234375, "learning_rate": 0.00011595476586436575, "loss": 0.2156, "step": 6880 }, { "epoch": 2.8875838926174495, "grad_norm": 0.5859375, "learning_rate": 0.00011579921212059239, "loss": 0.4216, "step": 6884 }, { "epoch": 2.889261744966443, "grad_norm": 0.59375, "learning_rate": 0.00011564369716431853, "loss": 0.4793, "step": 6888 }, { "epoch": 2.890939597315436, "grad_norm": 0.5234375, "learning_rate": 0.00011548822117191547, "loss": 0.4569, "step": 6892 }, { "epoch": 2.8926174496644297, "grad_norm": 0.5703125, "learning_rate": 0.00011533278431971027, "loss": 0.4336, "step": 6896 }, { "epoch": 2.8942953020134228, "grad_norm": 0.62890625, "learning_rate": 0.00011517738678398568, "loss": 0.3866, "step": 6900 }, { "epoch": 2.895973154362416, "grad_norm": 0.6484375, "learning_rate": 0.00011502202874097973, "loss": 0.4565, "step": 6904 }, { "epoch": 2.8976510067114094, "grad_norm": 0.57421875, "learning_rate": 0.00011486671036688575, "loss": 0.3939, "step": 6908 }, { "epoch": 2.899328859060403, "grad_norm": 0.77734375, "learning_rate": 0.00011471143183785209, "loss": 0.3017, "step": 6912 }, { "epoch": 2.901006711409396, "grad_norm": 0.640625, "learning_rate": 0.00011455619332998184, "loss": 0.4579, "step": 6916 }, { "epoch": 2.902684563758389, "grad_norm": 0.498046875, "learning_rate": 0.00011440099501933276, "loss": 0.195, "step": 6920 }, { "epoch": 2.9043624161073827, "grad_norm": 0.76171875, "learning_rate": 0.00011424583708191704, "loss": 0.4246, "step": 6924 }, { "epoch": 2.9060402684563758, "grad_norm": 0.408203125, "learning_rate": 0.00011409071969370097, "loss": 0.3375, "step": 6928 }, { "epoch": 2.9077181208053693, "grad_norm": 0.6640625, "learning_rate": 0.00011393564303060497, "loss": 0.4331, "step": 6932 }, { "epoch": 2.9093959731543624, "grad_norm": 0.55078125, "learning_rate": 0.0001137806072685033, "loss": 0.477, "step": 6936 }, { "epoch": 2.9110738255033555, "grad_norm": 0.55078125, "learning_rate": 0.00011362561258322365, "loss": 0.2812, "step": 6940 }, { "epoch": 2.912751677852349, "grad_norm": 0.66796875, "learning_rate": 0.0001134706591505473, "loss": 0.3448, "step": 6944 }, { "epoch": 2.9144295302013425, "grad_norm": 0.43359375, "learning_rate": 0.00011331574714620873, "loss": 0.3779, "step": 6948 }, { "epoch": 2.9161073825503356, "grad_norm": 0.73046875, "learning_rate": 0.00011316087674589532, "loss": 0.3321, "step": 6952 }, { "epoch": 2.9177852348993287, "grad_norm": 0.6953125, "learning_rate": 0.00011300604812524737, "loss": 0.5221, "step": 6956 }, { "epoch": 2.9194630872483223, "grad_norm": 0.53515625, "learning_rate": 0.00011285126145985777, "loss": 0.3041, "step": 6960 }, { "epoch": 2.9211409395973154, "grad_norm": 0.50390625, "learning_rate": 0.0001126965169252718, "loss": 0.3263, "step": 6964 }, { "epoch": 2.922818791946309, "grad_norm": 0.55078125, "learning_rate": 0.00011254181469698703, "loss": 0.4576, "step": 6968 }, { "epoch": 2.924496644295302, "grad_norm": 0.61328125, "learning_rate": 0.00011238715495045293, "loss": 0.3361, "step": 6972 }, { "epoch": 2.926174496644295, "grad_norm": 0.76953125, "learning_rate": 0.00011223253786107089, "loss": 0.368, "step": 6976 }, { "epoch": 2.9278523489932886, "grad_norm": 0.5625, "learning_rate": 0.00011207796360419396, "loss": 0.2228, "step": 6980 }, { "epoch": 2.929530201342282, "grad_norm": 0.76171875, "learning_rate": 0.00011192343235512648, "loss": 0.3764, "step": 6984 }, { "epoch": 2.9312080536912752, "grad_norm": 0.56640625, "learning_rate": 0.00011176894428912414, "loss": 0.2268, "step": 6988 }, { "epoch": 2.9328859060402683, "grad_norm": 0.58984375, "learning_rate": 0.00011161449958139354, "loss": 0.2909, "step": 6992 }, { "epoch": 2.934563758389262, "grad_norm": 0.75390625, "learning_rate": 0.00011146009840709217, "loss": 0.3877, "step": 6996 }, { "epoch": 2.936241610738255, "grad_norm": 0.63671875, "learning_rate": 0.00011130574094132824, "loss": 0.2333, "step": 7000 }, { "epoch": 2.9379194630872485, "grad_norm": 0.5625, "learning_rate": 0.00011115142735916015, "loss": 0.3705, "step": 7004 }, { "epoch": 2.9395973154362416, "grad_norm": 0.58984375, "learning_rate": 0.00011099715783559676, "loss": 0.3154, "step": 7008 }, { "epoch": 2.9412751677852347, "grad_norm": 0.58984375, "learning_rate": 0.0001108429325455969, "loss": 0.4113, "step": 7012 }, { "epoch": 2.942953020134228, "grad_norm": 0.63671875, "learning_rate": 0.00011068875166406914, "loss": 0.4045, "step": 7016 }, { "epoch": 2.9446308724832218, "grad_norm": 0.55859375, "learning_rate": 0.00011053461536587182, "loss": 0.4129, "step": 7020 }, { "epoch": 2.946308724832215, "grad_norm": 0.70703125, "learning_rate": 0.00011038052382581261, "loss": 0.4553, "step": 7024 }, { "epoch": 2.947986577181208, "grad_norm": 0.59375, "learning_rate": 0.00011022647721864848, "loss": 0.3147, "step": 7028 }, { "epoch": 2.9496644295302015, "grad_norm": 0.431640625, "learning_rate": 0.00011007247571908546, "loss": 0.3389, "step": 7032 }, { "epoch": 2.9513422818791946, "grad_norm": 0.68359375, "learning_rate": 0.00010991851950177832, "loss": 0.3121, "step": 7036 }, { "epoch": 2.953020134228188, "grad_norm": 0.5078125, "learning_rate": 0.00010976460874133058, "loss": 0.3237, "step": 7040 }, { "epoch": 2.954697986577181, "grad_norm": 0.56640625, "learning_rate": 0.00010961074361229422, "loss": 0.333, "step": 7044 }, { "epoch": 2.9563758389261743, "grad_norm": 0.5625, "learning_rate": 0.00010945692428916931, "loss": 0.4065, "step": 7048 }, { "epoch": 2.958053691275168, "grad_norm": 0.51171875, "learning_rate": 0.00010930315094640416, "loss": 0.2585, "step": 7052 }, { "epoch": 2.959731543624161, "grad_norm": 0.40625, "learning_rate": 0.00010914942375839486, "loss": 0.3289, "step": 7056 }, { "epoch": 2.9614093959731544, "grad_norm": 0.6796875, "learning_rate": 0.00010899574289948508, "loss": 0.3151, "step": 7060 }, { "epoch": 2.9630872483221475, "grad_norm": 0.6875, "learning_rate": 0.00010884210854396605, "loss": 0.3792, "step": 7064 }, { "epoch": 2.964765100671141, "grad_norm": 0.64453125, "learning_rate": 0.00010868852086607629, "loss": 0.2994, "step": 7068 }, { "epoch": 2.966442953020134, "grad_norm": 0.61328125, "learning_rate": 0.00010853498004000128, "loss": 0.4688, "step": 7072 }, { "epoch": 2.9681208053691277, "grad_norm": 0.62109375, "learning_rate": 0.00010838148623987335, "loss": 0.2466, "step": 7076 }, { "epoch": 2.969798657718121, "grad_norm": 0.6171875, "learning_rate": 0.00010822803963977158, "loss": 0.3528, "step": 7080 }, { "epoch": 2.971476510067114, "grad_norm": 0.5546875, "learning_rate": 0.00010807464041372156, "loss": 0.3664, "step": 7084 }, { "epoch": 2.9731543624161074, "grad_norm": 0.55859375, "learning_rate": 0.00010792128873569495, "loss": 0.3528, "step": 7088 }, { "epoch": 2.9748322147651005, "grad_norm": 0.419921875, "learning_rate": 0.00010776798477960975, "loss": 0.3502, "step": 7092 }, { "epoch": 2.976510067114094, "grad_norm": 0.56640625, "learning_rate": 0.00010761472871932965, "loss": 0.3453, "step": 7096 }, { "epoch": 2.978187919463087, "grad_norm": 0.75390625, "learning_rate": 0.00010746152072866405, "loss": 0.3224, "step": 7100 }, { "epoch": 2.9798657718120807, "grad_norm": 0.625, "learning_rate": 0.0001073083609813679, "loss": 0.339, "step": 7104 }, { "epoch": 2.9815436241610738, "grad_norm": 0.65234375, "learning_rate": 0.00010715524965114141, "loss": 0.4152, "step": 7108 }, { "epoch": 2.9832214765100673, "grad_norm": 0.5390625, "learning_rate": 0.00010700218691162981, "loss": 0.3803, "step": 7112 }, { "epoch": 2.9848993288590604, "grad_norm": 0.43359375, "learning_rate": 0.00010684917293642334, "loss": 0.2479, "step": 7116 }, { "epoch": 2.9865771812080535, "grad_norm": 0.51171875, "learning_rate": 0.00010669620789905688, "loss": 0.3287, "step": 7120 }, { "epoch": 2.988255033557047, "grad_norm": 0.59765625, "learning_rate": 0.00010654329197300972, "loss": 0.2486, "step": 7124 }, { "epoch": 2.98993288590604, "grad_norm": 0.6015625, "learning_rate": 0.00010639042533170565, "loss": 0.4836, "step": 7128 }, { "epoch": 2.9916107382550337, "grad_norm": 0.78515625, "learning_rate": 0.00010623760814851235, "loss": 0.5044, "step": 7132 }, { "epoch": 2.9932885906040267, "grad_norm": 0.6171875, "learning_rate": 0.00010608484059674157, "loss": 0.487, "step": 7136 }, { "epoch": 2.9949664429530203, "grad_norm": 0.74609375, "learning_rate": 0.00010593212284964875, "loss": 0.3481, "step": 7140 }, { "epoch": 2.9966442953020134, "grad_norm": 0.69921875, "learning_rate": 0.00010577945508043265, "loss": 0.3628, "step": 7144 }, { "epoch": 2.998322147651007, "grad_norm": 0.6171875, "learning_rate": 0.00010562683746223573, "loss": 0.3916, "step": 7148 }, { "epoch": 3.0, "grad_norm": 0.50390625, "learning_rate": 0.00010547427016814321, "loss": 0.3158, "step": 7152 }, { "epoch": 3.001677852348993, "grad_norm": 0.404296875, "learning_rate": 0.00010532175337118342, "loss": 0.1874, "step": 7156 }, { "epoch": 3.0033557046979866, "grad_norm": 0.41796875, "learning_rate": 0.00010516928724432743, "loss": 0.1988, "step": 7160 }, { "epoch": 3.0050335570469797, "grad_norm": 0.3984375, "learning_rate": 0.0001050168719604887, "loss": 0.191, "step": 7164 }, { "epoch": 3.0067114093959733, "grad_norm": 0.455078125, "learning_rate": 0.00010486450769252318, "loss": 0.3331, "step": 7168 }, { "epoch": 3.0083892617449663, "grad_norm": 0.6328125, "learning_rate": 0.00010471219461322892, "loss": 0.3379, "step": 7172 }, { "epoch": 3.01006711409396, "grad_norm": 0.48828125, "learning_rate": 0.00010455993289534584, "loss": 0.2234, "step": 7176 }, { "epoch": 3.011744966442953, "grad_norm": 0.451171875, "learning_rate": 0.00010440772271155573, "loss": 0.3433, "step": 7180 }, { "epoch": 3.0134228187919465, "grad_norm": 0.53125, "learning_rate": 0.00010425556423448183, "loss": 0.2153, "step": 7184 }, { "epoch": 3.0151006711409396, "grad_norm": 0.474609375, "learning_rate": 0.00010410345763668884, "loss": 0.3205, "step": 7188 }, { "epoch": 3.0167785234899327, "grad_norm": 0.5, "learning_rate": 0.00010395140309068256, "loss": 0.1587, "step": 7192 }, { "epoch": 3.0184563758389262, "grad_norm": 0.4765625, "learning_rate": 0.0001037994007689097, "loss": 0.2495, "step": 7196 }, { "epoch": 3.0201342281879193, "grad_norm": 0.44140625, "learning_rate": 0.0001036474508437579, "loss": 0.2456, "step": 7200 }, { "epoch": 3.021812080536913, "grad_norm": 0.4609375, "learning_rate": 0.00010349555348755524, "loss": 0.1604, "step": 7204 }, { "epoch": 3.023489932885906, "grad_norm": 0.380859375, "learning_rate": 0.0001033437088725702, "loss": 0.1691, "step": 7208 }, { "epoch": 3.0251677852348995, "grad_norm": 0.640625, "learning_rate": 0.00010319191717101151, "loss": 0.3332, "step": 7212 }, { "epoch": 3.0268456375838926, "grad_norm": 0.59375, "learning_rate": 0.00010304017855502786, "loss": 0.3142, "step": 7216 }, { "epoch": 3.028523489932886, "grad_norm": 0.37109375, "learning_rate": 0.00010288849319670771, "loss": 0.1546, "step": 7220 }, { "epoch": 3.030201342281879, "grad_norm": 0.5625, "learning_rate": 0.0001027368612680791, "loss": 0.2596, "step": 7224 }, { "epoch": 3.0318791946308723, "grad_norm": 0.490234375, "learning_rate": 0.00010258528294110953, "loss": 0.1866, "step": 7228 }, { "epoch": 3.033557046979866, "grad_norm": 0.58203125, "learning_rate": 0.00010243375838770578, "loss": 0.3239, "step": 7232 }, { "epoch": 3.035234899328859, "grad_norm": 0.5234375, "learning_rate": 0.00010228228777971345, "loss": 0.2085, "step": 7236 }, { "epoch": 3.0369127516778525, "grad_norm": 0.44921875, "learning_rate": 0.0001021308712889171, "loss": 0.2315, "step": 7240 }, { "epoch": 3.0385906040268456, "grad_norm": 0.62109375, "learning_rate": 0.0001019795090870399, "loss": 0.3969, "step": 7244 }, { "epoch": 3.040268456375839, "grad_norm": 0.51953125, "learning_rate": 0.0001018282013457434, "loss": 0.3743, "step": 7248 }, { "epoch": 3.041946308724832, "grad_norm": 0.65625, "learning_rate": 0.00010167694823662744, "loss": 0.3015, "step": 7252 }, { "epoch": 3.0436241610738257, "grad_norm": 0.51953125, "learning_rate": 0.00010152574993122988, "loss": 0.2221, "step": 7256 }, { "epoch": 3.045302013422819, "grad_norm": 0.453125, "learning_rate": 0.00010137460660102639, "loss": 0.2072, "step": 7260 }, { "epoch": 3.046979865771812, "grad_norm": 0.5546875, "learning_rate": 0.00010122351841743035, "loss": 0.2231, "step": 7264 }, { "epoch": 3.0486577181208054, "grad_norm": 0.58984375, "learning_rate": 0.0001010724855517926, "loss": 0.2825, "step": 7268 }, { "epoch": 3.0503355704697985, "grad_norm": 0.66015625, "learning_rate": 0.00010092150817540118, "loss": 0.1527, "step": 7272 }, { "epoch": 3.052013422818792, "grad_norm": 0.53125, "learning_rate": 0.00010077058645948125, "loss": 0.1492, "step": 7276 }, { "epoch": 3.053691275167785, "grad_norm": 0.70703125, "learning_rate": 0.00010061972057519484, "loss": 0.314, "step": 7280 }, { "epoch": 3.0553691275167787, "grad_norm": 0.57421875, "learning_rate": 0.00010046891069364062, "loss": 0.3185, "step": 7284 }, { "epoch": 3.057046979865772, "grad_norm": 0.63671875, "learning_rate": 0.00010031815698585384, "loss": 0.1492, "step": 7288 }, { "epoch": 3.0587248322147653, "grad_norm": 0.494140625, "learning_rate": 0.0001001674596228059, "loss": 0.3166, "step": 7292 }, { "epoch": 3.0604026845637584, "grad_norm": 0.328125, "learning_rate": 0.00010001681877540442, "loss": 0.1755, "step": 7296 }, { "epoch": 3.0620805369127515, "grad_norm": 0.5390625, "learning_rate": 9.986623461449292e-05, "loss": 0.3242, "step": 7300 }, { "epoch": 3.063758389261745, "grad_norm": 0.470703125, "learning_rate": 9.971570731085056e-05, "loss": 0.2234, "step": 7304 }, { "epoch": 3.065436241610738, "grad_norm": 0.66015625, "learning_rate": 9.956523703519202e-05, "loss": 0.3148, "step": 7308 }, { "epoch": 3.0671140939597317, "grad_norm": 0.494140625, "learning_rate": 9.941482395816737e-05, "loss": 0.1966, "step": 7312 }, { "epoch": 3.0687919463087248, "grad_norm": 0.625, "learning_rate": 9.926446825036181e-05, "loss": 0.2488, "step": 7316 }, { "epoch": 3.0704697986577183, "grad_norm": 0.490234375, "learning_rate": 9.911417008229544e-05, "loss": 0.2278, "step": 7320 }, { "epoch": 3.0721476510067114, "grad_norm": 0.451171875, "learning_rate": 9.896392962442308e-05, "loss": 0.2356, "step": 7324 }, { "epoch": 3.073825503355705, "grad_norm": 0.5234375, "learning_rate": 9.88137470471341e-05, "loss": 0.1942, "step": 7328 }, { "epoch": 3.075503355704698, "grad_norm": 0.58203125, "learning_rate": 9.866362252075237e-05, "loss": 0.2697, "step": 7332 }, { "epoch": 3.077181208053691, "grad_norm": 0.404296875, "learning_rate": 9.851355621553568e-05, "loss": 0.1762, "step": 7336 }, { "epoch": 3.0788590604026846, "grad_norm": 0.5, "learning_rate": 9.8363548301676e-05, "loss": 0.1598, "step": 7340 }, { "epoch": 3.0805369127516777, "grad_norm": 0.59375, "learning_rate": 9.821359894929895e-05, "loss": 0.2962, "step": 7344 }, { "epoch": 3.0822147651006713, "grad_norm": 0.53515625, "learning_rate": 9.806370832846378e-05, "loss": 0.3023, "step": 7348 }, { "epoch": 3.0838926174496644, "grad_norm": 0.953125, "learning_rate": 9.791387660916321e-05, "loss": 0.2277, "step": 7352 }, { "epoch": 3.085570469798658, "grad_norm": 0.376953125, "learning_rate": 9.776410396132301e-05, "loss": 0.1121, "step": 7356 }, { "epoch": 3.087248322147651, "grad_norm": 0.50390625, "learning_rate": 9.761439055480204e-05, "loss": 0.2217, "step": 7360 }, { "epoch": 3.088926174496644, "grad_norm": 0.46484375, "learning_rate": 9.746473655939202e-05, "loss": 0.2354, "step": 7364 }, { "epoch": 3.0906040268456376, "grad_norm": 0.470703125, "learning_rate": 9.731514214481714e-05, "loss": 0.2194, "step": 7368 }, { "epoch": 3.0922818791946307, "grad_norm": 0.5390625, "learning_rate": 9.716560748073418e-05, "loss": 0.2669, "step": 7372 }, { "epoch": 3.0939597315436242, "grad_norm": 0.6484375, "learning_rate": 9.701613273673211e-05, "loss": 0.2683, "step": 7376 }, { "epoch": 3.0956375838926173, "grad_norm": 0.5546875, "learning_rate": 9.686671808233185e-05, "loss": 0.2053, "step": 7380 }, { "epoch": 3.097315436241611, "grad_norm": 0.53125, "learning_rate": 9.67173636869863e-05, "loss": 0.2239, "step": 7384 }, { "epoch": 3.098993288590604, "grad_norm": 0.54296875, "learning_rate": 9.656806972007997e-05, "loss": 0.2149, "step": 7388 }, { "epoch": 3.1006711409395975, "grad_norm": 0.56640625, "learning_rate": 9.641883635092876e-05, "loss": 0.2715, "step": 7392 }, { "epoch": 3.1023489932885906, "grad_norm": 0.466796875, "learning_rate": 9.626966374878002e-05, "loss": 0.1692, "step": 7396 }, { "epoch": 3.1040268456375837, "grad_norm": 0.466796875, "learning_rate": 9.6120552082812e-05, "loss": 0.2021, "step": 7400 }, { "epoch": 3.1057046979865772, "grad_norm": 0.5625, "learning_rate": 9.597150152213401e-05, "loss": 0.1667, "step": 7404 }, { "epoch": 3.1073825503355703, "grad_norm": 0.482421875, "learning_rate": 9.58225122357859e-05, "loss": 0.2074, "step": 7408 }, { "epoch": 3.109060402684564, "grad_norm": 0.6015625, "learning_rate": 9.56735843927381e-05, "loss": 0.1386, "step": 7412 }, { "epoch": 3.110738255033557, "grad_norm": 0.53125, "learning_rate": 9.552471816189141e-05, "loss": 0.2084, "step": 7416 }, { "epoch": 3.1124161073825505, "grad_norm": 0.48046875, "learning_rate": 9.537591371207666e-05, "loss": 0.1622, "step": 7420 }, { "epoch": 3.1140939597315436, "grad_norm": 0.4921875, "learning_rate": 9.522717121205468e-05, "loss": 0.3066, "step": 7424 }, { "epoch": 3.115771812080537, "grad_norm": 0.578125, "learning_rate": 9.5078490830516e-05, "loss": 0.2351, "step": 7428 }, { "epoch": 3.11744966442953, "grad_norm": 0.5859375, "learning_rate": 9.492987273608072e-05, "loss": 0.2369, "step": 7432 }, { "epoch": 3.1191275167785233, "grad_norm": 0.5, "learning_rate": 9.47813170972983e-05, "loss": 0.1853, "step": 7436 }, { "epoch": 3.120805369127517, "grad_norm": 0.5234375, "learning_rate": 9.46328240826474e-05, "loss": 0.2113, "step": 7440 }, { "epoch": 3.12248322147651, "grad_norm": 0.61328125, "learning_rate": 9.448439386053556e-05, "loss": 0.2611, "step": 7444 }, { "epoch": 3.1241610738255035, "grad_norm": 0.49609375, "learning_rate": 9.433602659929923e-05, "loss": 0.2957, "step": 7448 }, { "epoch": 3.1258389261744965, "grad_norm": 0.3828125, "learning_rate": 9.418772246720336e-05, "loss": 0.182, "step": 7452 }, { "epoch": 3.12751677852349, "grad_norm": 0.53125, "learning_rate": 9.403948163244131e-05, "loss": 0.1933, "step": 7456 }, { "epoch": 3.129194630872483, "grad_norm": 0.6875, "learning_rate": 9.389130426313479e-05, "loss": 0.2469, "step": 7460 }, { "epoch": 3.1308724832214767, "grad_norm": 0.490234375, "learning_rate": 9.374319052733332e-05, "loss": 0.2058, "step": 7464 }, { "epoch": 3.13255033557047, "grad_norm": 0.6015625, "learning_rate": 9.359514059301439e-05, "loss": 0.2438, "step": 7468 }, { "epoch": 3.134228187919463, "grad_norm": 0.55859375, "learning_rate": 9.344715462808314e-05, "loss": 0.2039, "step": 7472 }, { "epoch": 3.1359060402684564, "grad_norm": 0.56640625, "learning_rate": 9.329923280037201e-05, "loss": 0.248, "step": 7476 }, { "epoch": 3.1375838926174495, "grad_norm": 0.51171875, "learning_rate": 9.3151375277641e-05, "loss": 0.2214, "step": 7480 }, { "epoch": 3.139261744966443, "grad_norm": 0.439453125, "learning_rate": 9.300358222757685e-05, "loss": 0.1671, "step": 7484 }, { "epoch": 3.140939597315436, "grad_norm": 0.625, "learning_rate": 9.285585381779337e-05, "loss": 0.1956, "step": 7488 }, { "epoch": 3.1426174496644297, "grad_norm": 0.58984375, "learning_rate": 9.270819021583106e-05, "loss": 0.2538, "step": 7492 }, { "epoch": 3.1442953020134228, "grad_norm": 0.494140625, "learning_rate": 9.256059158915678e-05, "loss": 0.258, "step": 7496 }, { "epoch": 3.1459731543624163, "grad_norm": 0.4453125, "learning_rate": 9.241305810516391e-05, "loss": 0.1884, "step": 7500 }, { "epoch": 3.1476510067114094, "grad_norm": 0.40234375, "learning_rate": 9.226558993117174e-05, "loss": 0.2031, "step": 7504 }, { "epoch": 3.1493288590604025, "grad_norm": 0.609375, "learning_rate": 9.211818723442562e-05, "loss": 0.2279, "step": 7508 }, { "epoch": 3.151006711409396, "grad_norm": 0.53125, "learning_rate": 9.197085018209668e-05, "loss": 0.1125, "step": 7512 }, { "epoch": 3.152684563758389, "grad_norm": 0.6328125, "learning_rate": 9.182357894128144e-05, "loss": 0.2932, "step": 7516 }, { "epoch": 3.1543624161073827, "grad_norm": 0.5859375, "learning_rate": 9.167637367900191e-05, "loss": 0.2184, "step": 7520 }, { "epoch": 3.1560402684563758, "grad_norm": 0.63671875, "learning_rate": 9.152923456220525e-05, "loss": 0.2321, "step": 7524 }, { "epoch": 3.1577181208053693, "grad_norm": 0.515625, "learning_rate": 9.138216175776358e-05, "loss": 0.2567, "step": 7528 }, { "epoch": 3.1593959731543624, "grad_norm": 0.54296875, "learning_rate": 9.123515543247385e-05, "loss": 0.3591, "step": 7532 }, { "epoch": 3.1610738255033555, "grad_norm": 0.6015625, "learning_rate": 9.10882157530576e-05, "loss": 0.2685, "step": 7536 }, { "epoch": 3.162751677852349, "grad_norm": 0.494140625, "learning_rate": 9.09413428861607e-05, "loss": 0.1791, "step": 7540 }, { "epoch": 3.164429530201342, "grad_norm": 0.52734375, "learning_rate": 9.079453699835339e-05, "loss": 0.2556, "step": 7544 }, { "epoch": 3.1661073825503356, "grad_norm": 0.73828125, "learning_rate": 9.064779825612991e-05, "loss": 0.2411, "step": 7548 }, { "epoch": 3.1677852348993287, "grad_norm": 0.56640625, "learning_rate": 9.050112682590826e-05, "loss": 0.267, "step": 7552 }, { "epoch": 3.1694630872483223, "grad_norm": 0.46875, "learning_rate": 9.03545228740302e-05, "loss": 0.2421, "step": 7556 }, { "epoch": 3.1711409395973154, "grad_norm": 0.53515625, "learning_rate": 9.02079865667609e-05, "loss": 0.2889, "step": 7560 }, { "epoch": 3.172818791946309, "grad_norm": 0.58203125, "learning_rate": 9.00615180702889e-05, "loss": 0.3366, "step": 7564 }, { "epoch": 3.174496644295302, "grad_norm": 0.625, "learning_rate": 8.991511755072571e-05, "loss": 0.2449, "step": 7568 }, { "epoch": 3.176174496644295, "grad_norm": 0.5078125, "learning_rate": 8.976878517410585e-05, "loss": 0.2587, "step": 7572 }, { "epoch": 3.1778523489932886, "grad_norm": 0.54296875, "learning_rate": 8.962252110638653e-05, "loss": 0.2599, "step": 7576 }, { "epoch": 3.1795302013422817, "grad_norm": 0.58203125, "learning_rate": 8.947632551344742e-05, "loss": 0.2203, "step": 7580 }, { "epoch": 3.1812080536912752, "grad_norm": 0.60546875, "learning_rate": 8.933019856109064e-05, "loss": 0.2453, "step": 7584 }, { "epoch": 3.1828859060402683, "grad_norm": 0.53515625, "learning_rate": 8.918414041504041e-05, "loss": 0.2208, "step": 7588 }, { "epoch": 3.184563758389262, "grad_norm": 0.5, "learning_rate": 8.90381512409429e-05, "loss": 0.2832, "step": 7592 }, { "epoch": 3.186241610738255, "grad_norm": 0.5703125, "learning_rate": 8.88922312043661e-05, "loss": 0.1791, "step": 7596 }, { "epoch": 3.1879194630872485, "grad_norm": 0.53515625, "learning_rate": 8.874638047079962e-05, "loss": 0.3251, "step": 7600 }, { "epoch": 3.1895973154362416, "grad_norm": 0.609375, "learning_rate": 8.86005992056543e-05, "loss": 0.3153, "step": 7604 }, { "epoch": 3.1912751677852347, "grad_norm": 0.6875, "learning_rate": 8.845488757426244e-05, "loss": 0.3372, "step": 7608 }, { "epoch": 3.192953020134228, "grad_norm": 0.5390625, "learning_rate": 8.830924574187716e-05, "loss": 0.2473, "step": 7612 }, { "epoch": 3.1946308724832213, "grad_norm": 0.53515625, "learning_rate": 8.816367387367252e-05, "loss": 0.2035, "step": 7616 }, { "epoch": 3.196308724832215, "grad_norm": 0.734375, "learning_rate": 8.80181721347433e-05, "loss": 0.4135, "step": 7620 }, { "epoch": 3.197986577181208, "grad_norm": 0.56640625, "learning_rate": 8.787274069010458e-05, "loss": 0.2432, "step": 7624 }, { "epoch": 3.1996644295302015, "grad_norm": 0.609375, "learning_rate": 8.77273797046918e-05, "loss": 0.1493, "step": 7628 }, { "epoch": 3.2013422818791946, "grad_norm": 0.5546875, "learning_rate": 8.758208934336053e-05, "loss": 0.1488, "step": 7632 }, { "epoch": 3.203020134228188, "grad_norm": 0.69921875, "learning_rate": 8.743686977088622e-05, "loss": 0.2576, "step": 7636 }, { "epoch": 3.204697986577181, "grad_norm": 0.318359375, "learning_rate": 8.729172115196399e-05, "loss": 0.1544, "step": 7640 }, { "epoch": 3.2063758389261743, "grad_norm": 0.40234375, "learning_rate": 8.714664365120846e-05, "loss": 0.2258, "step": 7644 }, { "epoch": 3.208053691275168, "grad_norm": 0.59765625, "learning_rate": 8.70016374331538e-05, "loss": 0.2396, "step": 7648 }, { "epoch": 3.209731543624161, "grad_norm": 0.546875, "learning_rate": 8.685670266225314e-05, "loss": 0.2853, "step": 7652 }, { "epoch": 3.2114093959731544, "grad_norm": 0.66796875, "learning_rate": 8.671183950287872e-05, "loss": 0.2247, "step": 7656 }, { "epoch": 3.2130872483221475, "grad_norm": 0.640625, "learning_rate": 8.656704811932133e-05, "loss": 0.2706, "step": 7660 }, { "epoch": 3.214765100671141, "grad_norm": 0.546875, "learning_rate": 8.64223286757906e-05, "loss": 0.2041, "step": 7664 }, { "epoch": 3.216442953020134, "grad_norm": 0.609375, "learning_rate": 8.627768133641445e-05, "loss": 0.3334, "step": 7668 }, { "epoch": 3.2181208053691277, "grad_norm": 0.498046875, "learning_rate": 8.613310626523909e-05, "loss": 0.336, "step": 7672 }, { "epoch": 3.219798657718121, "grad_norm": 0.4140625, "learning_rate": 8.598860362622872e-05, "loss": 0.2065, "step": 7676 }, { "epoch": 3.221476510067114, "grad_norm": 0.53515625, "learning_rate": 8.584417358326549e-05, "loss": 0.2194, "step": 7680 }, { "epoch": 3.2231543624161074, "grad_norm": 0.53125, "learning_rate": 8.569981630014901e-05, "loss": 0.2791, "step": 7684 }, { "epoch": 3.2248322147651005, "grad_norm": 0.703125, "learning_rate": 8.55555319405966e-05, "loss": 0.2765, "step": 7688 }, { "epoch": 3.226510067114094, "grad_norm": 0.66015625, "learning_rate": 8.54113206682427e-05, "loss": 0.281, "step": 7692 }, { "epoch": 3.228187919463087, "grad_norm": 0.54296875, "learning_rate": 8.526718264663903e-05, "loss": 0.2075, "step": 7696 }, { "epoch": 3.2298657718120807, "grad_norm": 0.515625, "learning_rate": 8.512311803925417e-05, "loss": 0.2685, "step": 7700 }, { "epoch": 3.2315436241610738, "grad_norm": 0.7734375, "learning_rate": 8.497912700947329e-05, "loss": 0.2652, "step": 7704 }, { "epoch": 3.2332214765100673, "grad_norm": 0.6484375, "learning_rate": 8.483520972059829e-05, "loss": 0.3238, "step": 7708 }, { "epoch": 3.2348993288590604, "grad_norm": 0.55859375, "learning_rate": 8.469136633584743e-05, "loss": 0.2573, "step": 7712 }, { "epoch": 3.2365771812080535, "grad_norm": 0.5, "learning_rate": 8.454759701835511e-05, "loss": 0.2399, "step": 7716 }, { "epoch": 3.238255033557047, "grad_norm": 0.53125, "learning_rate": 8.440390193117169e-05, "loss": 0.2733, "step": 7720 }, { "epoch": 3.23993288590604, "grad_norm": 0.66015625, "learning_rate": 8.426028123726343e-05, "loss": 0.2226, "step": 7724 }, { "epoch": 3.2416107382550337, "grad_norm": 0.63671875, "learning_rate": 8.411673509951219e-05, "loss": 0.2737, "step": 7728 }, { "epoch": 3.2432885906040267, "grad_norm": 0.546875, "learning_rate": 8.397326368071522e-05, "loss": 0.2277, "step": 7732 }, { "epoch": 3.2449664429530203, "grad_norm": 0.68359375, "learning_rate": 8.382986714358511e-05, "loss": 0.2552, "step": 7736 }, { "epoch": 3.2466442953020134, "grad_norm": 0.515625, "learning_rate": 8.368654565074956e-05, "loss": 0.1705, "step": 7740 }, { "epoch": 3.248322147651007, "grad_norm": 0.5703125, "learning_rate": 8.354329936475095e-05, "loss": 0.314, "step": 7744 }, { "epoch": 3.25, "grad_norm": 0.498046875, "learning_rate": 8.340012844804656e-05, "loss": 0.2057, "step": 7748 }, { "epoch": 3.251677852348993, "grad_norm": 0.4921875, "learning_rate": 8.325703306300816e-05, "loss": 0.2778, "step": 7752 }, { "epoch": 3.2533557046979866, "grad_norm": 0.447265625, "learning_rate": 8.311401337192186e-05, "loss": 0.3023, "step": 7756 }, { "epoch": 3.2550335570469797, "grad_norm": 0.546875, "learning_rate": 8.297106953698785e-05, "loss": 0.2312, "step": 7760 }, { "epoch": 3.2567114093959733, "grad_norm": 0.5625, "learning_rate": 8.282820172032046e-05, "loss": 0.1638, "step": 7764 }, { "epoch": 3.2583892617449663, "grad_norm": 0.6015625, "learning_rate": 8.268541008394758e-05, "loss": 0.2511, "step": 7768 }, { "epoch": 3.26006711409396, "grad_norm": 0.494140625, "learning_rate": 8.254269478981083e-05, "loss": 0.2457, "step": 7772 }, { "epoch": 3.261744966442953, "grad_norm": 0.48828125, "learning_rate": 8.240005599976523e-05, "loss": 0.2414, "step": 7776 }, { "epoch": 3.2634228187919465, "grad_norm": 0.57421875, "learning_rate": 8.22574938755791e-05, "loss": 0.2886, "step": 7780 }, { "epoch": 3.2651006711409396, "grad_norm": 0.3984375, "learning_rate": 8.21150085789337e-05, "loss": 0.1576, "step": 7784 }, { "epoch": 3.2667785234899327, "grad_norm": 0.6171875, "learning_rate": 8.197260027142333e-05, "loss": 0.1554, "step": 7788 }, { "epoch": 3.2684563758389262, "grad_norm": 0.578125, "learning_rate": 8.183026911455468e-05, "loss": 0.2847, "step": 7792 }, { "epoch": 3.2701342281879193, "grad_norm": 0.83984375, "learning_rate": 8.168801526974721e-05, "loss": 0.2918, "step": 7796 }, { "epoch": 3.271812080536913, "grad_norm": 0.57421875, "learning_rate": 8.15458388983326e-05, "loss": 0.2163, "step": 7800 }, { "epoch": 3.273489932885906, "grad_norm": 0.546875, "learning_rate": 8.140374016155469e-05, "loss": 0.1236, "step": 7804 }, { "epoch": 3.2751677852348995, "grad_norm": 0.53515625, "learning_rate": 8.126171922056927e-05, "loss": 0.3372, "step": 7808 }, { "epoch": 3.2768456375838926, "grad_norm": 0.57421875, "learning_rate": 8.111977623644386e-05, "loss": 0.1732, "step": 7812 }, { "epoch": 3.278523489932886, "grad_norm": 0.55859375, "learning_rate": 8.097791137015763e-05, "loss": 0.2897, "step": 7816 }, { "epoch": 3.280201342281879, "grad_norm": 0.49609375, "learning_rate": 8.083612478260109e-05, "loss": 0.2073, "step": 7820 }, { "epoch": 3.2818791946308723, "grad_norm": 0.494140625, "learning_rate": 8.069441663457605e-05, "loss": 0.3146, "step": 7824 }, { "epoch": 3.283557046979866, "grad_norm": 0.57421875, "learning_rate": 8.055278708679537e-05, "loss": 0.2163, "step": 7828 }, { "epoch": 3.285234899328859, "grad_norm": 0.458984375, "learning_rate": 8.041123629988256e-05, "loss": 0.2049, "step": 7832 }, { "epoch": 3.2869127516778525, "grad_norm": 0.62109375, "learning_rate": 8.026976443437209e-05, "loss": 0.2677, "step": 7836 }, { "epoch": 3.2885906040268456, "grad_norm": 0.7109375, "learning_rate": 8.012837165070876e-05, "loss": 0.3922, "step": 7840 }, { "epoch": 3.290268456375839, "grad_norm": 0.6328125, "learning_rate": 7.99870581092477e-05, "loss": 0.2308, "step": 7844 }, { "epoch": 3.291946308724832, "grad_norm": 0.486328125, "learning_rate": 7.984582397025432e-05, "loss": 0.2923, "step": 7848 }, { "epoch": 3.2936241610738257, "grad_norm": 0.58984375, "learning_rate": 7.970466939390368e-05, "loss": 0.2656, "step": 7852 }, { "epoch": 3.295302013422819, "grad_norm": 0.58203125, "learning_rate": 7.956359454028081e-05, "loss": 0.2724, "step": 7856 }, { "epoch": 3.296979865771812, "grad_norm": 0.53125, "learning_rate": 7.942259956938038e-05, "loss": 0.2325, "step": 7860 }, { "epoch": 3.2986577181208054, "grad_norm": 0.53515625, "learning_rate": 7.92816846411063e-05, "loss": 0.1318, "step": 7864 }, { "epoch": 3.3003355704697985, "grad_norm": 0.59375, "learning_rate": 7.914084991527183e-05, "loss": 0.2177, "step": 7868 }, { "epoch": 3.302013422818792, "grad_norm": 0.447265625, "learning_rate": 7.900009555159925e-05, "loss": 0.2425, "step": 7872 }, { "epoch": 3.303691275167785, "grad_norm": 0.640625, "learning_rate": 7.885942170971958e-05, "loss": 0.1989, "step": 7876 }, { "epoch": 3.3053691275167787, "grad_norm": 0.61328125, "learning_rate": 7.871882854917264e-05, "loss": 0.2667, "step": 7880 }, { "epoch": 3.307046979865772, "grad_norm": 0.625, "learning_rate": 7.857831622940675e-05, "loss": 0.2829, "step": 7884 }, { "epoch": 3.3087248322147653, "grad_norm": 0.5703125, "learning_rate": 7.843788490977852e-05, "loss": 0.238, "step": 7888 }, { "epoch": 3.3104026845637584, "grad_norm": 0.5546875, "learning_rate": 7.829753474955265e-05, "loss": 0.2693, "step": 7892 }, { "epoch": 3.3120805369127515, "grad_norm": 0.486328125, "learning_rate": 7.815726590790189e-05, "loss": 0.3224, "step": 7896 }, { "epoch": 3.313758389261745, "grad_norm": 0.58203125, "learning_rate": 7.801707854390668e-05, "loss": 0.2428, "step": 7900 }, { "epoch": 3.315436241610738, "grad_norm": 0.70703125, "learning_rate": 7.787697281655513e-05, "loss": 0.225, "step": 7904 }, { "epoch": 3.3171140939597317, "grad_norm": 0.58203125, "learning_rate": 7.773694888474267e-05, "loss": 0.2382, "step": 7908 }, { "epoch": 3.3187919463087248, "grad_norm": 0.5703125, "learning_rate": 7.759700690727214e-05, "loss": 0.2957, "step": 7912 }, { "epoch": 3.3204697986577183, "grad_norm": 0.5859375, "learning_rate": 7.745714704285317e-05, "loss": 0.2393, "step": 7916 }, { "epoch": 3.3221476510067114, "grad_norm": 0.65234375, "learning_rate": 7.731736945010247e-05, "loss": 0.2699, "step": 7920 }, { "epoch": 3.323825503355705, "grad_norm": 0.421875, "learning_rate": 7.71776742875434e-05, "loss": 0.246, "step": 7924 }, { "epoch": 3.325503355704698, "grad_norm": 0.6171875, "learning_rate": 7.703806171360581e-05, "loss": 0.2586, "step": 7928 }, { "epoch": 3.327181208053691, "grad_norm": 0.609375, "learning_rate": 7.689853188662588e-05, "loss": 0.241, "step": 7932 }, { "epoch": 3.3288590604026846, "grad_norm": 0.55078125, "learning_rate": 7.675908496484605e-05, "loss": 0.2787, "step": 7936 }, { "epoch": 3.3305369127516777, "grad_norm": 0.53125, "learning_rate": 7.661972110641448e-05, "loss": 0.2468, "step": 7940 }, { "epoch": 3.3322147651006713, "grad_norm": 0.5859375, "learning_rate": 7.648044046938543e-05, "loss": 0.3034, "step": 7944 }, { "epoch": 3.3338926174496644, "grad_norm": 0.4921875, "learning_rate": 7.634124321171858e-05, "loss": 0.2464, "step": 7948 }, { "epoch": 3.335570469798658, "grad_norm": 0.69140625, "learning_rate": 7.620212949127916e-05, "loss": 0.2607, "step": 7952 }, { "epoch": 3.337248322147651, "grad_norm": 0.396484375, "learning_rate": 7.606309946583762e-05, "loss": 0.1957, "step": 7956 }, { "epoch": 3.3389261744966445, "grad_norm": 0.94140625, "learning_rate": 7.592415329306939e-05, "loss": 0.2283, "step": 7960 }, { "epoch": 3.3406040268456376, "grad_norm": 0.44140625, "learning_rate": 7.578529113055493e-05, "loss": 0.2285, "step": 7964 }, { "epoch": 3.3422818791946307, "grad_norm": 0.49609375, "learning_rate": 7.564651313577943e-05, "loss": 0.2107, "step": 7968 }, { "epoch": 3.3439597315436242, "grad_norm": 0.421875, "learning_rate": 7.550781946613253e-05, "loss": 0.1816, "step": 7972 }, { "epoch": 3.3456375838926173, "grad_norm": 0.5390625, "learning_rate": 7.536921027890832e-05, "loss": 0.2161, "step": 7976 }, { "epoch": 3.347315436241611, "grad_norm": 0.421875, "learning_rate": 7.523068573130503e-05, "loss": 0.2852, "step": 7980 }, { "epoch": 3.348993288590604, "grad_norm": 0.58984375, "learning_rate": 7.50922459804249e-05, "loss": 0.2497, "step": 7984 }, { "epoch": 3.3506711409395975, "grad_norm": 0.546875, "learning_rate": 7.495389118327407e-05, "loss": 0.253, "step": 7988 }, { "epoch": 3.3523489932885906, "grad_norm": 0.56640625, "learning_rate": 7.48156214967622e-05, "loss": 0.1738, "step": 7992 }, { "epoch": 3.354026845637584, "grad_norm": 0.73046875, "learning_rate": 7.467743707770254e-05, "loss": 0.2714, "step": 7996 }, { "epoch": 3.3557046979865772, "grad_norm": 0.76953125, "learning_rate": 7.453933808281166e-05, "loss": 0.3228, "step": 8000 }, { "epoch": 3.3573825503355703, "grad_norm": 0.60546875, "learning_rate": 7.440132466870907e-05, "loss": 0.227, "step": 8004 }, { "epoch": 3.359060402684564, "grad_norm": 0.451171875, "learning_rate": 7.42633969919174e-05, "loss": 0.3254, "step": 8008 }, { "epoch": 3.360738255033557, "grad_norm": 0.5546875, "learning_rate": 7.412555520886197e-05, "loss": 0.2516, "step": 8012 }, { "epoch": 3.3624161073825505, "grad_norm": 0.67578125, "learning_rate": 7.398779947587073e-05, "loss": 0.2298, "step": 8016 }, { "epoch": 3.3640939597315436, "grad_norm": 0.64453125, "learning_rate": 7.385012994917404e-05, "loss": 0.2453, "step": 8020 }, { "epoch": 3.365771812080537, "grad_norm": 0.45703125, "learning_rate": 7.371254678490437e-05, "loss": 0.265, "step": 8024 }, { "epoch": 3.36744966442953, "grad_norm": 0.79296875, "learning_rate": 7.357505013909638e-05, "loss": 0.2879, "step": 8028 }, { "epoch": 3.3691275167785237, "grad_norm": 0.494140625, "learning_rate": 7.343764016768657e-05, "loss": 0.2152, "step": 8032 }, { "epoch": 3.370805369127517, "grad_norm": 0.5859375, "learning_rate": 7.330031702651316e-05, "loss": 0.2537, "step": 8036 }, { "epoch": 3.37248322147651, "grad_norm": 0.53125, "learning_rate": 7.316308087131586e-05, "loss": 0.1672, "step": 8040 }, { "epoch": 3.3741610738255035, "grad_norm": 0.71484375, "learning_rate": 7.302593185773579e-05, "loss": 0.264, "step": 8044 }, { "epoch": 3.3758389261744965, "grad_norm": 0.5078125, "learning_rate": 7.288887014131508e-05, "loss": 0.3228, "step": 8048 }, { "epoch": 3.37751677852349, "grad_norm": 0.625, "learning_rate": 7.275189587749705e-05, "loss": 0.3881, "step": 8052 }, { "epoch": 3.379194630872483, "grad_norm": 0.5234375, "learning_rate": 7.261500922162568e-05, "loss": 0.1271, "step": 8056 }, { "epoch": 3.3808724832214763, "grad_norm": 0.8515625, "learning_rate": 7.247821032894586e-05, "loss": 0.2341, "step": 8060 }, { "epoch": 3.38255033557047, "grad_norm": 0.435546875, "learning_rate": 7.234149935460258e-05, "loss": 0.1703, "step": 8064 }, { "epoch": 3.384228187919463, "grad_norm": 0.6015625, "learning_rate": 7.220487645364134e-05, "loss": 0.3153, "step": 8068 }, { "epoch": 3.3859060402684564, "grad_norm": 0.515625, "learning_rate": 7.206834178100773e-05, "loss": 0.2763, "step": 8072 }, { "epoch": 3.3875838926174495, "grad_norm": 0.5546875, "learning_rate": 7.193189549154727e-05, "loss": 0.3204, "step": 8076 }, { "epoch": 3.389261744966443, "grad_norm": 0.5546875, "learning_rate": 7.17955377400052e-05, "loss": 0.257, "step": 8080 }, { "epoch": 3.390939597315436, "grad_norm": 0.5703125, "learning_rate": 7.16592686810265e-05, "loss": 0.3947, "step": 8084 }, { "epoch": 3.3926174496644297, "grad_norm": 0.578125, "learning_rate": 7.152308846915529e-05, "loss": 0.3716, "step": 8088 }, { "epoch": 3.3942953020134228, "grad_norm": 0.69140625, "learning_rate": 7.138699725883515e-05, "loss": 0.3912, "step": 8092 }, { "epoch": 3.395973154362416, "grad_norm": 0.5703125, "learning_rate": 7.125099520440867e-05, "loss": 0.3482, "step": 8096 }, { "epoch": 3.3976510067114094, "grad_norm": 0.45703125, "learning_rate": 7.11150824601173e-05, "loss": 0.2486, "step": 8100 }, { "epoch": 3.3993288590604025, "grad_norm": 0.7578125, "learning_rate": 7.097925918010122e-05, "loss": 0.3197, "step": 8104 }, { "epoch": 3.401006711409396, "grad_norm": 0.53515625, "learning_rate": 7.084352551839922e-05, "loss": 0.2139, "step": 8108 }, { "epoch": 3.402684563758389, "grad_norm": 0.50390625, "learning_rate": 7.070788162894824e-05, "loss": 0.2074, "step": 8112 }, { "epoch": 3.4043624161073827, "grad_norm": 0.53125, "learning_rate": 7.057232766558363e-05, "loss": 0.1932, "step": 8116 }, { "epoch": 3.4060402684563758, "grad_norm": 0.69921875, "learning_rate": 7.043686378203863e-05, "loss": 0.2297, "step": 8120 }, { "epoch": 3.4077181208053693, "grad_norm": 0.62109375, "learning_rate": 7.030149013194439e-05, "loss": 0.2138, "step": 8124 }, { "epoch": 3.4093959731543624, "grad_norm": 0.443359375, "learning_rate": 7.016620686882974e-05, "loss": 0.1426, "step": 8128 }, { "epoch": 3.4110738255033555, "grad_norm": 0.51953125, "learning_rate": 7.003101414612086e-05, "loss": 0.2495, "step": 8132 }, { "epoch": 3.412751677852349, "grad_norm": 0.498046875, "learning_rate": 6.989591211714137e-05, "loss": 0.2462, "step": 8136 }, { "epoch": 3.414429530201342, "grad_norm": 0.5078125, "learning_rate": 6.976090093511196e-05, "loss": 0.204, "step": 8140 }, { "epoch": 3.4161073825503356, "grad_norm": 0.5390625, "learning_rate": 6.962598075315046e-05, "loss": 0.3897, "step": 8144 }, { "epoch": 3.4177852348993287, "grad_norm": 0.60546875, "learning_rate": 6.949115172427137e-05, "loss": 0.2565, "step": 8148 }, { "epoch": 3.4194630872483223, "grad_norm": 0.60546875, "learning_rate": 6.935641400138568e-05, "loss": 0.1876, "step": 8152 }, { "epoch": 3.4211409395973154, "grad_norm": 0.59375, "learning_rate": 6.922176773730104e-05, "loss": 0.2416, "step": 8156 }, { "epoch": 3.422818791946309, "grad_norm": 0.52734375, "learning_rate": 6.908721308472129e-05, "loss": 0.2524, "step": 8160 }, { "epoch": 3.424496644295302, "grad_norm": 0.69921875, "learning_rate": 6.895275019624638e-05, "loss": 0.2847, "step": 8164 }, { "epoch": 3.426174496644295, "grad_norm": 0.77734375, "learning_rate": 6.881837922437225e-05, "loss": 0.3039, "step": 8168 }, { "epoch": 3.4278523489932886, "grad_norm": 0.6328125, "learning_rate": 6.86841003214904e-05, "loss": 0.3318, "step": 8172 }, { "epoch": 3.4295302013422817, "grad_norm": 0.48828125, "learning_rate": 6.854991363988815e-05, "loss": 0.2378, "step": 8176 }, { "epoch": 3.4312080536912752, "grad_norm": 0.6015625, "learning_rate": 6.841581933174808e-05, "loss": 0.2657, "step": 8180 }, { "epoch": 3.4328859060402683, "grad_norm": 0.7890625, "learning_rate": 6.828181754914811e-05, "loss": 0.3038, "step": 8184 }, { "epoch": 3.434563758389262, "grad_norm": 0.54296875, "learning_rate": 6.814790844406112e-05, "loss": 0.308, "step": 8188 }, { "epoch": 3.436241610738255, "grad_norm": 0.65234375, "learning_rate": 6.801409216835502e-05, "loss": 0.2831, "step": 8192 }, { "epoch": 3.4379194630872485, "grad_norm": 0.3671875, "learning_rate": 6.788036887379228e-05, "loss": 0.2824, "step": 8196 }, { "epoch": 3.4395973154362416, "grad_norm": 0.58203125, "learning_rate": 6.774673871202998e-05, "loss": 0.2404, "step": 8200 }, { "epoch": 3.4412751677852347, "grad_norm": 0.55078125, "learning_rate": 6.761320183461966e-05, "loss": 0.1525, "step": 8204 }, { "epoch": 3.442953020134228, "grad_norm": 0.7109375, "learning_rate": 6.747975839300697e-05, "loss": 0.2468, "step": 8208 }, { "epoch": 3.4446308724832213, "grad_norm": 0.65625, "learning_rate": 6.734640853853163e-05, "loss": 0.2621, "step": 8212 }, { "epoch": 3.446308724832215, "grad_norm": 0.486328125, "learning_rate": 6.721315242242727e-05, "loss": 0.1691, "step": 8216 }, { "epoch": 3.447986577181208, "grad_norm": 0.62109375, "learning_rate": 6.707999019582103e-05, "loss": 0.1527, "step": 8220 }, { "epoch": 3.4496644295302015, "grad_norm": 0.6171875, "learning_rate": 6.694692200973372e-05, "loss": 0.2049, "step": 8224 }, { "epoch": 3.4513422818791946, "grad_norm": 0.62109375, "learning_rate": 6.681394801507955e-05, "loss": 0.3581, "step": 8228 }, { "epoch": 3.453020134228188, "grad_norm": 0.6171875, "learning_rate": 6.668106836266586e-05, "loss": 0.2265, "step": 8232 }, { "epoch": 3.454697986577181, "grad_norm": 0.453125, "learning_rate": 6.654828320319284e-05, "loss": 0.1955, "step": 8236 }, { "epoch": 3.4563758389261743, "grad_norm": 0.515625, "learning_rate": 6.64155926872537e-05, "loss": 0.2497, "step": 8240 }, { "epoch": 3.458053691275168, "grad_norm": 0.578125, "learning_rate": 6.628299696533424e-05, "loss": 0.2761, "step": 8244 }, { "epoch": 3.459731543624161, "grad_norm": 0.66015625, "learning_rate": 6.615049618781277e-05, "loss": 0.2052, "step": 8248 }, { "epoch": 3.4614093959731544, "grad_norm": 0.5390625, "learning_rate": 6.601809050495993e-05, "loss": 0.1981, "step": 8252 }, { "epoch": 3.4630872483221475, "grad_norm": 0.5703125, "learning_rate": 6.588578006693853e-05, "loss": 0.3031, "step": 8256 }, { "epoch": 3.464765100671141, "grad_norm": 0.796875, "learning_rate": 6.575356502380322e-05, "loss": 0.2283, "step": 8260 }, { "epoch": 3.466442953020134, "grad_norm": 0.69140625, "learning_rate": 6.562144552550066e-05, "loss": 0.2358, "step": 8264 }, { "epoch": 3.4681208053691277, "grad_norm": 0.52734375, "learning_rate": 6.548942172186902e-05, "loss": 0.2063, "step": 8268 }, { "epoch": 3.469798657718121, "grad_norm": 0.57421875, "learning_rate": 6.535749376263804e-05, "loss": 0.1299, "step": 8272 }, { "epoch": 3.471476510067114, "grad_norm": 0.625, "learning_rate": 6.522566179742871e-05, "loss": 0.3288, "step": 8276 }, { "epoch": 3.4731543624161074, "grad_norm": 0.65234375, "learning_rate": 6.509392597575309e-05, "loss": 0.2898, "step": 8280 }, { "epoch": 3.4748322147651005, "grad_norm": 0.74609375, "learning_rate": 6.496228644701425e-05, "loss": 0.3446, "step": 8284 }, { "epoch": 3.476510067114094, "grad_norm": 0.53125, "learning_rate": 6.483074336050613e-05, "loss": 0.3726, "step": 8288 }, { "epoch": 3.478187919463087, "grad_norm": 0.54296875, "learning_rate": 6.46992968654132e-05, "loss": 0.3203, "step": 8292 }, { "epoch": 3.4798657718120807, "grad_norm": 0.52734375, "learning_rate": 6.456794711081041e-05, "loss": 0.2939, "step": 8296 }, { "epoch": 3.4815436241610738, "grad_norm": 0.65234375, "learning_rate": 6.443669424566309e-05, "loss": 0.2624, "step": 8300 }, { "epoch": 3.4832214765100673, "grad_norm": 0.5078125, "learning_rate": 6.430553841882643e-05, "loss": 0.2358, "step": 8304 }, { "epoch": 3.4848993288590604, "grad_norm": 0.486328125, "learning_rate": 6.417447977904577e-05, "loss": 0.1635, "step": 8308 }, { "epoch": 3.4865771812080535, "grad_norm": 0.6015625, "learning_rate": 6.40435184749563e-05, "loss": 0.1953, "step": 8312 }, { "epoch": 3.488255033557047, "grad_norm": 0.7109375, "learning_rate": 6.391265465508262e-05, "loss": 0.2652, "step": 8316 }, { "epoch": 3.48993288590604, "grad_norm": 0.44140625, "learning_rate": 6.378188846783898e-05, "loss": 0.1834, "step": 8320 }, { "epoch": 3.4916107382550337, "grad_norm": 0.5078125, "learning_rate": 6.365122006152863e-05, "loss": 0.215, "step": 8324 }, { "epoch": 3.4932885906040267, "grad_norm": 0.58984375, "learning_rate": 6.352064958434417e-05, "loss": 0.2787, "step": 8328 }, { "epoch": 3.4949664429530203, "grad_norm": 0.65234375, "learning_rate": 6.339017718436705e-05, "loss": 0.2866, "step": 8332 }, { "epoch": 3.4966442953020134, "grad_norm": 0.58203125, "learning_rate": 6.325980300956748e-05, "loss": 0.2395, "step": 8336 }, { "epoch": 3.498322147651007, "grad_norm": 0.80859375, "learning_rate": 6.312952720780432e-05, "loss": 0.2298, "step": 8340 }, { "epoch": 3.5, "grad_norm": 0.5703125, "learning_rate": 6.299934992682476e-05, "loss": 0.2802, "step": 8344 }, { "epoch": 3.501677852348993, "grad_norm": 0.59375, "learning_rate": 6.286927131426436e-05, "loss": 0.2242, "step": 8348 }, { "epoch": 3.5033557046979866, "grad_norm": 0.5546875, "learning_rate": 6.273929151764673e-05, "loss": 0.2489, "step": 8352 }, { "epoch": 3.5050335570469797, "grad_norm": 0.6171875, "learning_rate": 6.260941068438344e-05, "loss": 0.3444, "step": 8356 }, { "epoch": 3.5067114093959733, "grad_norm": 0.52734375, "learning_rate": 6.247962896177379e-05, "loss": 0.2582, "step": 8360 }, { "epoch": 3.5083892617449663, "grad_norm": 0.6640625, "learning_rate": 6.234994649700475e-05, "loss": 0.1736, "step": 8364 }, { "epoch": 3.51006711409396, "grad_norm": 0.64453125, "learning_rate": 6.22203634371506e-05, "loss": 0.254, "step": 8368 }, { "epoch": 3.511744966442953, "grad_norm": 0.515625, "learning_rate": 6.209087992917297e-05, "loss": 0.2344, "step": 8372 }, { "epoch": 3.5134228187919465, "grad_norm": 0.55078125, "learning_rate": 6.196149611992056e-05, "loss": 0.3243, "step": 8376 }, { "epoch": 3.5151006711409396, "grad_norm": 0.4296875, "learning_rate": 6.183221215612904e-05, "loss": 0.274, "step": 8380 }, { "epoch": 3.5167785234899327, "grad_norm": 0.44921875, "learning_rate": 6.170302818442078e-05, "loss": 0.144, "step": 8384 }, { "epoch": 3.5184563758389262, "grad_norm": 0.458984375, "learning_rate": 6.157394435130485e-05, "loss": 0.2187, "step": 8388 }, { "epoch": 3.5201342281879193, "grad_norm": 0.439453125, "learning_rate": 6.144496080317654e-05, "loss": 0.1676, "step": 8392 }, { "epoch": 3.521812080536913, "grad_norm": 0.6484375, "learning_rate": 6.131607768631766e-05, "loss": 0.2027, "step": 8396 }, { "epoch": 3.523489932885906, "grad_norm": 0.38671875, "learning_rate": 6.1187295146896e-05, "loss": 0.211, "step": 8400 }, { "epoch": 3.5251677852348995, "grad_norm": 0.59765625, "learning_rate": 6.105861333096532e-05, "loss": 0.3097, "step": 8404 }, { "epoch": 3.5268456375838926, "grad_norm": 0.51953125, "learning_rate": 6.0930032384465024e-05, "loss": 0.2286, "step": 8408 }, { "epoch": 3.528523489932886, "grad_norm": 0.73046875, "learning_rate": 6.080155245322025e-05, "loss": 0.3472, "step": 8412 }, { "epoch": 3.530201342281879, "grad_norm": 0.65234375, "learning_rate": 6.067317368294157e-05, "loss": 0.1923, "step": 8416 }, { "epoch": 3.5318791946308723, "grad_norm": 0.6171875, "learning_rate": 6.0544896219224764e-05, "loss": 0.175, "step": 8420 }, { "epoch": 3.533557046979866, "grad_norm": 0.5859375, "learning_rate": 6.041672020755079e-05, "loss": 0.1617, "step": 8424 }, { "epoch": 3.535234899328859, "grad_norm": 0.5625, "learning_rate": 6.0288645793285516e-05, "loss": 0.2001, "step": 8428 }, { "epoch": 3.5369127516778525, "grad_norm": 0.61328125, "learning_rate": 6.016067312167949e-05, "loss": 0.2185, "step": 8432 }, { "epoch": 3.5385906040268456, "grad_norm": 0.6640625, "learning_rate": 6.003280233786805e-05, "loss": 0.2082, "step": 8436 }, { "epoch": 3.540268456375839, "grad_norm": 0.54296875, "learning_rate": 5.9905033586870843e-05, "loss": 0.2203, "step": 8440 }, { "epoch": 3.541946308724832, "grad_norm": 0.43359375, "learning_rate": 5.977736701359189e-05, "loss": 0.2418, "step": 8444 }, { "epoch": 3.5436241610738257, "grad_norm": 0.55859375, "learning_rate": 5.964980276281933e-05, "loss": 0.1511, "step": 8448 }, { "epoch": 3.545302013422819, "grad_norm": 0.60546875, "learning_rate": 5.952234097922512e-05, "loss": 0.2819, "step": 8452 }, { "epoch": 3.546979865771812, "grad_norm": 0.45703125, "learning_rate": 5.939498180736515e-05, "loss": 0.1642, "step": 8456 }, { "epoch": 3.5486577181208054, "grad_norm": 0.7265625, "learning_rate": 5.92677253916789e-05, "loss": 0.2618, "step": 8460 }, { "epoch": 3.5503355704697985, "grad_norm": 0.490234375, "learning_rate": 5.9140571876489306e-05, "loss": 0.3176, "step": 8464 }, { "epoch": 3.552013422818792, "grad_norm": 0.6484375, "learning_rate": 5.9013521406002615e-05, "loss": 0.2755, "step": 8468 }, { "epoch": 3.553691275167785, "grad_norm": 0.5390625, "learning_rate": 5.888657412430819e-05, "loss": 0.2571, "step": 8472 }, { "epoch": 3.5553691275167782, "grad_norm": 0.5625, "learning_rate": 5.875973017537837e-05, "loss": 0.2635, "step": 8476 }, { "epoch": 3.557046979865772, "grad_norm": 0.69140625, "learning_rate": 5.863298970306831e-05, "loss": 0.3223, "step": 8480 }, { "epoch": 3.5587248322147653, "grad_norm": 0.6015625, "learning_rate": 5.850635285111583e-05, "loss": 0.2364, "step": 8484 }, { "epoch": 3.5604026845637584, "grad_norm": 0.66796875, "learning_rate": 5.837981976314124e-05, "loss": 0.256, "step": 8488 }, { "epoch": 3.5620805369127515, "grad_norm": 0.8671875, "learning_rate": 5.8253390582647055e-05, "loss": 0.3401, "step": 8492 }, { "epoch": 3.563758389261745, "grad_norm": 0.6171875, "learning_rate": 5.812706545301809e-05, "loss": 0.1781, "step": 8496 }, { "epoch": 3.565436241610738, "grad_norm": 0.578125, "learning_rate": 5.800084451752108e-05, "loss": 0.3529, "step": 8500 }, { "epoch": 3.5671140939597317, "grad_norm": 0.43359375, "learning_rate": 5.787472791930463e-05, "loss": 0.1054, "step": 8504 }, { "epoch": 3.5687919463087248, "grad_norm": 0.55859375, "learning_rate": 5.7748715801398985e-05, "loss": 0.2503, "step": 8508 }, { "epoch": 3.570469798657718, "grad_norm": 0.57421875, "learning_rate": 5.7622808306715985e-05, "loss": 0.3237, "step": 8512 }, { "epoch": 3.5721476510067114, "grad_norm": 0.69921875, "learning_rate": 5.749700557804861e-05, "loss": 0.3194, "step": 8516 }, { "epoch": 3.573825503355705, "grad_norm": 0.671875, "learning_rate": 5.737130775807122e-05, "loss": 0.3624, "step": 8520 }, { "epoch": 3.575503355704698, "grad_norm": 0.56640625, "learning_rate": 5.7245714989339136e-05, "loss": 0.2081, "step": 8524 }, { "epoch": 3.577181208053691, "grad_norm": 0.6328125, "learning_rate": 5.7120227414288513e-05, "loss": 0.3412, "step": 8528 }, { "epoch": 3.5788590604026846, "grad_norm": 0.61328125, "learning_rate": 5.699484517523627e-05, "loss": 0.2595, "step": 8532 }, { "epoch": 3.5805369127516777, "grad_norm": 0.50390625, "learning_rate": 5.6869568414379815e-05, "loss": 0.1892, "step": 8536 }, { "epoch": 3.5822147651006713, "grad_norm": 0.53125, "learning_rate": 5.674439727379688e-05, "loss": 0.3495, "step": 8540 }, { "epoch": 3.5838926174496644, "grad_norm": 0.4921875, "learning_rate": 5.6619331895445474e-05, "loss": 0.2164, "step": 8544 }, { "epoch": 3.5855704697986575, "grad_norm": 0.55859375, "learning_rate": 5.64943724211637e-05, "loss": 0.2026, "step": 8548 }, { "epoch": 3.587248322147651, "grad_norm": 0.62890625, "learning_rate": 5.636951899266948e-05, "loss": 0.2235, "step": 8552 }, { "epoch": 3.5889261744966445, "grad_norm": 0.53515625, "learning_rate": 5.624477175156051e-05, "loss": 0.2497, "step": 8556 }, { "epoch": 3.5906040268456376, "grad_norm": 0.5859375, "learning_rate": 5.6120130839314024e-05, "loss": 0.2529, "step": 8560 }, { "epoch": 3.5922818791946307, "grad_norm": 0.61328125, "learning_rate": 5.5995596397286715e-05, "loss": 0.2466, "step": 8564 }, { "epoch": 3.5939597315436242, "grad_norm": 0.59765625, "learning_rate": 5.5871168566714495e-05, "loss": 0.2678, "step": 8568 }, { "epoch": 3.5956375838926173, "grad_norm": 0.640625, "learning_rate": 5.574684748871235e-05, "loss": 0.2706, "step": 8572 }, { "epoch": 3.597315436241611, "grad_norm": 0.58203125, "learning_rate": 5.562263330427428e-05, "loss": 0.2181, "step": 8576 }, { "epoch": 3.598993288590604, "grad_norm": 0.474609375, "learning_rate": 5.54985261542729e-05, "loss": 0.2455, "step": 8580 }, { "epoch": 3.600671140939597, "grad_norm": 0.5703125, "learning_rate": 5.537452617945956e-05, "loss": 0.2686, "step": 8584 }, { "epoch": 3.6023489932885906, "grad_norm": 0.53515625, "learning_rate": 5.525063352046406e-05, "loss": 0.1667, "step": 8588 }, { "epoch": 3.604026845637584, "grad_norm": 0.625, "learning_rate": 5.5126848317794436e-05, "loss": 0.3323, "step": 8592 }, { "epoch": 3.6057046979865772, "grad_norm": 0.625, "learning_rate": 5.5003170711836934e-05, "loss": 0.2523, "step": 8596 }, { "epoch": 3.6073825503355703, "grad_norm": 0.65234375, "learning_rate": 5.487960084285566e-05, "loss": 0.3474, "step": 8600 }, { "epoch": 3.609060402684564, "grad_norm": 0.671875, "learning_rate": 5.47561388509926e-05, "loss": 0.1802, "step": 8604 }, { "epoch": 3.610738255033557, "grad_norm": 0.61328125, "learning_rate": 5.4632784876267415e-05, "loss": 0.2319, "step": 8608 }, { "epoch": 3.6124161073825505, "grad_norm": 0.40234375, "learning_rate": 5.450953905857724e-05, "loss": 0.2182, "step": 8612 }, { "epoch": 3.6140939597315436, "grad_norm": 0.5703125, "learning_rate": 5.4386401537696536e-05, "loss": 0.2849, "step": 8616 }, { "epoch": 3.6157718120805367, "grad_norm": 0.59375, "learning_rate": 5.426337245327702e-05, "loss": 0.3556, "step": 8620 }, { "epoch": 3.61744966442953, "grad_norm": 0.64453125, "learning_rate": 5.4140451944847276e-05, "loss": 0.2355, "step": 8624 }, { "epoch": 3.6191275167785237, "grad_norm": 0.59375, "learning_rate": 5.4017640151812846e-05, "loss": 0.1975, "step": 8628 }, { "epoch": 3.620805369127517, "grad_norm": 0.58203125, "learning_rate": 5.389493721345601e-05, "loss": 0.3559, "step": 8632 }, { "epoch": 3.62248322147651, "grad_norm": 0.640625, "learning_rate": 5.377234326893554e-05, "loss": 0.2103, "step": 8636 }, { "epoch": 3.6241610738255035, "grad_norm": 0.61328125, "learning_rate": 5.3649858457286615e-05, "loss": 0.3406, "step": 8640 }, { "epoch": 3.6258389261744965, "grad_norm": 0.66015625, "learning_rate": 5.352748291742064e-05, "loss": 0.2021, "step": 8644 }, { "epoch": 3.62751677852349, "grad_norm": 0.474609375, "learning_rate": 5.340521678812511e-05, "loss": 0.2228, "step": 8648 }, { "epoch": 3.629194630872483, "grad_norm": 0.58984375, "learning_rate": 5.3283060208063413e-05, "loss": 0.2881, "step": 8652 }, { "epoch": 3.6308724832214763, "grad_norm": 0.56640625, "learning_rate": 5.316101331577469e-05, "loss": 0.2687, "step": 8656 }, { "epoch": 3.63255033557047, "grad_norm": 0.470703125, "learning_rate": 5.303907624967377e-05, "loss": 0.2265, "step": 8660 }, { "epoch": 3.6342281879194633, "grad_norm": 0.50390625, "learning_rate": 5.291724914805073e-05, "loss": 0.2319, "step": 8664 }, { "epoch": 3.6359060402684564, "grad_norm": 0.6328125, "learning_rate": 5.279553214907115e-05, "loss": 0.2813, "step": 8668 }, { "epoch": 3.6375838926174495, "grad_norm": 0.7578125, "learning_rate": 5.267392539077562e-05, "loss": 0.2345, "step": 8672 }, { "epoch": 3.639261744966443, "grad_norm": 0.5859375, "learning_rate": 5.255242901107974e-05, "loss": 0.315, "step": 8676 }, { "epoch": 3.640939597315436, "grad_norm": 0.69140625, "learning_rate": 5.243104314777394e-05, "loss": 0.3092, "step": 8680 }, { "epoch": 3.6426174496644297, "grad_norm": 0.60546875, "learning_rate": 5.230976793852336e-05, "loss": 0.2261, "step": 8684 }, { "epoch": 3.6442953020134228, "grad_norm": 0.474609375, "learning_rate": 5.2188603520867446e-05, "loss": 0.2452, "step": 8688 }, { "epoch": 3.645973154362416, "grad_norm": 0.55859375, "learning_rate": 5.206755003222019e-05, "loss": 0.2585, "step": 8692 }, { "epoch": 3.6476510067114094, "grad_norm": 0.51953125, "learning_rate": 5.194660760986975e-05, "loss": 0.2352, "step": 8696 }, { "epoch": 3.649328859060403, "grad_norm": 0.59765625, "learning_rate": 5.182577639097826e-05, "loss": 0.2449, "step": 8700 }, { "epoch": 3.651006711409396, "grad_norm": 0.66796875, "learning_rate": 5.170505651258178e-05, "loss": 0.2981, "step": 8704 }, { "epoch": 3.652684563758389, "grad_norm": 0.8203125, "learning_rate": 5.158444811159014e-05, "loss": 0.2743, "step": 8708 }, { "epoch": 3.6543624161073827, "grad_norm": 0.50390625, "learning_rate": 5.146395132478659e-05, "loss": 0.315, "step": 8712 }, { "epoch": 3.6560402684563758, "grad_norm": 0.498046875, "learning_rate": 5.1343566288827936e-05, "loss": 0.2124, "step": 8716 }, { "epoch": 3.6577181208053693, "grad_norm": 0.478515625, "learning_rate": 5.1223293140244216e-05, "loss": 0.2312, "step": 8720 }, { "epoch": 3.6593959731543624, "grad_norm": 0.53515625, "learning_rate": 5.110313201543856e-05, "loss": 0.2832, "step": 8724 }, { "epoch": 3.6610738255033555, "grad_norm": 0.578125, "learning_rate": 5.098308305068706e-05, "loss": 0.3261, "step": 8728 }, { "epoch": 3.662751677852349, "grad_norm": 0.60546875, "learning_rate": 5.0863146382138586e-05, "loss": 0.2435, "step": 8732 }, { "epoch": 3.6644295302013425, "grad_norm": 0.59765625, "learning_rate": 5.0743322145814695e-05, "loss": 0.3142, "step": 8736 }, { "epoch": 3.6661073825503356, "grad_norm": 0.52734375, "learning_rate": 5.062361047760938e-05, "loss": 0.2056, "step": 8740 }, { "epoch": 3.6677852348993287, "grad_norm": 0.49609375, "learning_rate": 5.050401151328903e-05, "loss": 0.0876, "step": 8744 }, { "epoch": 3.6694630872483223, "grad_norm": 0.5859375, "learning_rate": 5.038452538849218e-05, "loss": 0.2023, "step": 8748 }, { "epoch": 3.6711409395973154, "grad_norm": 0.75, "learning_rate": 5.026515223872933e-05, "loss": 0.1498, "step": 8752 }, { "epoch": 3.672818791946309, "grad_norm": 0.55078125, "learning_rate": 5.0145892199382945e-05, "loss": 0.2651, "step": 8756 }, { "epoch": 3.674496644295302, "grad_norm": 0.484375, "learning_rate": 5.0026745405707196e-05, "loss": 0.2441, "step": 8760 }, { "epoch": 3.676174496644295, "grad_norm": 0.51953125, "learning_rate": 4.990771199282782e-05, "loss": 0.1827, "step": 8764 }, { "epoch": 3.6778523489932886, "grad_norm": 0.67578125, "learning_rate": 4.9788792095741955e-05, "loss": 0.2472, "step": 8768 }, { "epoch": 3.679530201342282, "grad_norm": 0.75390625, "learning_rate": 4.966998584931796e-05, "loss": 0.248, "step": 8772 }, { "epoch": 3.6812080536912752, "grad_norm": 0.58984375, "learning_rate": 4.955129338829535e-05, "loss": 0.2823, "step": 8776 }, { "epoch": 3.6828859060402683, "grad_norm": 0.388671875, "learning_rate": 4.9432714847284594e-05, "loss": 0.1832, "step": 8780 }, { "epoch": 3.684563758389262, "grad_norm": 0.5859375, "learning_rate": 4.931425036076694e-05, "loss": 0.2501, "step": 8784 }, { "epoch": 3.686241610738255, "grad_norm": 0.390625, "learning_rate": 4.9195900063094327e-05, "loss": 0.1696, "step": 8788 }, { "epoch": 3.6879194630872485, "grad_norm": 0.61328125, "learning_rate": 4.90776640884892e-05, "loss": 0.2966, "step": 8792 }, { "epoch": 3.6895973154362416, "grad_norm": 0.546875, "learning_rate": 4.89595425710442e-05, "loss": 0.1972, "step": 8796 }, { "epoch": 3.6912751677852347, "grad_norm": 0.640625, "learning_rate": 4.8841535644722345e-05, "loss": 0.2488, "step": 8800 }, { "epoch": 3.692953020134228, "grad_norm": 0.41015625, "learning_rate": 4.8723643443356574e-05, "loss": 0.322, "step": 8804 }, { "epoch": 3.6946308724832218, "grad_norm": 0.69921875, "learning_rate": 4.8605866100649885e-05, "loss": 0.3554, "step": 8808 }, { "epoch": 3.696308724832215, "grad_norm": 0.6640625, "learning_rate": 4.848820375017475e-05, "loss": 0.3185, "step": 8812 }, { "epoch": 3.697986577181208, "grad_norm": 0.6796875, "learning_rate": 4.8370656525373446e-05, "loss": 0.3254, "step": 8816 }, { "epoch": 3.6996644295302015, "grad_norm": 0.5234375, "learning_rate": 4.825322455955758e-05, "loss": 0.1807, "step": 8820 }, { "epoch": 3.7013422818791946, "grad_norm": 0.56640625, "learning_rate": 4.8135907985908065e-05, "loss": 0.255, "step": 8824 }, { "epoch": 3.703020134228188, "grad_norm": 0.64453125, "learning_rate": 4.801870693747498e-05, "loss": 0.3176, "step": 8828 }, { "epoch": 3.704697986577181, "grad_norm": 0.62890625, "learning_rate": 4.790162154717738e-05, "loss": 0.2621, "step": 8832 }, { "epoch": 3.7063758389261743, "grad_norm": 0.640625, "learning_rate": 4.778465194780305e-05, "loss": 0.2435, "step": 8836 }, { "epoch": 3.708053691275168, "grad_norm": 0.65625, "learning_rate": 4.766779827200857e-05, "loss": 0.2858, "step": 8840 }, { "epoch": 3.709731543624161, "grad_norm": 0.498046875, "learning_rate": 4.7551060652318995e-05, "loss": 0.2888, "step": 8844 }, { "epoch": 3.7114093959731544, "grad_norm": 0.470703125, "learning_rate": 4.743443922112781e-05, "loss": 0.2553, "step": 8848 }, { "epoch": 3.7130872483221475, "grad_norm": 0.6171875, "learning_rate": 4.7317934110696685e-05, "loss": 0.2797, "step": 8852 }, { "epoch": 3.714765100671141, "grad_norm": 0.734375, "learning_rate": 4.7201545453155425e-05, "loss": 0.1571, "step": 8856 }, { "epoch": 3.716442953020134, "grad_norm": 0.47265625, "learning_rate": 4.708527338050161e-05, "loss": 0.341, "step": 8860 }, { "epoch": 3.7181208053691277, "grad_norm": 0.32421875, "learning_rate": 4.696911802460079e-05, "loss": 0.2375, "step": 8864 }, { "epoch": 3.719798657718121, "grad_norm": 0.5390625, "learning_rate": 4.6853079517186035e-05, "loss": 0.2413, "step": 8868 }, { "epoch": 3.721476510067114, "grad_norm": 0.58203125, "learning_rate": 4.6737157989857935e-05, "loss": 0.2608, "step": 8872 }, { "epoch": 3.7231543624161074, "grad_norm": 0.578125, "learning_rate": 4.662135357408447e-05, "loss": 0.2019, "step": 8876 }, { "epoch": 3.7248322147651005, "grad_norm": 0.68359375, "learning_rate": 4.650566640120061e-05, "loss": 0.3372, "step": 8880 }, { "epoch": 3.726510067114094, "grad_norm": 0.494140625, "learning_rate": 4.6390096602408555e-05, "loss": 0.3253, "step": 8884 }, { "epoch": 3.728187919463087, "grad_norm": 0.55859375, "learning_rate": 4.6274644308777255e-05, "loss": 0.274, "step": 8888 }, { "epoch": 3.7298657718120807, "grad_norm": 0.671875, "learning_rate": 4.615930965124255e-05, "loss": 0.1668, "step": 8892 }, { "epoch": 3.7315436241610738, "grad_norm": 0.55859375, "learning_rate": 4.6044092760606794e-05, "loss": 0.2434, "step": 8896 }, { "epoch": 3.7332214765100673, "grad_norm": 0.6796875, "learning_rate": 4.592899376753865e-05, "loss": 0.171, "step": 8900 }, { "epoch": 3.7348993288590604, "grad_norm": 0.71484375, "learning_rate": 4.581401280257326e-05, "loss": 0.2245, "step": 8904 }, { "epoch": 3.7365771812080535, "grad_norm": 0.57421875, "learning_rate": 4.56991499961118e-05, "loss": 0.235, "step": 8908 }, { "epoch": 3.738255033557047, "grad_norm": 0.6640625, "learning_rate": 4.5584405478421506e-05, "loss": 0.3145, "step": 8912 }, { "epoch": 3.73993288590604, "grad_norm": 0.56640625, "learning_rate": 4.5469779379635456e-05, "loss": 0.191, "step": 8916 }, { "epoch": 3.7416107382550337, "grad_norm": 0.71484375, "learning_rate": 4.5355271829752306e-05, "loss": 0.3116, "step": 8920 }, { "epoch": 3.7432885906040267, "grad_norm": 0.578125, "learning_rate": 4.524088295863643e-05, "loss": 0.2544, "step": 8924 }, { "epoch": 3.7449664429530203, "grad_norm": 0.609375, "learning_rate": 4.5126612896017515e-05, "loss": 0.2299, "step": 8928 }, { "epoch": 3.7466442953020134, "grad_norm": 0.72265625, "learning_rate": 4.501246177149056e-05, "loss": 0.2716, "step": 8932 }, { "epoch": 3.748322147651007, "grad_norm": 0.6328125, "learning_rate": 4.489842971451561e-05, "loss": 0.2521, "step": 8936 }, { "epoch": 3.75, "grad_norm": 0.6015625, "learning_rate": 4.478451685441775e-05, "loss": 0.3264, "step": 8940 }, { "epoch": 3.751677852348993, "grad_norm": 0.734375, "learning_rate": 4.467072332038678e-05, "loss": 0.3955, "step": 8944 }, { "epoch": 3.7533557046979866, "grad_norm": 0.62890625, "learning_rate": 4.455704924147726e-05, "loss": 0.3008, "step": 8948 }, { "epoch": 3.7550335570469797, "grad_norm": 0.52734375, "learning_rate": 4.4443494746608246e-05, "loss": 0.215, "step": 8952 }, { "epoch": 3.7567114093959733, "grad_norm": 0.455078125, "learning_rate": 4.433005996456317e-05, "loss": 0.2396, "step": 8956 }, { "epoch": 3.7583892617449663, "grad_norm": 0.66796875, "learning_rate": 4.421674502398971e-05, "loss": 0.2641, "step": 8960 }, { "epoch": 3.76006711409396, "grad_norm": 0.5703125, "learning_rate": 4.4103550053399666e-05, "loss": 0.2718, "step": 8964 }, { "epoch": 3.761744966442953, "grad_norm": 0.494140625, "learning_rate": 4.399047518116861e-05, "loss": 0.2329, "step": 8968 }, { "epoch": 3.7634228187919465, "grad_norm": 0.5859375, "learning_rate": 4.387752053553606e-05, "loss": 0.2874, "step": 8972 }, { "epoch": 3.7651006711409396, "grad_norm": 0.439453125, "learning_rate": 4.376468624460524e-05, "loss": 0.2607, "step": 8976 }, { "epoch": 3.7667785234899327, "grad_norm": 0.5390625, "learning_rate": 4.3651972436342794e-05, "loss": 0.2358, "step": 8980 }, { "epoch": 3.7684563758389262, "grad_norm": 0.73046875, "learning_rate": 4.353937923857863e-05, "loss": 0.2135, "step": 8984 }, { "epoch": 3.7701342281879193, "grad_norm": 0.671875, "learning_rate": 4.3426906779005995e-05, "loss": 0.2533, "step": 8988 }, { "epoch": 3.771812080536913, "grad_norm": 0.65234375, "learning_rate": 4.331455518518121e-05, "loss": 0.1724, "step": 8992 }, { "epoch": 3.773489932885906, "grad_norm": 0.6640625, "learning_rate": 4.320232458452344e-05, "loss": 0.2765, "step": 8996 }, { "epoch": 3.7751677852348995, "grad_norm": 0.68359375, "learning_rate": 4.309021510431473e-05, "loss": 0.1832, "step": 9000 }, { "epoch": 3.7768456375838926, "grad_norm": 0.55859375, "learning_rate": 4.2978226871699704e-05, "loss": 0.1897, "step": 9004 }, { "epoch": 3.778523489932886, "grad_norm": 0.5625, "learning_rate": 4.286636001368541e-05, "loss": 0.3581, "step": 9008 }, { "epoch": 3.780201342281879, "grad_norm": 0.578125, "learning_rate": 4.2754614657141365e-05, "loss": 0.2819, "step": 9012 }, { "epoch": 3.7818791946308723, "grad_norm": 0.65625, "learning_rate": 4.2642990928799224e-05, "loss": 0.2424, "step": 9016 }, { "epoch": 3.783557046979866, "grad_norm": 0.58203125, "learning_rate": 4.253148895525272e-05, "loss": 0.211, "step": 9020 }, { "epoch": 3.785234899328859, "grad_norm": 0.6015625, "learning_rate": 4.242010886295752e-05, "loss": 0.2735, "step": 9024 }, { "epoch": 3.7869127516778525, "grad_norm": 0.5703125, "learning_rate": 4.230885077823107e-05, "loss": 0.2122, "step": 9028 }, { "epoch": 3.7885906040268456, "grad_norm": 0.359375, "learning_rate": 4.219771482725233e-05, "loss": 0.201, "step": 9032 }, { "epoch": 3.790268456375839, "grad_norm": 0.55078125, "learning_rate": 4.208670113606189e-05, "loss": 0.2324, "step": 9036 }, { "epoch": 3.791946308724832, "grad_norm": 0.55078125, "learning_rate": 4.1975809830561616e-05, "loss": 0.207, "step": 9040 }, { "epoch": 3.7936241610738257, "grad_norm": 0.5234375, "learning_rate": 4.18650410365146e-05, "loss": 0.2025, "step": 9044 }, { "epoch": 3.795302013422819, "grad_norm": 0.5625, "learning_rate": 4.175439487954502e-05, "loss": 0.3443, "step": 9048 }, { "epoch": 3.796979865771812, "grad_norm": 0.51953125, "learning_rate": 4.164387148513787e-05, "loss": 0.2827, "step": 9052 }, { "epoch": 3.7986577181208054, "grad_norm": 0.51171875, "learning_rate": 4.1533470978638925e-05, "loss": 0.1183, "step": 9056 }, { "epoch": 3.8003355704697985, "grad_norm": 0.55859375, "learning_rate": 4.142319348525477e-05, "loss": 0.389, "step": 9060 }, { "epoch": 3.802013422818792, "grad_norm": 0.63671875, "learning_rate": 4.131303913005231e-05, "loss": 0.2546, "step": 9064 }, { "epoch": 3.803691275167785, "grad_norm": 0.57421875, "learning_rate": 4.120300803795889e-05, "loss": 0.2244, "step": 9068 }, { "epoch": 3.8053691275167782, "grad_norm": 0.63671875, "learning_rate": 4.10931003337619e-05, "loss": 0.2222, "step": 9072 }, { "epoch": 3.807046979865772, "grad_norm": 0.66796875, "learning_rate": 4.098331614210899e-05, "loss": 0.284, "step": 9076 }, { "epoch": 3.8087248322147653, "grad_norm": 0.6328125, "learning_rate": 4.087365558750761e-05, "loss": 0.262, "step": 9080 }, { "epoch": 3.8104026845637584, "grad_norm": 0.6484375, "learning_rate": 4.0764118794325056e-05, "loss": 0.3396, "step": 9084 }, { "epoch": 3.8120805369127515, "grad_norm": 0.75, "learning_rate": 4.06547058867883e-05, "loss": 0.2449, "step": 9088 }, { "epoch": 3.813758389261745, "grad_norm": 0.55859375, "learning_rate": 4.054541698898364e-05, "loss": 0.3491, "step": 9092 }, { "epoch": 3.815436241610738, "grad_norm": 0.71484375, "learning_rate": 4.043625222485688e-05, "loss": 0.3304, "step": 9096 }, { "epoch": 3.8171140939597317, "grad_norm": 0.51171875, "learning_rate": 4.032721171821306e-05, "loss": 0.1501, "step": 9100 }, { "epoch": 3.8187919463087248, "grad_norm": 0.51953125, "learning_rate": 4.0218295592716205e-05, "loss": 0.3264, "step": 9104 }, { "epoch": 3.820469798657718, "grad_norm": 0.6484375, "learning_rate": 4.010950397188934e-05, "loss": 0.4183, "step": 9108 }, { "epoch": 3.8221476510067114, "grad_norm": 0.65625, "learning_rate": 4.000083697911432e-05, "loss": 0.2829, "step": 9112 }, { "epoch": 3.823825503355705, "grad_norm": 0.5703125, "learning_rate": 3.9892294737631494e-05, "loss": 0.2997, "step": 9116 }, { "epoch": 3.825503355704698, "grad_norm": 0.5625, "learning_rate": 3.978387737053994e-05, "loss": 0.3864, "step": 9120 }, { "epoch": 3.827181208053691, "grad_norm": 0.5703125, "learning_rate": 3.9675585000796945e-05, "loss": 0.2563, "step": 9124 }, { "epoch": 3.8288590604026846, "grad_norm": 0.6015625, "learning_rate": 3.956741775121817e-05, "loss": 0.1947, "step": 9128 }, { "epoch": 3.8305369127516777, "grad_norm": 0.6640625, "learning_rate": 3.945937574447729e-05, "loss": 0.2986, "step": 9132 }, { "epoch": 3.8322147651006713, "grad_norm": 0.609375, "learning_rate": 3.9351459103106e-05, "loss": 0.2414, "step": 9136 }, { "epoch": 3.8338926174496644, "grad_norm": 0.50390625, "learning_rate": 3.924366794949366e-05, "loss": 0.2101, "step": 9140 }, { "epoch": 3.8355704697986575, "grad_norm": 0.55078125, "learning_rate": 3.913600240588756e-05, "loss": 0.3651, "step": 9144 }, { "epoch": 3.837248322147651, "grad_norm": 0.625, "learning_rate": 3.902846259439236e-05, "loss": 0.29, "step": 9148 }, { "epoch": 3.8389261744966445, "grad_norm": 0.59765625, "learning_rate": 3.8921048636970233e-05, "loss": 0.407, "step": 9152 }, { "epoch": 3.8406040268456376, "grad_norm": 0.578125, "learning_rate": 3.881376065544047e-05, "loss": 0.2058, "step": 9156 }, { "epoch": 3.8422818791946307, "grad_norm": 0.494140625, "learning_rate": 3.870659877147962e-05, "loss": 0.2161, "step": 9160 }, { "epoch": 3.8439597315436242, "grad_norm": 0.50390625, "learning_rate": 3.859956310662117e-05, "loss": 0.1922, "step": 9164 }, { "epoch": 3.8456375838926173, "grad_norm": 0.62890625, "learning_rate": 3.84926537822555e-05, "loss": 0.3143, "step": 9168 }, { "epoch": 3.847315436241611, "grad_norm": 0.5625, "learning_rate": 3.838587091962966e-05, "loss": 0.2298, "step": 9172 }, { "epoch": 3.848993288590604, "grad_norm": 0.5390625, "learning_rate": 3.827921463984738e-05, "loss": 0.2497, "step": 9176 }, { "epoch": 3.850671140939597, "grad_norm": 0.5234375, "learning_rate": 3.8172685063868616e-05, "loss": 0.1326, "step": 9180 }, { "epoch": 3.8523489932885906, "grad_norm": 0.5390625, "learning_rate": 3.806628231250984e-05, "loss": 0.2847, "step": 9184 }, { "epoch": 3.854026845637584, "grad_norm": 0.56640625, "learning_rate": 3.796000650644359e-05, "loss": 0.2444, "step": 9188 }, { "epoch": 3.8557046979865772, "grad_norm": 0.400390625, "learning_rate": 3.7853857766198495e-05, "loss": 0.2262, "step": 9192 }, { "epoch": 3.8573825503355703, "grad_norm": 0.55859375, "learning_rate": 3.7747836212159036e-05, "loss": 0.2356, "step": 9196 }, { "epoch": 3.859060402684564, "grad_norm": 0.57421875, "learning_rate": 3.7641941964565384e-05, "loss": 0.2482, "step": 9200 }, { "epoch": 3.860738255033557, "grad_norm": 0.66796875, "learning_rate": 3.7536175143513456e-05, "loss": 0.2939, "step": 9204 }, { "epoch": 3.8624161073825505, "grad_norm": 0.69140625, "learning_rate": 3.7430535868954596e-05, "loss": 0.3481, "step": 9208 }, { "epoch": 3.8640939597315436, "grad_norm": 0.640625, "learning_rate": 3.732502426069548e-05, "loss": 0.2803, "step": 9212 }, { "epoch": 3.8657718120805367, "grad_norm": 0.578125, "learning_rate": 3.721964043839804e-05, "loss": 0.3349, "step": 9216 }, { "epoch": 3.86744966442953, "grad_norm": 0.62109375, "learning_rate": 3.711438452157923e-05, "loss": 0.2769, "step": 9220 }, { "epoch": 3.8691275167785237, "grad_norm": 0.39453125, "learning_rate": 3.700925662961097e-05, "loss": 0.1929, "step": 9224 }, { "epoch": 3.870805369127517, "grad_norm": 0.58984375, "learning_rate": 3.6904256881720035e-05, "loss": 0.2825, "step": 9228 }, { "epoch": 3.87248322147651, "grad_norm": 0.484375, "learning_rate": 3.679938539698777e-05, "loss": 0.2163, "step": 9232 }, { "epoch": 3.8741610738255035, "grad_norm": 0.6171875, "learning_rate": 3.6694642294350184e-05, "loss": 0.288, "step": 9236 }, { "epoch": 3.8758389261744965, "grad_norm": 0.6171875, "learning_rate": 3.659002769259753e-05, "loss": 0.3024, "step": 9240 }, { "epoch": 3.87751677852349, "grad_norm": 0.68359375, "learning_rate": 3.6485541710374434e-05, "loss": 0.2644, "step": 9244 }, { "epoch": 3.879194630872483, "grad_norm": 0.640625, "learning_rate": 3.6381184466179625e-05, "loss": 0.2139, "step": 9248 }, { "epoch": 3.8808724832214763, "grad_norm": 0.62890625, "learning_rate": 3.627695607836583e-05, "loss": 0.3083, "step": 9252 }, { "epoch": 3.88255033557047, "grad_norm": 0.4375, "learning_rate": 3.617285666513966e-05, "loss": 0.25, "step": 9256 }, { "epoch": 3.8842281879194633, "grad_norm": 0.65625, "learning_rate": 3.606888634456149e-05, "loss": 0.2541, "step": 9260 }, { "epoch": 3.8859060402684564, "grad_norm": 0.84375, "learning_rate": 3.596504523454509e-05, "loss": 0.2571, "step": 9264 }, { "epoch": 3.8875838926174495, "grad_norm": 0.66796875, "learning_rate": 3.5861333452857944e-05, "loss": 0.2141, "step": 9268 }, { "epoch": 3.889261744966443, "grad_norm": 0.51953125, "learning_rate": 3.57577511171207e-05, "loss": 0.1935, "step": 9272 }, { "epoch": 3.890939597315436, "grad_norm": 0.64453125, "learning_rate": 3.565429834480729e-05, "loss": 0.2559, "step": 9276 }, { "epoch": 3.8926174496644297, "grad_norm": 0.75390625, "learning_rate": 3.555097525324464e-05, "loss": 0.359, "step": 9280 }, { "epoch": 3.8942953020134228, "grad_norm": 0.52734375, "learning_rate": 3.544778195961272e-05, "loss": 0.1855, "step": 9284 }, { "epoch": 3.895973154362416, "grad_norm": 0.46484375, "learning_rate": 3.5344718580944076e-05, "loss": 0.1742, "step": 9288 }, { "epoch": 3.8976510067114094, "grad_norm": 0.498046875, "learning_rate": 3.5241785234124126e-05, "loss": 0.3219, "step": 9292 }, { "epoch": 3.899328859060403, "grad_norm": 0.6328125, "learning_rate": 3.513898203589073e-05, "loss": 0.2866, "step": 9296 }, { "epoch": 3.901006711409396, "grad_norm": 0.55859375, "learning_rate": 3.503630910283416e-05, "loss": 0.2972, "step": 9300 }, { "epoch": 3.902684563758389, "grad_norm": 0.7578125, "learning_rate": 3.493376655139692e-05, "loss": 0.2058, "step": 9304 }, { "epoch": 3.9043624161073827, "grad_norm": 0.66796875, "learning_rate": 3.48313544978737e-05, "loss": 0.3502, "step": 9308 }, { "epoch": 3.9060402684563758, "grad_norm": 0.52734375, "learning_rate": 3.472907305841118e-05, "loss": 0.1631, "step": 9312 }, { "epoch": 3.9077181208053693, "grad_norm": 0.60546875, "learning_rate": 3.4626922349007856e-05, "loss": 0.2841, "step": 9316 }, { "epoch": 3.9093959731543624, "grad_norm": 0.5859375, "learning_rate": 3.452490248551404e-05, "loss": 0.2514, "step": 9320 }, { "epoch": 3.9110738255033555, "grad_norm": 0.55078125, "learning_rate": 3.442301358363163e-05, "loss": 0.3439, "step": 9324 }, { "epoch": 3.912751677852349, "grad_norm": 0.625, "learning_rate": 3.4321255758913904e-05, "loss": 0.3041, "step": 9328 }, { "epoch": 3.9144295302013425, "grad_norm": 0.64453125, "learning_rate": 3.421962912676559e-05, "loss": 0.2591, "step": 9332 }, { "epoch": 3.9161073825503356, "grad_norm": 0.59375, "learning_rate": 3.41181338024426e-05, "loss": 0.2491, "step": 9336 }, { "epoch": 3.9177852348993287, "grad_norm": 0.6796875, "learning_rate": 3.401676990105195e-05, "loss": 0.3316, "step": 9340 }, { "epoch": 3.9194630872483223, "grad_norm": 0.5859375, "learning_rate": 3.391553753755155e-05, "loss": 0.2862, "step": 9344 }, { "epoch": 3.9211409395973154, "grad_norm": 0.462890625, "learning_rate": 3.381443682675023e-05, "loss": 0.1956, "step": 9348 }, { "epoch": 3.922818791946309, "grad_norm": 0.671875, "learning_rate": 3.371346788330737e-05, "loss": 0.2702, "step": 9352 }, { "epoch": 3.924496644295302, "grad_norm": 0.703125, "learning_rate": 3.361263082173301e-05, "loss": 0.2516, "step": 9356 }, { "epoch": 3.926174496644295, "grad_norm": 0.62109375, "learning_rate": 3.351192575638761e-05, "loss": 0.1959, "step": 9360 }, { "epoch": 3.9278523489932886, "grad_norm": 0.474609375, "learning_rate": 3.341135280148192e-05, "loss": 0.1469, "step": 9364 }, { "epoch": 3.929530201342282, "grad_norm": 0.81640625, "learning_rate": 3.331091207107689e-05, "loss": 0.3163, "step": 9368 }, { "epoch": 3.9312080536912752, "grad_norm": 0.7109375, "learning_rate": 3.321060367908344e-05, "loss": 0.2857, "step": 9372 }, { "epoch": 3.9328859060402683, "grad_norm": 0.57421875, "learning_rate": 3.311042773926246e-05, "loss": 0.2177, "step": 9376 }, { "epoch": 3.934563758389262, "grad_norm": 0.63671875, "learning_rate": 3.301038436522461e-05, "loss": 0.238, "step": 9380 }, { "epoch": 3.936241610738255, "grad_norm": 0.52734375, "learning_rate": 3.291047367043023e-05, "loss": 0.2474, "step": 9384 }, { "epoch": 3.9379194630872485, "grad_norm": 0.439453125, "learning_rate": 3.281069576818916e-05, "loss": 0.2176, "step": 9388 }, { "epoch": 3.9395973154362416, "grad_norm": 0.5, "learning_rate": 3.271105077166066e-05, "loss": 0.1952, "step": 9392 }, { "epoch": 3.9412751677852347, "grad_norm": 0.6328125, "learning_rate": 3.261153879385321e-05, "loss": 0.1896, "step": 9396 }, { "epoch": 3.942953020134228, "grad_norm": 0.62109375, "learning_rate": 3.251215994762449e-05, "loss": 0.3449, "step": 9400 }, { "epoch": 3.9446308724832218, "grad_norm": 0.609375, "learning_rate": 3.241291434568116e-05, "loss": 0.2764, "step": 9404 }, { "epoch": 3.946308724832215, "grad_norm": 0.62890625, "learning_rate": 3.231380210057883e-05, "loss": 0.2983, "step": 9408 }, { "epoch": 3.947986577181208, "grad_norm": 0.443359375, "learning_rate": 3.221482332472172e-05, "loss": 0.1129, "step": 9412 }, { "epoch": 3.9496644295302015, "grad_norm": 0.73828125, "learning_rate": 3.2115978130362845e-05, "loss": 0.3243, "step": 9416 }, { "epoch": 3.9513422818791946, "grad_norm": 0.41015625, "learning_rate": 3.2017266629603625e-05, "loss": 0.1212, "step": 9420 }, { "epoch": 3.953020134228188, "grad_norm": 0.63671875, "learning_rate": 3.1918688934393896e-05, "loss": 0.2704, "step": 9424 }, { "epoch": 3.954697986577181, "grad_norm": 0.58203125, "learning_rate": 3.182024515653175e-05, "loss": 0.1658, "step": 9428 }, { "epoch": 3.9563758389261743, "grad_norm": 0.78125, "learning_rate": 3.172193540766342e-05, "loss": 0.3355, "step": 9432 }, { "epoch": 3.958053691275168, "grad_norm": 0.5, "learning_rate": 3.1623759799283005e-05, "loss": 0.1919, "step": 9436 }, { "epoch": 3.959731543624161, "grad_norm": 0.67578125, "learning_rate": 3.152571844273261e-05, "loss": 0.1844, "step": 9440 }, { "epoch": 3.9614093959731544, "grad_norm": 0.35546875, "learning_rate": 3.142781144920208e-05, "loss": 0.2021, "step": 9444 }, { "epoch": 3.9630872483221475, "grad_norm": 0.671875, "learning_rate": 3.13300389297288e-05, "loss": 0.2814, "step": 9448 }, { "epoch": 3.964765100671141, "grad_norm": 0.71875, "learning_rate": 3.1232400995197715e-05, "loss": 0.2314, "step": 9452 }, { "epoch": 3.966442953020134, "grad_norm": 0.65625, "learning_rate": 3.113489775634113e-05, "loss": 0.2645, "step": 9456 }, { "epoch": 3.9681208053691277, "grad_norm": 0.58203125, "learning_rate": 3.1037529323738494e-05, "loss": 0.3189, "step": 9460 }, { "epoch": 3.969798657718121, "grad_norm": 0.65625, "learning_rate": 3.094029580781651e-05, "loss": 0.2994, "step": 9464 }, { "epoch": 3.971476510067114, "grad_norm": 0.56640625, "learning_rate": 3.084319731884876e-05, "loss": 0.2464, "step": 9468 }, { "epoch": 3.9731543624161074, "grad_norm": 0.59765625, "learning_rate": 3.0746233966955774e-05, "loss": 0.1847, "step": 9472 }, { "epoch": 3.9748322147651005, "grad_norm": 0.546875, "learning_rate": 3.064940586210477e-05, "loss": 0.2501, "step": 9476 }, { "epoch": 3.976510067114094, "grad_norm": 0.62890625, "learning_rate": 3.055271311410959e-05, "loss": 0.2849, "step": 9480 }, { "epoch": 3.978187919463087, "grad_norm": 0.5859375, "learning_rate": 3.045615583263058e-05, "loss": 0.3469, "step": 9484 }, { "epoch": 3.9798657718120807, "grad_norm": 0.6171875, "learning_rate": 3.0359734127174428e-05, "loss": 0.2502, "step": 9488 }, { "epoch": 3.9815436241610738, "grad_norm": 0.357421875, "learning_rate": 3.0263448107094096e-05, "loss": 0.2323, "step": 9492 }, { "epoch": 3.9832214765100673, "grad_norm": 0.53515625, "learning_rate": 3.016729788158868e-05, "loss": 0.2161, "step": 9496 }, { "epoch": 3.9848993288590604, "grad_norm": 0.55859375, "learning_rate": 3.0071283559703157e-05, "loss": 0.2361, "step": 9500 }, { "epoch": 3.9865771812080535, "grad_norm": 0.86328125, "learning_rate": 2.9975405250328487e-05, "loss": 0.2957, "step": 9504 }, { "epoch": 3.988255033557047, "grad_norm": 0.65234375, "learning_rate": 2.9879663062201343e-05, "loss": 0.2131, "step": 9508 }, { "epoch": 3.98993288590604, "grad_norm": 0.48046875, "learning_rate": 2.978405710390403e-05, "loss": 0.1498, "step": 9512 }, { "epoch": 3.9916107382550337, "grad_norm": 0.4921875, "learning_rate": 2.968858748386437e-05, "loss": 0.1858, "step": 9516 }, { "epoch": 3.9932885906040267, "grad_norm": 0.6015625, "learning_rate": 2.9593254310355478e-05, "loss": 0.281, "step": 9520 }, { "epoch": 3.9949664429530203, "grad_norm": 0.54296875, "learning_rate": 2.9498057691495798e-05, "loss": 0.2316, "step": 9524 }, { "epoch": 3.9966442953020134, "grad_norm": 0.50390625, "learning_rate": 2.9402997735248906e-05, "loss": 0.2764, "step": 9528 }, { "epoch": 3.998322147651007, "grad_norm": 0.62109375, "learning_rate": 2.930807454942337e-05, "loss": 0.1535, "step": 9532 }, { "epoch": 4.0, "grad_norm": 0.6328125, "learning_rate": 2.9213288241672643e-05, "loss": 0.2255, "step": 9536 }, { "epoch": 4.001677852348993, "grad_norm": 0.59765625, "learning_rate": 2.9118638919494986e-05, "loss": 0.2377, "step": 9540 }, { "epoch": 4.003355704697986, "grad_norm": 0.53125, "learning_rate": 2.902412669023318e-05, "loss": 0.2731, "step": 9544 }, { "epoch": 4.00503355704698, "grad_norm": 0.53515625, "learning_rate": 2.892975166107465e-05, "loss": 0.322, "step": 9548 }, { "epoch": 4.006711409395973, "grad_norm": 0.396484375, "learning_rate": 2.883551393905115e-05, "loss": 0.222, "step": 9552 }, { "epoch": 4.008389261744966, "grad_norm": 0.431640625, "learning_rate": 2.8741413631038873e-05, "loss": 0.194, "step": 9556 }, { "epoch": 4.010067114093959, "grad_norm": 0.546875, "learning_rate": 2.8647450843757897e-05, "loss": 0.2744, "step": 9560 }, { "epoch": 4.011744966442953, "grad_norm": 0.4609375, "learning_rate": 2.855362568377253e-05, "loss": 0.2434, "step": 9564 }, { "epoch": 4.0134228187919465, "grad_norm": 0.67578125, "learning_rate": 2.8459938257490937e-05, "loss": 0.2043, "step": 9568 }, { "epoch": 4.01510067114094, "grad_norm": 0.490234375, "learning_rate": 2.8366388671165113e-05, "loss": 0.2875, "step": 9572 }, { "epoch": 4.016778523489933, "grad_norm": 0.58984375, "learning_rate": 2.827297703089067e-05, "loss": 0.2693, "step": 9576 }, { "epoch": 4.018456375838926, "grad_norm": 0.56640625, "learning_rate": 2.817970344260685e-05, "loss": 0.2808, "step": 9580 }, { "epoch": 4.02013422818792, "grad_norm": 0.54296875, "learning_rate": 2.8086568012096215e-05, "loss": 0.3269, "step": 9584 }, { "epoch": 4.021812080536913, "grad_norm": 0.60546875, "learning_rate": 2.7993570844984747e-05, "loss": 0.3156, "step": 9588 }, { "epoch": 4.023489932885906, "grad_norm": 0.46875, "learning_rate": 2.7900712046741556e-05, "loss": 0.2216, "step": 9592 }, { "epoch": 4.025167785234899, "grad_norm": 0.5625, "learning_rate": 2.780799172267889e-05, "loss": 0.2234, "step": 9596 }, { "epoch": 4.026845637583893, "grad_norm": 0.380859375, "learning_rate": 2.771540997795188e-05, "loss": 0.1192, "step": 9600 }, { "epoch": 4.028523489932886, "grad_norm": 0.58984375, "learning_rate": 2.7622966917558588e-05, "loss": 0.1728, "step": 9604 }, { "epoch": 4.030201342281879, "grad_norm": 0.32421875, "learning_rate": 2.7530662646339664e-05, "loss": 0.1775, "step": 9608 }, { "epoch": 4.031879194630872, "grad_norm": 0.365234375, "learning_rate": 2.7438497268978436e-05, "loss": 0.1607, "step": 9612 }, { "epoch": 4.033557046979865, "grad_norm": 0.5078125, "learning_rate": 2.7346470890000717e-05, "loss": 0.2333, "step": 9616 }, { "epoch": 4.035234899328859, "grad_norm": 0.59375, "learning_rate": 2.7254583613774644e-05, "loss": 0.2772, "step": 9620 }, { "epoch": 4.0369127516778525, "grad_norm": 0.6171875, "learning_rate": 2.716283554451068e-05, "loss": 0.2263, "step": 9624 }, { "epoch": 4.0385906040268456, "grad_norm": 0.53515625, "learning_rate": 2.707122678626124e-05, "loss": 0.259, "step": 9628 }, { "epoch": 4.040268456375839, "grad_norm": 0.3046875, "learning_rate": 2.6979757442920902e-05, "loss": 0.2037, "step": 9632 }, { "epoch": 4.041946308724833, "grad_norm": 0.451171875, "learning_rate": 2.6888427618226067e-05, "loss": 0.2221, "step": 9636 }, { "epoch": 4.043624161073826, "grad_norm": 0.34375, "learning_rate": 2.6797237415754947e-05, "loss": 0.2372, "step": 9640 }, { "epoch": 4.045302013422819, "grad_norm": 0.51171875, "learning_rate": 2.6706186938927416e-05, "loss": 0.1584, "step": 9644 }, { "epoch": 4.046979865771812, "grad_norm": 0.408203125, "learning_rate": 2.6615276291004756e-05, "loss": 0.107, "step": 9648 }, { "epoch": 4.048657718120805, "grad_norm": 0.42578125, "learning_rate": 2.652450557508981e-05, "loss": 0.1561, "step": 9652 }, { "epoch": 4.050335570469799, "grad_norm": 0.390625, "learning_rate": 2.643387489412665e-05, "loss": 0.1933, "step": 9656 }, { "epoch": 4.052013422818792, "grad_norm": 0.5625, "learning_rate": 2.634338435090057e-05, "loss": 0.2042, "step": 9660 }, { "epoch": 4.053691275167785, "grad_norm": 0.609375, "learning_rate": 2.625303404803793e-05, "loss": 0.2766, "step": 9664 }, { "epoch": 4.055369127516778, "grad_norm": 0.43359375, "learning_rate": 2.616282408800603e-05, "loss": 0.2178, "step": 9668 }, { "epoch": 4.057046979865772, "grad_norm": 0.314453125, "learning_rate": 2.6072754573112948e-05, "loss": 0.1839, "step": 9672 }, { "epoch": 4.058724832214765, "grad_norm": 0.625, "learning_rate": 2.598282560550759e-05, "loss": 0.2196, "step": 9676 }, { "epoch": 4.060402684563758, "grad_norm": 0.60546875, "learning_rate": 2.589303728717939e-05, "loss": 0.1995, "step": 9680 }, { "epoch": 4.0620805369127515, "grad_norm": 0.58203125, "learning_rate": 2.5803389719958294e-05, "loss": 0.2084, "step": 9684 }, { "epoch": 4.063758389261745, "grad_norm": 0.423828125, "learning_rate": 2.5713883005514674e-05, "loss": 0.1425, "step": 9688 }, { "epoch": 4.065436241610739, "grad_norm": 0.78515625, "learning_rate": 2.5624517245359012e-05, "loss": 0.2552, "step": 9692 }, { "epoch": 4.067114093959732, "grad_norm": 0.44140625, "learning_rate": 2.5535292540842067e-05, "loss": 0.2471, "step": 9696 }, { "epoch": 4.068791946308725, "grad_norm": 0.470703125, "learning_rate": 2.54462089931546e-05, "loss": 0.3329, "step": 9700 }, { "epoch": 4.070469798657718, "grad_norm": 0.59375, "learning_rate": 2.5357266703327257e-05, "loss": 0.2701, "step": 9704 }, { "epoch": 4.072147651006712, "grad_norm": 0.6875, "learning_rate": 2.5268465772230483e-05, "loss": 0.3433, "step": 9708 }, { "epoch": 4.073825503355705, "grad_norm": 0.392578125, "learning_rate": 2.5179806300574458e-05, "loss": 0.1058, "step": 9712 }, { "epoch": 4.075503355704698, "grad_norm": 0.50390625, "learning_rate": 2.5091288388908827e-05, "loss": 0.2305, "step": 9716 }, { "epoch": 4.077181208053691, "grad_norm": 0.5390625, "learning_rate": 2.500291213762274e-05, "loss": 0.1496, "step": 9720 }, { "epoch": 4.078859060402684, "grad_norm": 0.498046875, "learning_rate": 2.4914677646944786e-05, "loss": 0.1478, "step": 9724 }, { "epoch": 4.080536912751678, "grad_norm": 0.49609375, "learning_rate": 2.4826585016942674e-05, "loss": 0.1972, "step": 9728 }, { "epoch": 4.082214765100671, "grad_norm": 0.466796875, "learning_rate": 2.4738634347523202e-05, "loss": 0.2396, "step": 9732 }, { "epoch": 4.083892617449664, "grad_norm": 0.494140625, "learning_rate": 2.4650825738432244e-05, "loss": 0.2398, "step": 9736 }, { "epoch": 4.0855704697986575, "grad_norm": 0.53515625, "learning_rate": 2.456315928925453e-05, "loss": 0.2318, "step": 9740 }, { "epoch": 4.087248322147651, "grad_norm": 0.41015625, "learning_rate": 2.4475635099413576e-05, "loss": 0.2839, "step": 9744 }, { "epoch": 4.0889261744966445, "grad_norm": 0.455078125, "learning_rate": 2.438825326817155e-05, "loss": 0.209, "step": 9748 }, { "epoch": 4.090604026845638, "grad_norm": 0.55859375, "learning_rate": 2.430101389462922e-05, "loss": 0.1936, "step": 9752 }, { "epoch": 4.092281879194631, "grad_norm": 0.578125, "learning_rate": 2.421391707772566e-05, "loss": 0.2817, "step": 9756 }, { "epoch": 4.093959731543624, "grad_norm": 0.5390625, "learning_rate": 2.4126962916238372e-05, "loss": 0.2392, "step": 9760 }, { "epoch": 4.095637583892618, "grad_norm": 0.72265625, "learning_rate": 2.40401515087831e-05, "loss": 0.2064, "step": 9764 }, { "epoch": 4.097315436241611, "grad_norm": 0.330078125, "learning_rate": 2.3953482953813584e-05, "loss": 0.1917, "step": 9768 }, { "epoch": 4.098993288590604, "grad_norm": 0.640625, "learning_rate": 2.3866957349621668e-05, "loss": 0.2036, "step": 9772 }, { "epoch": 4.100671140939597, "grad_norm": 0.640625, "learning_rate": 2.3780574794337004e-05, "loss": 0.3572, "step": 9776 }, { "epoch": 4.102348993288591, "grad_norm": 0.51171875, "learning_rate": 2.3694335385926978e-05, "loss": 0.1487, "step": 9780 }, { "epoch": 4.104026845637584, "grad_norm": 0.58984375, "learning_rate": 2.3608239222196683e-05, "loss": 0.2675, "step": 9784 }, { "epoch": 4.105704697986577, "grad_norm": 0.61328125, "learning_rate": 2.3522286400788803e-05, "loss": 0.1675, "step": 9788 }, { "epoch": 4.10738255033557, "grad_norm": 0.48046875, "learning_rate": 2.3436477019183343e-05, "loss": 0.1632, "step": 9792 }, { "epoch": 4.109060402684563, "grad_norm": 0.470703125, "learning_rate": 2.335081117469777e-05, "loss": 0.2099, "step": 9796 }, { "epoch": 4.110738255033557, "grad_norm": 0.474609375, "learning_rate": 2.3265288964486604e-05, "loss": 0.1509, "step": 9800 }, { "epoch": 4.1124161073825505, "grad_norm": 0.380859375, "learning_rate": 2.317991048554153e-05, "loss": 0.1654, "step": 9804 }, { "epoch": 4.114093959731544, "grad_norm": 0.53125, "learning_rate": 2.309467583469134e-05, "loss": 0.1517, "step": 9808 }, { "epoch": 4.115771812080537, "grad_norm": 0.48828125, "learning_rate": 2.3009585108601557e-05, "loss": 0.2333, "step": 9812 }, { "epoch": 4.117449664429531, "grad_norm": 0.392578125, "learning_rate": 2.2924638403774593e-05, "loss": 0.2109, "step": 9816 }, { "epoch": 4.119127516778524, "grad_norm": 0.546875, "learning_rate": 2.2839835816549363e-05, "loss": 0.2558, "step": 9820 }, { "epoch": 4.120805369127517, "grad_norm": 0.61328125, "learning_rate": 2.27551774431015e-05, "loss": 0.1794, "step": 9824 }, { "epoch": 4.12248322147651, "grad_norm": 0.5390625, "learning_rate": 2.267066337944301e-05, "loss": 0.3044, "step": 9828 }, { "epoch": 4.124161073825503, "grad_norm": 0.515625, "learning_rate": 2.258629372142225e-05, "loss": 0.237, "step": 9832 }, { "epoch": 4.125838926174497, "grad_norm": 0.60546875, "learning_rate": 2.250206856472383e-05, "loss": 0.2834, "step": 9836 }, { "epoch": 4.12751677852349, "grad_norm": 0.546875, "learning_rate": 2.241798800486839e-05, "loss": 0.241, "step": 9840 }, { "epoch": 4.129194630872483, "grad_norm": 0.5546875, "learning_rate": 2.2334052137212675e-05, "loss": 0.2775, "step": 9844 }, { "epoch": 4.130872483221476, "grad_norm": 0.5703125, "learning_rate": 2.2250261056949304e-05, "loss": 0.2328, "step": 9848 }, { "epoch": 4.132550335570469, "grad_norm": 0.625, "learning_rate": 2.2166614859106657e-05, "loss": 0.2739, "step": 9852 }, { "epoch": 4.134228187919463, "grad_norm": 0.71484375, "learning_rate": 2.2083113638548873e-05, "loss": 0.259, "step": 9856 }, { "epoch": 4.135906040268456, "grad_norm": 0.486328125, "learning_rate": 2.199975748997563e-05, "loss": 0.234, "step": 9860 }, { "epoch": 4.1375838926174495, "grad_norm": 0.7109375, "learning_rate": 2.1916546507922038e-05, "loss": 0.2287, "step": 9864 }, { "epoch": 4.139261744966443, "grad_norm": 0.68359375, "learning_rate": 2.1833480786758617e-05, "loss": 0.2968, "step": 9868 }, { "epoch": 4.140939597315437, "grad_norm": 0.53125, "learning_rate": 2.1750560420691153e-05, "loss": 0.2238, "step": 9872 }, { "epoch": 4.14261744966443, "grad_norm": 0.66015625, "learning_rate": 2.1667785503760575e-05, "loss": 0.2565, "step": 9876 }, { "epoch": 4.144295302013423, "grad_norm": 0.53515625, "learning_rate": 2.1585156129842823e-05, "loss": 0.2324, "step": 9880 }, { "epoch": 4.145973154362416, "grad_norm": 0.50390625, "learning_rate": 2.1502672392648885e-05, "loss": 0.2405, "step": 9884 }, { "epoch": 4.14765100671141, "grad_norm": 0.53515625, "learning_rate": 2.1420334385724346e-05, "loss": 0.2754, "step": 9888 }, { "epoch": 4.149328859060403, "grad_norm": 0.431640625, "learning_rate": 2.13381422024498e-05, "loss": 0.2262, "step": 9892 }, { "epoch": 4.151006711409396, "grad_norm": 0.57421875, "learning_rate": 2.1256095936040292e-05, "loss": 0.3278, "step": 9896 }, { "epoch": 4.152684563758389, "grad_norm": 0.41015625, "learning_rate": 2.117419567954544e-05, "loss": 0.2245, "step": 9900 }, { "epoch": 4.154362416107382, "grad_norm": 0.546875, "learning_rate": 2.1092441525849184e-05, "loss": 0.2065, "step": 9904 }, { "epoch": 4.156040268456376, "grad_norm": 0.58984375, "learning_rate": 2.101083356766989e-05, "loss": 0.1967, "step": 9908 }, { "epoch": 4.157718120805369, "grad_norm": 0.63671875, "learning_rate": 2.0929371897560032e-05, "loss": 0.2915, "step": 9912 }, { "epoch": 4.159395973154362, "grad_norm": 0.546875, "learning_rate": 2.0848056607906233e-05, "loss": 0.2086, "step": 9916 }, { "epoch": 4.1610738255033555, "grad_norm": 0.4921875, "learning_rate": 2.076688779092907e-05, "loss": 0.2072, "step": 9920 }, { "epoch": 4.162751677852349, "grad_norm": 0.43359375, "learning_rate": 2.068586553868306e-05, "loss": 0.2313, "step": 9924 }, { "epoch": 4.1644295302013425, "grad_norm": 0.60546875, "learning_rate": 2.0604989943056366e-05, "loss": 0.2545, "step": 9928 }, { "epoch": 4.166107382550336, "grad_norm": 0.640625, "learning_rate": 2.052426109577095e-05, "loss": 0.2625, "step": 9932 }, { "epoch": 4.167785234899329, "grad_norm": 0.5546875, "learning_rate": 2.0443679088382326e-05, "loss": 0.199, "step": 9936 }, { "epoch": 4.169463087248322, "grad_norm": 0.59765625, "learning_rate": 2.0363244012279456e-05, "loss": 0.327, "step": 9940 }, { "epoch": 4.171140939597316, "grad_norm": 0.5078125, "learning_rate": 2.028295595868469e-05, "loss": 0.2311, "step": 9944 }, { "epoch": 4.172818791946309, "grad_norm": 0.51953125, "learning_rate": 2.0202815018653536e-05, "loss": 0.1123, "step": 9948 }, { "epoch": 4.174496644295302, "grad_norm": 0.412109375, "learning_rate": 2.0122821283074807e-05, "loss": 0.1676, "step": 9952 }, { "epoch": 4.176174496644295, "grad_norm": 0.6328125, "learning_rate": 2.0042974842670273e-05, "loss": 0.2799, "step": 9956 }, { "epoch": 4.177852348993288, "grad_norm": 0.404296875, "learning_rate": 1.9963275787994708e-05, "loss": 0.2136, "step": 9960 }, { "epoch": 4.179530201342282, "grad_norm": 0.578125, "learning_rate": 1.9883724209435682e-05, "loss": 0.2036, "step": 9964 }, { "epoch": 4.181208053691275, "grad_norm": 0.64453125, "learning_rate": 1.9804320197213554e-05, "loss": 0.2438, "step": 9968 }, { "epoch": 4.182885906040268, "grad_norm": 0.50390625, "learning_rate": 1.9725063841381307e-05, "loss": 0.288, "step": 9972 }, { "epoch": 4.184563758389261, "grad_norm": 0.53515625, "learning_rate": 1.9645955231824473e-05, "loss": 0.2096, "step": 9976 }, { "epoch": 4.186241610738255, "grad_norm": 0.494140625, "learning_rate": 1.9566994458261004e-05, "loss": 0.1169, "step": 9980 }, { "epoch": 4.1879194630872485, "grad_norm": 0.5859375, "learning_rate": 1.9488181610241233e-05, "loss": 0.2262, "step": 9984 }, { "epoch": 4.189597315436242, "grad_norm": 0.515625, "learning_rate": 1.9409516777147693e-05, "loss": 0.3028, "step": 9988 }, { "epoch": 4.191275167785235, "grad_norm": 0.349609375, "learning_rate": 1.9331000048195038e-05, "loss": 0.2078, "step": 9992 }, { "epoch": 4.192953020134228, "grad_norm": 0.46484375, "learning_rate": 1.9252631512429973e-05, "loss": 0.1504, "step": 9996 }, { "epoch": 4.194630872483222, "grad_norm": 0.640625, "learning_rate": 1.9174411258731164e-05, "loss": 0.2128, "step": 10000 }, { "epoch": 4.196308724832215, "grad_norm": 0.439453125, "learning_rate": 1.9096339375809073e-05, "loss": 0.1333, "step": 10004 }, { "epoch": 4.197986577181208, "grad_norm": 0.5546875, "learning_rate": 1.9018415952205916e-05, "loss": 0.2018, "step": 10008 }, { "epoch": 4.199664429530201, "grad_norm": 0.46875, "learning_rate": 1.894064107629549e-05, "loss": 0.1992, "step": 10012 }, { "epoch": 4.201342281879195, "grad_norm": 0.5625, "learning_rate": 1.886301483628319e-05, "loss": 0.2247, "step": 10016 }, { "epoch": 4.203020134228188, "grad_norm": 0.5546875, "learning_rate": 1.8785537320205806e-05, "loss": 0.2936, "step": 10020 }, { "epoch": 4.204697986577181, "grad_norm": 0.54296875, "learning_rate": 1.870820861593147e-05, "loss": 0.2186, "step": 10024 }, { "epoch": 4.206375838926174, "grad_norm": 0.625, "learning_rate": 1.8631028811159525e-05, "loss": 0.2338, "step": 10028 }, { "epoch": 4.208053691275167, "grad_norm": 0.55078125, "learning_rate": 1.8553997993420495e-05, "loss": 0.2464, "step": 10032 }, { "epoch": 4.209731543624161, "grad_norm": 0.63671875, "learning_rate": 1.8477116250075825e-05, "loss": 0.2387, "step": 10036 }, { "epoch": 4.2114093959731544, "grad_norm": 0.337890625, "learning_rate": 1.840038366831802e-05, "loss": 0.1637, "step": 10040 }, { "epoch": 4.2130872483221475, "grad_norm": 0.578125, "learning_rate": 1.832380033517037e-05, "loss": 0.2455, "step": 10044 }, { "epoch": 4.214765100671141, "grad_norm": 0.546875, "learning_rate": 1.8247366337486862e-05, "loss": 0.2365, "step": 10048 }, { "epoch": 4.216442953020135, "grad_norm": 0.466796875, "learning_rate": 1.8171081761952195e-05, "loss": 0.1966, "step": 10052 }, { "epoch": 4.218120805369128, "grad_norm": 0.59765625, "learning_rate": 1.8094946695081556e-05, "loss": 0.1476, "step": 10056 }, { "epoch": 4.219798657718121, "grad_norm": 0.59375, "learning_rate": 1.801896122322057e-05, "loss": 0.3106, "step": 10060 }, { "epoch": 4.221476510067114, "grad_norm": 0.72265625, "learning_rate": 1.794312543254524e-05, "loss": 0.232, "step": 10064 }, { "epoch": 4.223154362416107, "grad_norm": 0.60546875, "learning_rate": 1.7867439409061763e-05, "loss": 0.2865, "step": 10068 }, { "epoch": 4.224832214765101, "grad_norm": 0.54296875, "learning_rate": 1.779190323860657e-05, "loss": 0.2283, "step": 10072 }, { "epoch": 4.226510067114094, "grad_norm": 0.498046875, "learning_rate": 1.771651700684602e-05, "loss": 0.1261, "step": 10076 }, { "epoch": 4.228187919463087, "grad_norm": 0.71875, "learning_rate": 1.7641280799276507e-05, "loss": 0.3202, "step": 10080 }, { "epoch": 4.22986577181208, "grad_norm": 0.65625, "learning_rate": 1.7566194701224264e-05, "loss": 0.3276, "step": 10084 }, { "epoch": 4.231543624161074, "grad_norm": 0.5625, "learning_rate": 1.749125879784528e-05, "loss": 0.1785, "step": 10088 }, { "epoch": 4.233221476510067, "grad_norm": 0.859375, "learning_rate": 1.7416473174125235e-05, "loss": 0.2461, "step": 10092 }, { "epoch": 4.23489932885906, "grad_norm": 0.578125, "learning_rate": 1.734183791487936e-05, "loss": 0.2565, "step": 10096 }, { "epoch": 4.2365771812080535, "grad_norm": 0.59765625, "learning_rate": 1.7267353104752265e-05, "loss": 0.3176, "step": 10100 }, { "epoch": 4.238255033557047, "grad_norm": 0.369140625, "learning_rate": 1.7193018828218074e-05, "loss": 0.2281, "step": 10104 }, { "epoch": 4.239932885906041, "grad_norm": 0.55078125, "learning_rate": 1.7118835169580125e-05, "loss": 0.2065, "step": 10108 }, { "epoch": 4.241610738255034, "grad_norm": 0.515625, "learning_rate": 1.7044802212970933e-05, "loss": 0.2143, "step": 10112 }, { "epoch": 4.243288590604027, "grad_norm": 0.609375, "learning_rate": 1.697092004235213e-05, "loss": 0.1935, "step": 10116 }, { "epoch": 4.24496644295302, "grad_norm": 0.60546875, "learning_rate": 1.6897188741514284e-05, "loss": 0.1471, "step": 10120 }, { "epoch": 4.246644295302014, "grad_norm": 0.58984375, "learning_rate": 1.6823608394076916e-05, "loss": 0.366, "step": 10124 }, { "epoch": 4.248322147651007, "grad_norm": 0.64453125, "learning_rate": 1.6750179083488324e-05, "loss": 0.2634, "step": 10128 }, { "epoch": 4.25, "grad_norm": 0.56640625, "learning_rate": 1.6676900893025514e-05, "loss": 0.2469, "step": 10132 }, { "epoch": 4.251677852348993, "grad_norm": 0.416015625, "learning_rate": 1.660377390579412e-05, "loss": 0.2069, "step": 10136 }, { "epoch": 4.253355704697986, "grad_norm": 0.435546875, "learning_rate": 1.6530798204728276e-05, "loss": 0.1907, "step": 10140 }, { "epoch": 4.25503355704698, "grad_norm": 0.5390625, "learning_rate": 1.645797387259055e-05, "loss": 0.1577, "step": 10144 }, { "epoch": 4.256711409395973, "grad_norm": 0.390625, "learning_rate": 1.638530099197184e-05, "loss": 0.2162, "step": 10148 }, { "epoch": 4.258389261744966, "grad_norm": 0.466796875, "learning_rate": 1.6312779645291292e-05, "loss": 0.2504, "step": 10152 }, { "epoch": 4.260067114093959, "grad_norm": 0.59375, "learning_rate": 1.6240409914796182e-05, "loss": 0.2451, "step": 10156 }, { "epoch": 4.261744966442953, "grad_norm": 0.56640625, "learning_rate": 1.616819188256181e-05, "loss": 0.2563, "step": 10160 }, { "epoch": 4.2634228187919465, "grad_norm": 0.5703125, "learning_rate": 1.6096125630491473e-05, "loss": 0.1749, "step": 10164 }, { "epoch": 4.26510067114094, "grad_norm": 0.5390625, "learning_rate": 1.602421124031633e-05, "loss": 0.1404, "step": 10168 }, { "epoch": 4.266778523489933, "grad_norm": 0.3359375, "learning_rate": 1.5952448793595307e-05, "loss": 0.1535, "step": 10172 }, { "epoch": 4.268456375838926, "grad_norm": 0.61328125, "learning_rate": 1.5880838371714984e-05, "loss": 0.256, "step": 10176 }, { "epoch": 4.27013422818792, "grad_norm": 0.53515625, "learning_rate": 1.5809380055889592e-05, "loss": 0.1771, "step": 10180 }, { "epoch": 4.271812080536913, "grad_norm": 0.57421875, "learning_rate": 1.573807392716076e-05, "loss": 0.3218, "step": 10184 }, { "epoch": 4.273489932885906, "grad_norm": 0.341796875, "learning_rate": 1.5666920066397597e-05, "loss": 0.1822, "step": 10188 }, { "epoch": 4.275167785234899, "grad_norm": 0.5234375, "learning_rate": 1.5595918554296504e-05, "loss": 0.2198, "step": 10192 }, { "epoch": 4.276845637583893, "grad_norm": 0.48046875, "learning_rate": 1.5525069471381086e-05, "loss": 0.1321, "step": 10196 }, { "epoch": 4.278523489932886, "grad_norm": 0.61328125, "learning_rate": 1.5454372898002083e-05, "loss": 0.1757, "step": 10200 }, { "epoch": 4.280201342281879, "grad_norm": 0.61328125, "learning_rate": 1.5383828914337325e-05, "loss": 0.2088, "step": 10204 }, { "epoch": 4.281879194630872, "grad_norm": 0.57421875, "learning_rate": 1.5313437600391487e-05, "loss": 0.3156, "step": 10208 }, { "epoch": 4.283557046979865, "grad_norm": 0.5390625, "learning_rate": 1.5243199035996156e-05, "loss": 0.2316, "step": 10212 }, { "epoch": 4.285234899328859, "grad_norm": 0.53515625, "learning_rate": 1.5173113300809697e-05, "loss": 0.1617, "step": 10216 }, { "epoch": 4.2869127516778525, "grad_norm": 0.49609375, "learning_rate": 1.5103180474317129e-05, "loss": 0.1796, "step": 10220 }, { "epoch": 4.2885906040268456, "grad_norm": 0.5, "learning_rate": 1.5033400635830057e-05, "loss": 0.187, "step": 10224 }, { "epoch": 4.290268456375839, "grad_norm": 0.51953125, "learning_rate": 1.4963773864486578e-05, "loss": 0.1899, "step": 10228 }, { "epoch": 4.291946308724833, "grad_norm": 0.54296875, "learning_rate": 1.4894300239251228e-05, "loss": 0.2221, "step": 10232 }, { "epoch": 4.293624161073826, "grad_norm": 0.5859375, "learning_rate": 1.4824979838914796e-05, "loss": 0.3078, "step": 10236 }, { "epoch": 4.295302013422819, "grad_norm": 0.5625, "learning_rate": 1.4755812742094347e-05, "loss": 0.1987, "step": 10240 }, { "epoch": 4.296979865771812, "grad_norm": 0.6171875, "learning_rate": 1.4686799027233082e-05, "loss": 0.2088, "step": 10244 }, { "epoch": 4.298657718120805, "grad_norm": 0.451171875, "learning_rate": 1.4617938772600207e-05, "loss": 0.2347, "step": 10248 }, { "epoch": 4.300335570469799, "grad_norm": 0.609375, "learning_rate": 1.4549232056290905e-05, "loss": 0.2537, "step": 10252 }, { "epoch": 4.302013422818792, "grad_norm": 0.322265625, "learning_rate": 1.448067895622626e-05, "loss": 0.1969, "step": 10256 }, { "epoch": 4.303691275167785, "grad_norm": 0.578125, "learning_rate": 1.4412279550153117e-05, "loss": 0.2474, "step": 10260 }, { "epoch": 4.305369127516778, "grad_norm": 0.5, "learning_rate": 1.4344033915644021e-05, "loss": 0.2482, "step": 10264 }, { "epoch": 4.307046979865772, "grad_norm": 0.4453125, "learning_rate": 1.4275942130097096e-05, "loss": 0.3137, "step": 10268 }, { "epoch": 4.308724832214765, "grad_norm": 0.609375, "learning_rate": 1.4208004270736e-05, "loss": 0.245, "step": 10272 }, { "epoch": 4.310402684563758, "grad_norm": 0.640625, "learning_rate": 1.4140220414609838e-05, "loss": 0.2439, "step": 10276 }, { "epoch": 4.3120805369127515, "grad_norm": 0.515625, "learning_rate": 1.4072590638593046e-05, "loss": 0.1628, "step": 10280 }, { "epoch": 4.313758389261745, "grad_norm": 0.52734375, "learning_rate": 1.4005115019385322e-05, "loss": 0.262, "step": 10284 }, { "epoch": 4.315436241610739, "grad_norm": 0.494140625, "learning_rate": 1.3937793633511552e-05, "loss": 0.0833, "step": 10288 }, { "epoch": 4.317114093959732, "grad_norm": 0.5859375, "learning_rate": 1.3870626557321613e-05, "loss": 0.2017, "step": 10292 }, { "epoch": 4.318791946308725, "grad_norm": 0.451171875, "learning_rate": 1.3803613866990487e-05, "loss": 0.1583, "step": 10296 }, { "epoch": 4.320469798657718, "grad_norm": 0.453125, "learning_rate": 1.3736755638517998e-05, "loss": 0.1855, "step": 10300 }, { "epoch": 4.322147651006711, "grad_norm": 0.56640625, "learning_rate": 1.367005194772886e-05, "loss": 0.1755, "step": 10304 }, { "epoch": 4.323825503355705, "grad_norm": 0.6015625, "learning_rate": 1.3603502870272475e-05, "loss": 0.2651, "step": 10308 }, { "epoch": 4.325503355704698, "grad_norm": 0.63671875, "learning_rate": 1.3537108481622871e-05, "loss": 0.2729, "step": 10312 }, { "epoch": 4.327181208053691, "grad_norm": 0.65234375, "learning_rate": 1.3470868857078681e-05, "loss": 0.2365, "step": 10316 }, { "epoch": 4.328859060402684, "grad_norm": 0.392578125, "learning_rate": 1.3404784071763014e-05, "loss": 0.2686, "step": 10320 }, { "epoch": 4.330536912751678, "grad_norm": 0.6328125, "learning_rate": 1.3338854200623383e-05, "loss": 0.2388, "step": 10324 }, { "epoch": 4.332214765100671, "grad_norm": 0.6796875, "learning_rate": 1.3273079318431612e-05, "loss": 0.2033, "step": 10328 }, { "epoch": 4.333892617449664, "grad_norm": 0.66015625, "learning_rate": 1.3207459499783696e-05, "loss": 0.158, "step": 10332 }, { "epoch": 4.3355704697986575, "grad_norm": 0.53515625, "learning_rate": 1.314199481909982e-05, "loss": 0.2474, "step": 10336 }, { "epoch": 4.337248322147651, "grad_norm": 0.5859375, "learning_rate": 1.3076685350624234e-05, "loss": 0.2617, "step": 10340 }, { "epoch": 4.3389261744966445, "grad_norm": 0.5703125, "learning_rate": 1.3011531168425148e-05, "loss": 0.1409, "step": 10344 }, { "epoch": 4.340604026845638, "grad_norm": 0.47265625, "learning_rate": 1.2946532346394628e-05, "loss": 0.1601, "step": 10348 }, { "epoch": 4.342281879194631, "grad_norm": 0.58984375, "learning_rate": 1.2881688958248637e-05, "loss": 0.231, "step": 10352 }, { "epoch": 4.343959731543624, "grad_norm": 0.392578125, "learning_rate": 1.28170010775267e-05, "loss": 0.2055, "step": 10356 }, { "epoch": 4.345637583892618, "grad_norm": 0.5703125, "learning_rate": 1.2752468777592134e-05, "loss": 0.2596, "step": 10360 }, { "epoch": 4.347315436241611, "grad_norm": 0.63671875, "learning_rate": 1.2688092131631728e-05, "loss": 0.3269, "step": 10364 }, { "epoch": 4.348993288590604, "grad_norm": 0.4609375, "learning_rate": 1.2623871212655762e-05, "loss": 0.2255, "step": 10368 }, { "epoch": 4.350671140939597, "grad_norm": 0.58984375, "learning_rate": 1.2559806093497942e-05, "loss": 0.1785, "step": 10372 }, { "epoch": 4.35234899328859, "grad_norm": 0.423828125, "learning_rate": 1.2495896846815184e-05, "loss": 0.2849, "step": 10376 }, { "epoch": 4.354026845637584, "grad_norm": 0.42578125, "learning_rate": 1.2432143545087708e-05, "loss": 0.1328, "step": 10380 }, { "epoch": 4.355704697986577, "grad_norm": 0.5703125, "learning_rate": 1.2368546260618844e-05, "loss": 0.3433, "step": 10384 }, { "epoch": 4.35738255033557, "grad_norm": 0.51953125, "learning_rate": 1.2305105065535014e-05, "loss": 0.2084, "step": 10388 }, { "epoch": 4.359060402684563, "grad_norm": 0.54296875, "learning_rate": 1.2241820031785631e-05, "loss": 0.1541, "step": 10392 }, { "epoch": 4.360738255033557, "grad_norm": 0.52734375, "learning_rate": 1.2178691231142883e-05, "loss": 0.2195, "step": 10396 }, { "epoch": 4.3624161073825505, "grad_norm": 0.5859375, "learning_rate": 1.2115718735201885e-05, "loss": 0.2402, "step": 10400 }, { "epoch": 4.364093959731544, "grad_norm": 0.59375, "learning_rate": 1.2052902615380471e-05, "loss": 0.2014, "step": 10404 }, { "epoch": 4.365771812080537, "grad_norm": 0.53125, "learning_rate": 1.1990242942919094e-05, "loss": 0.2028, "step": 10408 }, { "epoch": 4.367449664429531, "grad_norm": 0.51953125, "learning_rate": 1.1927739788880803e-05, "loss": 0.2112, "step": 10412 }, { "epoch": 4.369127516778524, "grad_norm": 0.46875, "learning_rate": 1.1865393224151165e-05, "loss": 0.1716, "step": 10416 }, { "epoch": 4.370805369127517, "grad_norm": 0.49609375, "learning_rate": 1.1803203319438053e-05, "loss": 0.1611, "step": 10420 }, { "epoch": 4.37248322147651, "grad_norm": 0.53125, "learning_rate": 1.1741170145271783e-05, "loss": 0.1907, "step": 10424 }, { "epoch": 4.374161073825503, "grad_norm": 0.376953125, "learning_rate": 1.1679293772004877e-05, "loss": 0.2616, "step": 10428 }, { "epoch": 4.375838926174497, "grad_norm": 0.50390625, "learning_rate": 1.161757426981202e-05, "loss": 0.2742, "step": 10432 }, { "epoch": 4.37751677852349, "grad_norm": 0.44140625, "learning_rate": 1.1556011708690016e-05, "loss": 0.2827, "step": 10436 }, { "epoch": 4.379194630872483, "grad_norm": 0.37890625, "learning_rate": 1.149460615845762e-05, "loss": 0.2222, "step": 10440 }, { "epoch": 4.380872483221476, "grad_norm": 0.51953125, "learning_rate": 1.1433357688755573e-05, "loss": 0.1975, "step": 10444 }, { "epoch": 4.382550335570469, "grad_norm": 0.59765625, "learning_rate": 1.1372266369046478e-05, "loss": 0.2792, "step": 10448 }, { "epoch": 4.384228187919463, "grad_norm": 0.484375, "learning_rate": 1.1311332268614665e-05, "loss": 0.1412, "step": 10452 }, { "epoch": 4.385906040268456, "grad_norm": 0.54296875, "learning_rate": 1.1250555456566217e-05, "loss": 0.1721, "step": 10456 }, { "epoch": 4.3875838926174495, "grad_norm": 0.6328125, "learning_rate": 1.1189936001828786e-05, "loss": 0.2091, "step": 10460 }, { "epoch": 4.389261744966443, "grad_norm": 0.431640625, "learning_rate": 1.1129473973151559e-05, "loss": 0.2342, "step": 10464 }, { "epoch": 4.390939597315437, "grad_norm": 0.63671875, "learning_rate": 1.1069169439105203e-05, "loss": 0.2833, "step": 10468 }, { "epoch": 4.39261744966443, "grad_norm": 0.50390625, "learning_rate": 1.1009022468081807e-05, "loss": 0.1475, "step": 10472 }, { "epoch": 4.394295302013423, "grad_norm": 0.8203125, "learning_rate": 1.0949033128294742e-05, "loss": 0.2631, "step": 10476 }, { "epoch": 4.395973154362416, "grad_norm": 0.41015625, "learning_rate": 1.0889201487778537e-05, "loss": 0.1831, "step": 10480 }, { "epoch": 4.39765100671141, "grad_norm": 0.466796875, "learning_rate": 1.082952761438896e-05, "loss": 0.1624, "step": 10484 }, { "epoch": 4.399328859060403, "grad_norm": 0.51953125, "learning_rate": 1.0770011575802828e-05, "loss": 0.1644, "step": 10488 }, { "epoch": 4.401006711409396, "grad_norm": 0.65625, "learning_rate": 1.0710653439517946e-05, "loss": 0.2429, "step": 10492 }, { "epoch": 4.402684563758389, "grad_norm": 0.5625, "learning_rate": 1.0651453272853034e-05, "loss": 0.1927, "step": 10496 }, { "epoch": 4.404362416107382, "grad_norm": 0.63671875, "learning_rate": 1.0592411142947732e-05, "loss": 0.2045, "step": 10500 }, { "epoch": 4.406040268456376, "grad_norm": 0.439453125, "learning_rate": 1.0533527116762296e-05, "loss": 0.2796, "step": 10504 }, { "epoch": 4.407718120805369, "grad_norm": 0.55078125, "learning_rate": 1.0474801261077809e-05, "loss": 0.2722, "step": 10508 }, { "epoch": 4.409395973154362, "grad_norm": 0.5859375, "learning_rate": 1.0416233642495941e-05, "loss": 0.2868, "step": 10512 }, { "epoch": 4.4110738255033555, "grad_norm": 0.46484375, "learning_rate": 1.0357824327438846e-05, "loss": 0.2394, "step": 10516 }, { "epoch": 4.412751677852349, "grad_norm": 0.466796875, "learning_rate": 1.0299573382149234e-05, "loss": 0.1615, "step": 10520 }, { "epoch": 4.4144295302013425, "grad_norm": 0.5859375, "learning_rate": 1.0241480872690145e-05, "loss": 0.2783, "step": 10524 }, { "epoch": 4.416107382550336, "grad_norm": 0.5859375, "learning_rate": 1.0183546864944942e-05, "loss": 0.187, "step": 10528 }, { "epoch": 4.417785234899329, "grad_norm": 0.74609375, "learning_rate": 1.0125771424617218e-05, "loss": 0.2755, "step": 10532 }, { "epoch": 4.419463087248322, "grad_norm": 0.55859375, "learning_rate": 1.0068154617230778e-05, "loss": 0.226, "step": 10536 }, { "epoch": 4.421140939597316, "grad_norm": 0.71484375, "learning_rate": 1.0010696508129501e-05, "loss": 0.3128, "step": 10540 }, { "epoch": 4.422818791946309, "grad_norm": 0.5859375, "learning_rate": 9.95339716247726e-06, "loss": 0.2358, "step": 10544 }, { "epoch": 4.424496644295302, "grad_norm": 0.60546875, "learning_rate": 9.896256645257893e-06, "loss": 0.2496, "step": 10548 }, { "epoch": 4.426174496644295, "grad_norm": 0.5625, "learning_rate": 9.839275021275078e-06, "loss": 0.2994, "step": 10552 }, { "epoch": 4.427852348993289, "grad_norm": 0.423828125, "learning_rate": 9.78245235515237e-06, "loss": 0.2098, "step": 10556 }, { "epoch": 4.429530201342282, "grad_norm": 0.66796875, "learning_rate": 9.725788711332972e-06, "loss": 0.2725, "step": 10560 }, { "epoch": 4.431208053691275, "grad_norm": 0.48828125, "learning_rate": 9.669284154079776e-06, "loss": 0.1973, "step": 10564 }, { "epoch": 4.432885906040268, "grad_norm": 0.55859375, "learning_rate": 9.612938747475207e-06, "loss": 0.2937, "step": 10568 }, { "epoch": 4.434563758389261, "grad_norm": 0.5859375, "learning_rate": 9.556752555421249e-06, "loss": 0.2015, "step": 10572 }, { "epoch": 4.436241610738255, "grad_norm": 0.388671875, "learning_rate": 9.500725641639294e-06, "loss": 0.1907, "step": 10576 }, { "epoch": 4.4379194630872485, "grad_norm": 0.447265625, "learning_rate": 9.444858069670086e-06, "loss": 0.1634, "step": 10580 }, { "epoch": 4.439597315436242, "grad_norm": 0.53515625, "learning_rate": 9.389149902873688e-06, "loss": 0.1424, "step": 10584 }, { "epoch": 4.441275167785235, "grad_norm": 0.52734375, "learning_rate": 9.333601204429348e-06, "loss": 0.1968, "step": 10588 }, { "epoch": 4.442953020134228, "grad_norm": 0.48828125, "learning_rate": 9.278212037335469e-06, "loss": 0.2081, "step": 10592 }, { "epoch": 4.444630872483222, "grad_norm": 0.66796875, "learning_rate": 9.222982464409522e-06, "loss": 0.2061, "step": 10596 }, { "epoch": 4.446308724832215, "grad_norm": 0.56640625, "learning_rate": 9.16791254828802e-06, "loss": 0.3462, "step": 10600 }, { "epoch": 4.447986577181208, "grad_norm": 0.7734375, "learning_rate": 9.113002351426351e-06, "loss": 0.3171, "step": 10604 }, { "epoch": 4.449664429530201, "grad_norm": 0.65625, "learning_rate": 9.058251936098821e-06, "loss": 0.2802, "step": 10608 }, { "epoch": 4.451342281879195, "grad_norm": 0.625, "learning_rate": 9.00366136439844e-06, "loss": 0.3383, "step": 10612 }, { "epoch": 4.453020134228188, "grad_norm": 0.703125, "learning_rate": 8.949230698237037e-06, "loss": 0.2074, "step": 10616 }, { "epoch": 4.454697986577181, "grad_norm": 0.62109375, "learning_rate": 8.894959999345014e-06, "loss": 0.3204, "step": 10620 }, { "epoch": 4.456375838926174, "grad_norm": 0.64453125, "learning_rate": 8.840849329271404e-06, "loss": 0.2882, "step": 10624 }, { "epoch": 4.458053691275167, "grad_norm": 0.5234375, "learning_rate": 8.78689874938372e-06, "loss": 0.2438, "step": 10628 }, { "epoch": 4.459731543624161, "grad_norm": 0.671875, "learning_rate": 8.733108320867932e-06, "loss": 0.3264, "step": 10632 }, { "epoch": 4.4614093959731544, "grad_norm": 0.4375, "learning_rate": 8.67947810472836e-06, "loss": 0.1199, "step": 10636 }, { "epoch": 4.4630872483221475, "grad_norm": 0.6640625, "learning_rate": 8.62600816178765e-06, "loss": 0.2057, "step": 10640 }, { "epoch": 4.464765100671141, "grad_norm": 0.39453125, "learning_rate": 8.572698552686663e-06, "loss": 0.1707, "step": 10644 }, { "epoch": 4.466442953020135, "grad_norm": 0.6171875, "learning_rate": 8.519549337884435e-06, "loss": 0.2744, "step": 10648 }, { "epoch": 4.468120805369128, "grad_norm": 0.412109375, "learning_rate": 8.466560577658083e-06, "loss": 0.1447, "step": 10652 }, { "epoch": 4.469798657718121, "grad_norm": 0.671875, "learning_rate": 8.413732332102734e-06, "loss": 0.1919, "step": 10656 }, { "epoch": 4.471476510067114, "grad_norm": 0.546875, "learning_rate": 8.361064661131533e-06, "loss": 0.2165, "step": 10660 }, { "epoch": 4.473154362416107, "grad_norm": 0.5078125, "learning_rate": 8.308557624475465e-06, "loss": 0.2112, "step": 10664 }, { "epoch": 4.474832214765101, "grad_norm": 0.546875, "learning_rate": 8.256211281683362e-06, "loss": 0.1069, "step": 10668 }, { "epoch": 4.476510067114094, "grad_norm": 0.5078125, "learning_rate": 8.204025692121802e-06, "loss": 0.1648, "step": 10672 }, { "epoch": 4.478187919463087, "grad_norm": 0.69921875, "learning_rate": 8.152000914975043e-06, "loss": 0.2194, "step": 10676 }, { "epoch": 4.47986577181208, "grad_norm": 0.625, "learning_rate": 8.100137009244956e-06, "loss": 0.2418, "step": 10680 }, { "epoch": 4.481543624161074, "grad_norm": 0.41015625, "learning_rate": 8.048434033751006e-06, "loss": 0.1736, "step": 10684 }, { "epoch": 4.483221476510067, "grad_norm": 0.5859375, "learning_rate": 7.99689204713012e-06, "loss": 0.2271, "step": 10688 }, { "epoch": 4.48489932885906, "grad_norm": 0.48828125, "learning_rate": 7.945511107836661e-06, "loss": 0.1208, "step": 10692 }, { "epoch": 4.4865771812080535, "grad_norm": 0.640625, "learning_rate": 7.894291274142295e-06, "loss": 0.1471, "step": 10696 }, { "epoch": 4.488255033557047, "grad_norm": 0.4296875, "learning_rate": 7.843232604136023e-06, "loss": 0.3142, "step": 10700 }, { "epoch": 4.489932885906041, "grad_norm": 0.59765625, "learning_rate": 7.79233515572406e-06, "loss": 0.2041, "step": 10704 }, { "epoch": 4.491610738255034, "grad_norm": 0.62109375, "learning_rate": 7.7415989866298e-06, "loss": 0.279, "step": 10708 }, { "epoch": 4.493288590604027, "grad_norm": 0.71875, "learning_rate": 7.691024154393676e-06, "loss": 0.1797, "step": 10712 }, { "epoch": 4.49496644295302, "grad_norm": 0.52734375, "learning_rate": 7.640610716373197e-06, "loss": 0.1691, "step": 10716 }, { "epoch": 4.496644295302014, "grad_norm": 0.4609375, "learning_rate": 7.590358729742807e-06, "loss": 0.2136, "step": 10720 }, { "epoch": 4.498322147651007, "grad_norm": 0.62890625, "learning_rate": 7.540268251493836e-06, "loss": 0.2557, "step": 10724 }, { "epoch": 4.5, "grad_norm": 0.60546875, "learning_rate": 7.490339338434476e-06, "loss": 0.1453, "step": 10728 }, { "epoch": 4.501677852348993, "grad_norm": 0.68359375, "learning_rate": 7.44057204718968e-06, "loss": 0.2251, "step": 10732 }, { "epoch": 4.503355704697986, "grad_norm": 0.56640625, "learning_rate": 7.390966434201084e-06, "loss": 0.2123, "step": 10736 }, { "epoch": 4.50503355704698, "grad_norm": 0.66015625, "learning_rate": 7.34152255572697e-06, "loss": 0.217, "step": 10740 }, { "epoch": 4.506711409395973, "grad_norm": 0.48046875, "learning_rate": 7.292240467842198e-06, "loss": 0.1834, "step": 10744 }, { "epoch": 4.508389261744966, "grad_norm": 0.6484375, "learning_rate": 7.243120226438126e-06, "loss": 0.2745, "step": 10748 }, { "epoch": 4.510067114093959, "grad_norm": 0.7421875, "learning_rate": 7.1941618872226104e-06, "loss": 0.2829, "step": 10752 }, { "epoch": 4.5117449664429525, "grad_norm": 0.58984375, "learning_rate": 7.145365505719836e-06, "loss": 0.2756, "step": 10756 }, { "epoch": 4.5134228187919465, "grad_norm": 0.56640625, "learning_rate": 7.096731137270317e-06, "loss": 0.213, "step": 10760 }, { "epoch": 4.51510067114094, "grad_norm": 0.59765625, "learning_rate": 7.048258837030868e-06, "loss": 0.2487, "step": 10764 }, { "epoch": 4.516778523489933, "grad_norm": 0.87890625, "learning_rate": 6.999948659974447e-06, "loss": 0.3058, "step": 10768 }, { "epoch": 4.518456375838926, "grad_norm": 0.5234375, "learning_rate": 6.951800660890178e-06, "loss": 0.2398, "step": 10772 }, { "epoch": 4.52013422818792, "grad_norm": 0.61328125, "learning_rate": 6.903814894383248e-06, "loss": 0.1746, "step": 10776 }, { "epoch": 4.521812080536913, "grad_norm": 0.5625, "learning_rate": 6.85599141487489e-06, "loss": 0.1705, "step": 10780 }, { "epoch": 4.523489932885906, "grad_norm": 0.6328125, "learning_rate": 6.808330276602203e-06, "loss": 0.2033, "step": 10784 }, { "epoch": 4.525167785234899, "grad_norm": 0.56640625, "learning_rate": 6.760831533618233e-06, "loss": 0.3043, "step": 10788 }, { "epoch": 4.526845637583893, "grad_norm": 0.44921875, "learning_rate": 6.713495239791855e-06, "loss": 0.1629, "step": 10792 }, { "epoch": 4.528523489932886, "grad_norm": 0.5078125, "learning_rate": 6.666321448807693e-06, "loss": 0.1826, "step": 10796 }, { "epoch": 4.530201342281879, "grad_norm": 0.349609375, "learning_rate": 6.619310214166052e-06, "loss": 0.1436, "step": 10800 }, { "epoch": 4.531879194630872, "grad_norm": 0.51171875, "learning_rate": 6.572461589182948e-06, "loss": 0.1477, "step": 10804 }, { "epoch": 4.533557046979865, "grad_norm": 0.4609375, "learning_rate": 6.525775626989898e-06, "loss": 0.1743, "step": 10808 }, { "epoch": 4.535234899328859, "grad_norm": 0.53125, "learning_rate": 6.4792523805339836e-06, "loss": 0.1596, "step": 10812 }, { "epoch": 4.5369127516778525, "grad_norm": 0.48046875, "learning_rate": 6.432891902577764e-06, "loss": 0.2155, "step": 10816 }, { "epoch": 4.5385906040268456, "grad_norm": 0.51953125, "learning_rate": 6.386694245699181e-06, "loss": 0.1685, "step": 10820 }, { "epoch": 4.540268456375839, "grad_norm": 0.5546875, "learning_rate": 6.340659462291492e-06, "loss": 0.2136, "step": 10824 }, { "epoch": 4.541946308724832, "grad_norm": 0.50390625, "learning_rate": 6.294787604563267e-06, "loss": 0.2199, "step": 10828 }, { "epoch": 4.543624161073826, "grad_norm": 0.5546875, "learning_rate": 6.249078724538325e-06, "loss": 0.2428, "step": 10832 }, { "epoch": 4.545302013422819, "grad_norm": 0.494140625, "learning_rate": 6.2035328740556e-06, "loss": 0.2194, "step": 10836 }, { "epoch": 4.546979865771812, "grad_norm": 0.65625, "learning_rate": 6.158150104769155e-06, "loss": 0.1617, "step": 10840 }, { "epoch": 4.548657718120805, "grad_norm": 0.412109375, "learning_rate": 6.112930468148119e-06, "loss": 0.2021, "step": 10844 }, { "epoch": 4.550335570469799, "grad_norm": 0.408203125, "learning_rate": 6.067874015476571e-06, "loss": 0.129, "step": 10848 }, { "epoch": 4.552013422818792, "grad_norm": 0.5234375, "learning_rate": 6.0229807978535495e-06, "loss": 0.234, "step": 10852 }, { "epoch": 4.553691275167785, "grad_norm": 0.625, "learning_rate": 5.978250866192963e-06, "loss": 0.3586, "step": 10856 }, { "epoch": 4.555369127516778, "grad_norm": 0.45703125, "learning_rate": 5.933684271223532e-06, "loss": 0.1991, "step": 10860 }, { "epoch": 4.557046979865772, "grad_norm": 0.5625, "learning_rate": 5.889281063488743e-06, "loss": 0.2277, "step": 10864 }, { "epoch": 4.558724832214765, "grad_norm": 0.5859375, "learning_rate": 5.845041293346747e-06, "loss": 0.1784, "step": 10868 }, { "epoch": 4.560402684563758, "grad_norm": 0.50390625, "learning_rate": 5.800965010970393e-06, "loss": 0.1537, "step": 10872 }, { "epoch": 4.5620805369127515, "grad_norm": 0.5234375, "learning_rate": 5.757052266347078e-06, "loss": 0.2138, "step": 10876 }, { "epoch": 4.563758389261745, "grad_norm": 0.5234375, "learning_rate": 5.713303109278749e-06, "loss": 0.2194, "step": 10880 }, { "epoch": 4.565436241610739, "grad_norm": 0.6171875, "learning_rate": 5.669717589381817e-06, "loss": 0.2515, "step": 10884 }, { "epoch": 4.567114093959732, "grad_norm": 0.61328125, "learning_rate": 5.626295756087107e-06, "loss": 0.1879, "step": 10888 }, { "epoch": 4.568791946308725, "grad_norm": 0.6484375, "learning_rate": 5.58303765863981e-06, "loss": 0.2846, "step": 10892 }, { "epoch": 4.570469798657718, "grad_norm": 0.53515625, "learning_rate": 5.539943346099418e-06, "loss": 0.282, "step": 10896 }, { "epoch": 4.572147651006711, "grad_norm": 0.47265625, "learning_rate": 5.497012867339701e-06, "loss": 0.3184, "step": 10900 }, { "epoch": 4.573825503355705, "grad_norm": 0.470703125, "learning_rate": 5.454246271048596e-06, "loss": 0.1623, "step": 10904 }, { "epoch": 4.575503355704698, "grad_norm": 0.546875, "learning_rate": 5.411643605728139e-06, "loss": 0.2577, "step": 10908 }, { "epoch": 4.577181208053691, "grad_norm": 0.64453125, "learning_rate": 5.3692049196945475e-06, "loss": 0.2599, "step": 10912 }, { "epoch": 4.578859060402684, "grad_norm": 0.61328125, "learning_rate": 5.326930261077972e-06, "loss": 0.2441, "step": 10916 }, { "epoch": 4.580536912751678, "grad_norm": 0.53515625, "learning_rate": 5.28481967782261e-06, "loss": 0.1762, "step": 10920 }, { "epoch": 4.582214765100671, "grad_norm": 0.4921875, "learning_rate": 5.242873217686527e-06, "loss": 0.1926, "step": 10924 }, { "epoch": 4.583892617449664, "grad_norm": 0.578125, "learning_rate": 5.201090928241719e-06, "loss": 0.2167, "step": 10928 }, { "epoch": 4.5855704697986575, "grad_norm": 0.5703125, "learning_rate": 5.1594728568738976e-06, "loss": 0.2332, "step": 10932 }, { "epoch": 4.587248322147651, "grad_norm": 0.49609375, "learning_rate": 5.118019050782624e-06, "loss": 0.1823, "step": 10936 }, { "epoch": 4.5889261744966445, "grad_norm": 0.3359375, "learning_rate": 5.07672955698109e-06, "loss": 0.2021, "step": 10940 }, { "epoch": 4.590604026845638, "grad_norm": 0.640625, "learning_rate": 5.035604422296224e-06, "loss": 0.2555, "step": 10944 }, { "epoch": 4.592281879194631, "grad_norm": 0.50390625, "learning_rate": 4.9946436933684665e-06, "loss": 0.1965, "step": 10948 }, { "epoch": 4.593959731543624, "grad_norm": 0.423828125, "learning_rate": 4.95384741665189e-06, "loss": 0.1166, "step": 10952 }, { "epoch": 4.595637583892618, "grad_norm": 0.5546875, "learning_rate": 4.9132156384139535e-06, "loss": 0.2338, "step": 10956 }, { "epoch": 4.597315436241611, "grad_norm": 0.478515625, "learning_rate": 4.872748404735644e-06, "loss": 0.1499, "step": 10960 }, { "epoch": 4.598993288590604, "grad_norm": 0.65234375, "learning_rate": 4.8324457615113165e-06, "loss": 0.1531, "step": 10964 }, { "epoch": 4.600671140939597, "grad_norm": 0.3046875, "learning_rate": 4.7923077544486266e-06, "loss": 0.121, "step": 10968 }, { "epoch": 4.60234899328859, "grad_norm": 0.2890625, "learning_rate": 4.752334429068577e-06, "loss": 0.3184, "step": 10972 }, { "epoch": 4.604026845637584, "grad_norm": 0.5859375, "learning_rate": 4.712525830705338e-06, "loss": 0.2232, "step": 10976 }, { "epoch": 4.605704697986577, "grad_norm": 0.6484375, "learning_rate": 4.6728820045062954e-06, "loss": 0.2467, "step": 10980 }, { "epoch": 4.60738255033557, "grad_norm": 0.57421875, "learning_rate": 4.633402995431968e-06, "loss": 0.2968, "step": 10984 }, { "epoch": 4.609060402684563, "grad_norm": 0.6171875, "learning_rate": 4.5940888482559555e-06, "loss": 0.1973, "step": 10988 }, { "epoch": 4.610738255033557, "grad_norm": 0.498046875, "learning_rate": 4.554939607564861e-06, "loss": 0.2158, "step": 10992 }, { "epoch": 4.6124161073825505, "grad_norm": 0.609375, "learning_rate": 4.515955317758285e-06, "loss": 0.1665, "step": 10996 }, { "epoch": 4.614093959731544, "grad_norm": 0.546875, "learning_rate": 4.477136023048727e-06, "loss": 0.1944, "step": 11000 }, { "epoch": 4.615771812080537, "grad_norm": 0.412109375, "learning_rate": 4.4384817674616215e-06, "loss": 0.1841, "step": 11004 }, { "epoch": 4.617449664429531, "grad_norm": 0.578125, "learning_rate": 4.399992594835183e-06, "loss": 0.2635, "step": 11008 }, { "epoch": 4.619127516778524, "grad_norm": 0.5546875, "learning_rate": 4.361668548820429e-06, "loss": 0.186, "step": 11012 }, { "epoch": 4.620805369127517, "grad_norm": 0.5859375, "learning_rate": 4.323509672881059e-06, "loss": 0.3112, "step": 11016 }, { "epoch": 4.62248322147651, "grad_norm": 0.6640625, "learning_rate": 4.285516010293522e-06, "loss": 0.3135, "step": 11020 }, { "epoch": 4.624161073825503, "grad_norm": 0.54296875, "learning_rate": 4.247687604146816e-06, "loss": 0.2262, "step": 11024 }, { "epoch": 4.625838926174497, "grad_norm": 0.6640625, "learning_rate": 4.210024497342607e-06, "loss": 0.2089, "step": 11028 }, { "epoch": 4.62751677852349, "grad_norm": 0.45703125, "learning_rate": 4.172526732595044e-06, "loss": 0.2366, "step": 11032 }, { "epoch": 4.629194630872483, "grad_norm": 0.6953125, "learning_rate": 4.13519435243076e-06, "loss": 0.3434, "step": 11036 }, { "epoch": 4.630872483221476, "grad_norm": 0.6171875, "learning_rate": 4.098027399188802e-06, "loss": 0.2559, "step": 11040 }, { "epoch": 4.632550335570469, "grad_norm": 0.47265625, "learning_rate": 4.061025915020655e-06, "loss": 0.2496, "step": 11044 }, { "epoch": 4.634228187919463, "grad_norm": 0.6015625, "learning_rate": 4.024189941890099e-06, "loss": 0.3223, "step": 11048 }, { "epoch": 4.635906040268456, "grad_norm": 0.69140625, "learning_rate": 3.987519521573268e-06, "loss": 0.2616, "step": 11052 }, { "epoch": 4.6375838926174495, "grad_norm": 0.59765625, "learning_rate": 3.951014695658494e-06, "loss": 0.2961, "step": 11056 }, { "epoch": 4.639261744966443, "grad_norm": 0.3828125, "learning_rate": 3.914675505546277e-06, "loss": 0.1907, "step": 11060 }, { "epoch": 4.640939597315437, "grad_norm": 0.7421875, "learning_rate": 3.878501992449318e-06, "loss": 0.2877, "step": 11064 }, { "epoch": 4.64261744966443, "grad_norm": 0.3828125, "learning_rate": 3.842494197392398e-06, "loss": 0.1428, "step": 11068 }, { "epoch": 4.644295302013423, "grad_norm": 0.53515625, "learning_rate": 3.8066521612123866e-06, "loss": 0.2048, "step": 11072 }, { "epoch": 4.645973154362416, "grad_norm": 0.330078125, "learning_rate": 3.770975924558134e-06, "loss": 0.161, "step": 11076 }, { "epoch": 4.64765100671141, "grad_norm": 0.5703125, "learning_rate": 3.735465527890458e-06, "loss": 0.1701, "step": 11080 }, { "epoch": 4.649328859060403, "grad_norm": 0.423828125, "learning_rate": 3.7001210114820935e-06, "loss": 0.2704, "step": 11084 }, { "epoch": 4.651006711409396, "grad_norm": 0.5625, "learning_rate": 3.6649424154176597e-06, "loss": 0.2153, "step": 11088 }, { "epoch": 4.652684563758389, "grad_norm": 0.6640625, "learning_rate": 3.62992977959361e-06, "loss": 0.1816, "step": 11092 }, { "epoch": 4.654362416107382, "grad_norm": 0.51171875, "learning_rate": 3.595083143718147e-06, "loss": 0.2057, "step": 11096 }, { "epoch": 4.656040268456376, "grad_norm": 0.5859375, "learning_rate": 3.560402547311275e-06, "loss": 0.2967, "step": 11100 }, { "epoch": 4.657718120805369, "grad_norm": 0.72265625, "learning_rate": 3.525888029704599e-06, "loss": 0.1972, "step": 11104 }, { "epoch": 4.659395973154362, "grad_norm": 0.65234375, "learning_rate": 3.4915396300414567e-06, "loss": 0.1742, "step": 11108 }, { "epoch": 4.6610738255033555, "grad_norm": 0.59765625, "learning_rate": 3.4573573872767547e-06, "loss": 0.2311, "step": 11112 }, { "epoch": 4.662751677852349, "grad_norm": 0.578125, "learning_rate": 3.423341340176933e-06, "loss": 0.2288, "step": 11116 }, { "epoch": 4.6644295302013425, "grad_norm": 0.65625, "learning_rate": 3.3894915273199987e-06, "loss": 0.3241, "step": 11120 }, { "epoch": 4.666107382550336, "grad_norm": 0.68359375, "learning_rate": 3.3558079870954267e-06, "loss": 0.2839, "step": 11124 }, { "epoch": 4.667785234899329, "grad_norm": 0.431640625, "learning_rate": 3.322290757704044e-06, "loss": 0.2478, "step": 11128 }, { "epoch": 4.669463087248322, "grad_norm": 0.4453125, "learning_rate": 3.288939877158159e-06, "loss": 0.2621, "step": 11132 }, { "epoch": 4.671140939597316, "grad_norm": 0.625, "learning_rate": 3.2557553832813664e-06, "loss": 0.1564, "step": 11136 }, { "epoch": 4.672818791946309, "grad_norm": 0.640625, "learning_rate": 3.2227373137085954e-06, "loss": 0.14, "step": 11140 }, { "epoch": 4.674496644295302, "grad_norm": 0.59765625, "learning_rate": 3.1898857058859917e-06, "loss": 0.1812, "step": 11144 }, { "epoch": 4.676174496644295, "grad_norm": 0.48828125, "learning_rate": 3.1572005970709356e-06, "loss": 0.176, "step": 11148 }, { "epoch": 4.677852348993289, "grad_norm": 0.6328125, "learning_rate": 3.1246820243319924e-06, "loss": 0.316, "step": 11152 }, { "epoch": 4.679530201342282, "grad_norm": 0.47265625, "learning_rate": 3.092330024548828e-06, "loss": 0.1113, "step": 11156 }, { "epoch": 4.681208053691275, "grad_norm": 0.41796875, "learning_rate": 3.0601446344122095e-06, "loss": 0.1785, "step": 11160 }, { "epoch": 4.682885906040268, "grad_norm": 0.70703125, "learning_rate": 3.0281258904239713e-06, "loss": 0.2482, "step": 11164 }, { "epoch": 4.684563758389261, "grad_norm": 0.41796875, "learning_rate": 2.9962738288969003e-06, "loss": 0.1532, "step": 11168 }, { "epoch": 4.686241610738255, "grad_norm": 0.390625, "learning_rate": 2.9645884859547997e-06, "loss": 0.2107, "step": 11172 }, { "epoch": 4.6879194630872485, "grad_norm": 0.55078125, "learning_rate": 2.9330698975323753e-06, "loss": 0.1704, "step": 11176 }, { "epoch": 4.689597315436242, "grad_norm": 0.7890625, "learning_rate": 2.9017180993752e-06, "loss": 0.2408, "step": 11180 }, { "epoch": 4.691275167785235, "grad_norm": 0.5078125, "learning_rate": 2.8705331270397312e-06, "loss": 0.115, "step": 11184 }, { "epoch": 4.692953020134228, "grad_norm": 0.484375, "learning_rate": 2.8395150158931624e-06, "loss": 0.1994, "step": 11188 }, { "epoch": 4.694630872483222, "grad_norm": 0.56640625, "learning_rate": 2.808663801113503e-06, "loss": 0.1701, "step": 11192 }, { "epoch": 4.696308724832215, "grad_norm": 0.671875, "learning_rate": 2.777979517689466e-06, "loss": 0.2325, "step": 11196 }, { "epoch": 4.697986577181208, "grad_norm": 0.70703125, "learning_rate": 2.7474622004204304e-06, "loss": 0.3648, "step": 11200 }, { "epoch": 4.699664429530201, "grad_norm": 0.6328125, "learning_rate": 2.717111883916445e-06, "loss": 0.2266, "step": 11204 }, { "epoch": 4.701342281879195, "grad_norm": 0.37890625, "learning_rate": 2.6869286025981427e-06, "loss": 0.1526, "step": 11208 }, { "epoch": 4.703020134228188, "grad_norm": 0.42578125, "learning_rate": 2.656912390696708e-06, "loss": 0.2569, "step": 11212 }, { "epoch": 4.704697986577181, "grad_norm": 0.462890625, "learning_rate": 2.6270632822538607e-06, "loss": 0.2189, "step": 11216 }, { "epoch": 4.706375838926174, "grad_norm": 0.423828125, "learning_rate": 2.5973813111218546e-06, "loss": 0.138, "step": 11220 }, { "epoch": 4.708053691275168, "grad_norm": 0.609375, "learning_rate": 2.567866510963329e-06, "loss": 0.2691, "step": 11224 }, { "epoch": 4.709731543624161, "grad_norm": 0.59765625, "learning_rate": 2.5385189152513253e-06, "loss": 0.2942, "step": 11228 }, { "epoch": 4.7114093959731544, "grad_norm": 0.494140625, "learning_rate": 2.5093385572693025e-06, "loss": 0.1724, "step": 11232 }, { "epoch": 4.7130872483221475, "grad_norm": 0.5, "learning_rate": 2.4803254701110385e-06, "loss": 0.1315, "step": 11236 }, { "epoch": 4.714765100671141, "grad_norm": 0.609375, "learning_rate": 2.4514796866805964e-06, "loss": 0.2545, "step": 11240 }, { "epoch": 4.716442953020135, "grad_norm": 0.5703125, "learning_rate": 2.422801239692307e-06, "loss": 0.2011, "step": 11244 }, { "epoch": 4.718120805369128, "grad_norm": 0.546875, "learning_rate": 2.3942901616707365e-06, "loss": 0.1545, "step": 11248 }, { "epoch": 4.719798657718121, "grad_norm": 0.6875, "learning_rate": 2.365946484950587e-06, "loss": 0.1874, "step": 11252 }, { "epoch": 4.721476510067114, "grad_norm": 0.76171875, "learning_rate": 2.3377702416767618e-06, "loss": 0.2113, "step": 11256 }, { "epoch": 4.723154362416107, "grad_norm": 0.462890625, "learning_rate": 2.3097614638042493e-06, "loss": 0.1826, "step": 11260 }, { "epoch": 4.724832214765101, "grad_norm": 0.57421875, "learning_rate": 2.2819201830981404e-06, "loss": 0.2048, "step": 11264 }, { "epoch": 4.726510067114094, "grad_norm": 0.71484375, "learning_rate": 2.2542464311335105e-06, "loss": 0.1724, "step": 11268 }, { "epoch": 4.728187919463087, "grad_norm": 0.5, "learning_rate": 2.2267402392955215e-06, "loss": 0.2087, "step": 11272 }, { "epoch": 4.72986577181208, "grad_norm": 0.357421875, "learning_rate": 2.1994016387792024e-06, "loss": 0.1886, "step": 11276 }, { "epoch": 4.731543624161074, "grad_norm": 0.458984375, "learning_rate": 2.1722306605896022e-06, "loss": 0.1647, "step": 11280 }, { "epoch": 4.733221476510067, "grad_norm": 0.5625, "learning_rate": 2.1452273355416216e-06, "loss": 0.1711, "step": 11284 }, { "epoch": 4.73489932885906, "grad_norm": 0.388671875, "learning_rate": 2.1183916942600464e-06, "loss": 0.2413, "step": 11288 }, { "epoch": 4.7365771812080535, "grad_norm": 0.6015625, "learning_rate": 2.0917237671794983e-06, "loss": 0.2341, "step": 11292 }, { "epoch": 4.7382550335570475, "grad_norm": 0.60546875, "learning_rate": 2.0652235845443175e-06, "loss": 0.2874, "step": 11296 }, { "epoch": 4.739932885906041, "grad_norm": 0.486328125, "learning_rate": 2.0388911764086966e-06, "loss": 0.1731, "step": 11300 }, { "epoch": 4.741610738255034, "grad_norm": 0.40234375, "learning_rate": 2.012726572636514e-06, "loss": 0.2236, "step": 11304 }, { "epoch": 4.743288590604027, "grad_norm": 0.625, "learning_rate": 1.986729802901349e-06, "loss": 0.2452, "step": 11308 }, { "epoch": 4.74496644295302, "grad_norm": 0.609375, "learning_rate": 1.9609008966864347e-06, "loss": 0.3026, "step": 11312 }, { "epoch": 4.746644295302014, "grad_norm": 0.54296875, "learning_rate": 1.935239883284606e-06, "loss": 0.1799, "step": 11316 }, { "epoch": 4.748322147651007, "grad_norm": 0.49609375, "learning_rate": 1.909746791798317e-06, "loss": 0.2327, "step": 11320 }, { "epoch": 4.75, "grad_norm": 0.37109375, "learning_rate": 1.8844216511395726e-06, "loss": 0.1774, "step": 11324 }, { "epoch": 4.751677852348993, "grad_norm": 0.703125, "learning_rate": 1.8592644900298826e-06, "loss": 0.1785, "step": 11328 }, { "epoch": 4.753355704697986, "grad_norm": 0.7109375, "learning_rate": 1.8342753370003072e-06, "loss": 0.2302, "step": 11332 }, { "epoch": 4.75503355704698, "grad_norm": 0.5390625, "learning_rate": 1.8094542203912932e-06, "loss": 0.2271, "step": 11336 }, { "epoch": 4.756711409395973, "grad_norm": 0.53515625, "learning_rate": 1.7848011683527562e-06, "loss": 0.2242, "step": 11340 }, { "epoch": 4.758389261744966, "grad_norm": 0.578125, "learning_rate": 1.7603162088440148e-06, "loss": 0.164, "step": 11344 }, { "epoch": 4.760067114093959, "grad_norm": 0.5, "learning_rate": 1.735999369633706e-06, "loss": 0.1973, "step": 11348 }, { "epoch": 4.7617449664429525, "grad_norm": 0.53515625, "learning_rate": 1.7118506782998699e-06, "loss": 0.2446, "step": 11352 }, { "epoch": 4.7634228187919465, "grad_norm": 0.54296875, "learning_rate": 1.687870162229782e-06, "loss": 0.2549, "step": 11356 }, { "epoch": 4.76510067114094, "grad_norm": 0.515625, "learning_rate": 1.6640578486200373e-06, "loss": 0.2169, "step": 11360 }, { "epoch": 4.766778523489933, "grad_norm": 0.60546875, "learning_rate": 1.6404137644764338e-06, "loss": 0.2376, "step": 11364 }, { "epoch": 4.768456375838926, "grad_norm": 0.609375, "learning_rate": 1.6169379366139878e-06, "loss": 0.2062, "step": 11368 }, { "epoch": 4.77013422818792, "grad_norm": 0.451171875, "learning_rate": 1.5936303916569193e-06, "loss": 0.1564, "step": 11372 }, { "epoch": 4.771812080536913, "grad_norm": 0.49609375, "learning_rate": 1.5704911560385668e-06, "loss": 0.314, "step": 11376 }, { "epoch": 4.773489932885906, "grad_norm": 0.65625, "learning_rate": 1.5475202560014054e-06, "loss": 0.2855, "step": 11380 }, { "epoch": 4.775167785234899, "grad_norm": 0.56640625, "learning_rate": 1.5247177175969793e-06, "loss": 0.1574, "step": 11384 }, { "epoch": 4.776845637583893, "grad_norm": 0.5078125, "learning_rate": 1.5020835666859354e-06, "loss": 0.1317, "step": 11388 }, { "epoch": 4.778523489932886, "grad_norm": 0.376953125, "learning_rate": 1.4796178289378735e-06, "loss": 0.167, "step": 11392 }, { "epoch": 4.780201342281879, "grad_norm": 0.484375, "learning_rate": 1.457320529831496e-06, "loss": 0.2433, "step": 11396 }, { "epoch": 4.781879194630872, "grad_norm": 0.419921875, "learning_rate": 1.435191694654375e-06, "loss": 0.1606, "step": 11400 }, { "epoch": 4.783557046979865, "grad_norm": 0.56640625, "learning_rate": 1.4132313485030856e-06, "loss": 0.127, "step": 11404 }, { "epoch": 4.785234899328859, "grad_norm": 0.796875, "learning_rate": 1.391439516283105e-06, "loss": 0.2726, "step": 11408 }, { "epoch": 4.7869127516778525, "grad_norm": 0.50390625, "learning_rate": 1.3698162227087971e-06, "loss": 0.1361, "step": 11412 }, { "epoch": 4.7885906040268456, "grad_norm": 0.5703125, "learning_rate": 1.3483614923033792e-06, "loss": 0.1994, "step": 11416 }, { "epoch": 4.790268456375839, "grad_norm": 0.70703125, "learning_rate": 1.3270753493989373e-06, "loss": 0.3201, "step": 11420 }, { "epoch": 4.791946308724832, "grad_norm": 0.59375, "learning_rate": 1.3059578181362607e-06, "loss": 0.215, "step": 11424 }, { "epoch": 4.793624161073826, "grad_norm": 0.7109375, "learning_rate": 1.2850089224650418e-06, "loss": 0.237, "step": 11428 }, { "epoch": 4.795302013422819, "grad_norm": 0.51953125, "learning_rate": 1.2642286861436256e-06, "loss": 0.3066, "step": 11432 }, { "epoch": 4.796979865771812, "grad_norm": 0.5625, "learning_rate": 1.243617132739111e-06, "loss": 0.2064, "step": 11436 }, { "epoch": 4.798657718120805, "grad_norm": 0.62890625, "learning_rate": 1.2231742856273151e-06, "loss": 0.3162, "step": 11440 }, { "epoch": 4.800335570469799, "grad_norm": 0.62890625, "learning_rate": 1.202900167992693e-06, "loss": 0.3013, "step": 11444 }, { "epoch": 4.802013422818792, "grad_norm": 0.5390625, "learning_rate": 1.1827948028283352e-06, "loss": 0.2074, "step": 11448 }, { "epoch": 4.803691275167785, "grad_norm": 0.55078125, "learning_rate": 1.1628582129359686e-06, "loss": 0.1607, "step": 11452 }, { "epoch": 4.805369127516778, "grad_norm": 0.40625, "learning_rate": 1.143090420925924e-06, "loss": 0.1383, "step": 11456 }, { "epoch": 4.807046979865772, "grad_norm": 0.55859375, "learning_rate": 1.1234914492170678e-06, "loss": 0.2278, "step": 11460 }, { "epoch": 4.808724832214765, "grad_norm": 0.5, "learning_rate": 1.1040613200368032e-06, "loss": 0.2236, "step": 11464 }, { "epoch": 4.810402684563758, "grad_norm": 0.6171875, "learning_rate": 1.08480005542107e-06, "loss": 0.2584, "step": 11468 }, { "epoch": 4.8120805369127515, "grad_norm": 0.482421875, "learning_rate": 1.0657076772142782e-06, "loss": 0.1715, "step": 11472 }, { "epoch": 4.813758389261745, "grad_norm": 0.55859375, "learning_rate": 1.0467842070693234e-06, "loss": 0.1767, "step": 11476 }, { "epoch": 4.815436241610739, "grad_norm": 0.54296875, "learning_rate": 1.0280296664475218e-06, "loss": 0.185, "step": 11480 }, { "epoch": 4.817114093959732, "grad_norm": 0.515625, "learning_rate": 1.0094440766185929e-06, "loss": 0.2105, "step": 11484 }, { "epoch": 4.818791946308725, "grad_norm": 0.443359375, "learning_rate": 9.910274586606925e-07, "loss": 0.0977, "step": 11488 }, { "epoch": 4.820469798657718, "grad_norm": 0.70703125, "learning_rate": 9.727798334602799e-07, "loss": 0.1845, "step": 11492 }, { "epoch": 4.822147651006711, "grad_norm": 0.55078125, "learning_rate": 9.54701221712234e-07, "loss": 0.2232, "step": 11496 }, { "epoch": 4.823825503355705, "grad_norm": 0.46875, "learning_rate": 9.367916439196709e-07, "loss": 0.2361, "step": 11500 } ], "logging_steps": 4, "max_steps": 11920, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.8270995965437542e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }