{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 11575, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.8661230868133192, "learning_rate": 1.7271157167530226e-08, "loss": 0.1037, "step": 1 }, { "epoch": 0.0, "grad_norm": 1.1661544306907603, "learning_rate": 8.635578583765113e-08, "loss": 0.0948, "step": 5 }, { "epoch": 0.0, "grad_norm": 0.7537109348930305, "learning_rate": 1.7271157167530226e-07, "loss": 0.1015, "step": 10 }, { "epoch": 0.01, "grad_norm": 1.2061731506590818, "learning_rate": 2.5906735751295336e-07, "loss": 0.1073, "step": 15 }, { "epoch": 0.01, "grad_norm": 1.0211147540146708, "learning_rate": 3.454231433506045e-07, "loss": 0.0973, "step": 20 }, { "epoch": 0.01, "grad_norm": 1.1617195446495134, "learning_rate": 4.317789291882556e-07, "loss": 0.1121, "step": 25 }, { "epoch": 0.01, "grad_norm": 0.8211862670686246, "learning_rate": 5.181347150259067e-07, "loss": 0.1026, "step": 30 }, { "epoch": 0.02, "grad_norm": 0.9348381607194174, "learning_rate": 6.04490500863558e-07, "loss": 0.1192, "step": 35 }, { "epoch": 0.02, "grad_norm": 0.490417880543222, "learning_rate": 6.90846286701209e-07, "loss": 0.094, "step": 40 }, { "epoch": 0.02, "grad_norm": 0.8388913689714425, "learning_rate": 7.772020725388602e-07, "loss": 0.0977, "step": 45 }, { "epoch": 0.02, "grad_norm": 0.7004297572965578, "learning_rate": 8.635578583765112e-07, "loss": 0.1032, "step": 50 }, { "epoch": 0.02, "grad_norm": 0.6059616583139867, "learning_rate": 9.499136442141624e-07, "loss": 0.0749, "step": 55 }, { "epoch": 0.03, "grad_norm": 0.9955847374719834, "learning_rate": 1.0362694300518134e-06, "loss": 0.1005, "step": 60 }, { "epoch": 0.03, "grad_norm": 0.8320891857428879, "learning_rate": 1.1226252158894648e-06, "loss": 0.0853, "step": 65 }, { "epoch": 0.03, "grad_norm": 0.7954532245692706, "learning_rate": 1.208981001727116e-06, "loss": 0.0879, "step": 70 }, { "epoch": 0.03, "grad_norm": 0.8813868206948325, "learning_rate": 1.2953367875647669e-06, "loss": 0.0902, "step": 75 }, { "epoch": 0.03, "grad_norm": 1.0058746202593967, "learning_rate": 1.381692573402418e-06, "loss": 0.1145, "step": 80 }, { "epoch": 0.04, "grad_norm": 0.7597389247113029, "learning_rate": 1.4680483592400694e-06, "loss": 0.1021, "step": 85 }, { "epoch": 0.04, "grad_norm": 0.46103661687167447, "learning_rate": 1.5544041450777204e-06, "loss": 0.0806, "step": 90 }, { "epoch": 0.04, "grad_norm": 0.737893040063868, "learning_rate": 1.6407599309153715e-06, "loss": 0.0875, "step": 95 }, { "epoch": 0.04, "grad_norm": 0.8310674040774385, "learning_rate": 1.7271157167530224e-06, "loss": 0.0832, "step": 100 }, { "epoch": 0.05, "grad_norm": 0.984904820117129, "learning_rate": 1.8134715025906738e-06, "loss": 0.1034, "step": 105 }, { "epoch": 0.05, "grad_norm": 1.035690398188729, "learning_rate": 1.8998272884283248e-06, "loss": 0.0915, "step": 110 }, { "epoch": 0.05, "grad_norm": 0.8084351137915218, "learning_rate": 1.986183074265976e-06, "loss": 0.0955, "step": 115 }, { "epoch": 0.05, "grad_norm": 0.7974766116289803, "learning_rate": 2.072538860103627e-06, "loss": 0.0906, "step": 120 }, { "epoch": 0.05, "grad_norm": 0.6030316166998918, "learning_rate": 2.1588946459412782e-06, "loss": 0.1054, "step": 125 }, { "epoch": 0.06, "grad_norm": 0.8476703419068902, "learning_rate": 2.2452504317789296e-06, "loss": 0.0875, "step": 130 }, { "epoch": 0.06, "grad_norm": 0.7435355944089723, "learning_rate": 2.3316062176165805e-06, "loss": 0.0951, "step": 135 }, { "epoch": 0.06, "grad_norm": 1.019975713303781, "learning_rate": 2.417962003454232e-06, "loss": 0.0918, "step": 140 }, { "epoch": 0.06, "grad_norm": 0.8408723502054206, "learning_rate": 2.504317789291883e-06, "loss": 0.0984, "step": 145 }, { "epoch": 0.06, "grad_norm": 0.787665840797315, "learning_rate": 2.5906735751295338e-06, "loss": 0.0905, "step": 150 }, { "epoch": 0.07, "grad_norm": 0.7319427568278293, "learning_rate": 2.6770293609671847e-06, "loss": 0.0802, "step": 155 }, { "epoch": 0.07, "grad_norm": 0.7710169679916523, "learning_rate": 2.763385146804836e-06, "loss": 0.091, "step": 160 }, { "epoch": 0.07, "grad_norm": 0.7154619591661221, "learning_rate": 2.8497409326424875e-06, "loss": 0.0821, "step": 165 }, { "epoch": 0.07, "grad_norm": 0.939909195416624, "learning_rate": 2.936096718480139e-06, "loss": 0.1003, "step": 170 }, { "epoch": 0.08, "grad_norm": 0.7738010454826961, "learning_rate": 3.0224525043177893e-06, "loss": 0.0975, "step": 175 }, { "epoch": 0.08, "grad_norm": 0.86507670980491, "learning_rate": 3.1088082901554407e-06, "loss": 0.0989, "step": 180 }, { "epoch": 0.08, "grad_norm": 0.6050718382645259, "learning_rate": 3.195164075993092e-06, "loss": 0.0843, "step": 185 }, { "epoch": 0.08, "grad_norm": 0.8827419506686391, "learning_rate": 3.281519861830743e-06, "loss": 0.0956, "step": 190 }, { "epoch": 0.08, "grad_norm": 0.890758444313074, "learning_rate": 3.367875647668394e-06, "loss": 0.0937, "step": 195 }, { "epoch": 0.09, "grad_norm": 1.0254768465690436, "learning_rate": 3.454231433506045e-06, "loss": 0.088, "step": 200 }, { "epoch": 0.09, "grad_norm": 1.1096548052357276, "learning_rate": 3.5405872193436963e-06, "loss": 0.1031, "step": 205 }, { "epoch": 0.09, "grad_norm": 1.0635828264297262, "learning_rate": 3.6269430051813476e-06, "loss": 0.0995, "step": 210 }, { "epoch": 0.09, "grad_norm": 0.7459111481743179, "learning_rate": 3.713298791018999e-06, "loss": 0.073, "step": 215 }, { "epoch": 0.1, "grad_norm": 0.703283436759907, "learning_rate": 3.7996545768566495e-06, "loss": 0.074, "step": 220 }, { "epoch": 0.1, "grad_norm": 1.0566826153490951, "learning_rate": 3.886010362694301e-06, "loss": 0.0936, "step": 225 }, { "epoch": 0.1, "grad_norm": 1.0016492237862977, "learning_rate": 3.972366148531952e-06, "loss": 0.1009, "step": 230 }, { "epoch": 0.1, "grad_norm": 0.5128173139360626, "learning_rate": 4.058721934369604e-06, "loss": 0.0904, "step": 235 }, { "epoch": 0.1, "grad_norm": 0.856968479324872, "learning_rate": 4.145077720207254e-06, "loss": 0.088, "step": 240 }, { "epoch": 0.11, "grad_norm": 0.7601294161989428, "learning_rate": 4.2314335060449055e-06, "loss": 0.0895, "step": 245 }, { "epoch": 0.11, "grad_norm": 1.0746309115244876, "learning_rate": 4.3177892918825564e-06, "loss": 0.0831, "step": 250 }, { "epoch": 0.11, "grad_norm": 0.7906243315776752, "learning_rate": 4.404145077720207e-06, "loss": 0.0737, "step": 255 }, { "epoch": 0.11, "grad_norm": 0.679401480283055, "learning_rate": 4.490500863557859e-06, "loss": 0.0848, "step": 260 }, { "epoch": 0.11, "grad_norm": 0.374569808090678, "learning_rate": 4.576856649395509e-06, "loss": 0.0812, "step": 265 }, { "epoch": 0.12, "grad_norm": 0.905921167975438, "learning_rate": 4.663212435233161e-06, "loss": 0.1048, "step": 270 }, { "epoch": 0.12, "grad_norm": 0.9148670093529526, "learning_rate": 4.749568221070812e-06, "loss": 0.1088, "step": 275 }, { "epoch": 0.12, "grad_norm": 0.6860981856697029, "learning_rate": 4.835924006908464e-06, "loss": 0.0953, "step": 280 }, { "epoch": 0.12, "grad_norm": 0.7141327749409578, "learning_rate": 4.922279792746114e-06, "loss": 0.093, "step": 285 }, { "epoch": 0.13, "grad_norm": 0.7857764564175557, "learning_rate": 5.008635578583766e-06, "loss": 0.0865, "step": 290 }, { "epoch": 0.13, "grad_norm": 0.4587481775006207, "learning_rate": 5.094991364421417e-06, "loss": 0.0715, "step": 295 }, { "epoch": 0.13, "grad_norm": 0.6672088038539078, "learning_rate": 5.1813471502590676e-06, "loss": 0.0766, "step": 300 }, { "epoch": 0.13, "grad_norm": 0.7695334407032586, "learning_rate": 5.267702936096719e-06, "loss": 0.0855, "step": 305 }, { "epoch": 0.13, "grad_norm": 0.5016272306147599, "learning_rate": 5.3540587219343694e-06, "loss": 0.0777, "step": 310 }, { "epoch": 0.14, "grad_norm": 0.49424188342660924, "learning_rate": 5.440414507772021e-06, "loss": 0.0878, "step": 315 }, { "epoch": 0.14, "grad_norm": 0.2989946292550959, "learning_rate": 5.526770293609672e-06, "loss": 0.0811, "step": 320 }, { "epoch": 0.14, "grad_norm": 0.9640494211712551, "learning_rate": 5.613126079447323e-06, "loss": 0.0893, "step": 325 }, { "epoch": 0.14, "grad_norm": 1.449054653018769, "learning_rate": 5.699481865284975e-06, "loss": 0.1132, "step": 330 }, { "epoch": 0.14, "grad_norm": 0.8085449725247954, "learning_rate": 5.785837651122626e-06, "loss": 0.0845, "step": 335 }, { "epoch": 0.15, "grad_norm": 0.7225319435995665, "learning_rate": 5.872193436960278e-06, "loss": 0.0904, "step": 340 }, { "epoch": 0.15, "grad_norm": 0.2928666812431443, "learning_rate": 5.958549222797928e-06, "loss": 0.0655, "step": 345 }, { "epoch": 0.15, "grad_norm": 1.5315887810709465, "learning_rate": 6.044905008635579e-06, "loss": 0.1082, "step": 350 }, { "epoch": 0.15, "grad_norm": 0.7560372608550086, "learning_rate": 6.1312607944732305e-06, "loss": 0.0732, "step": 355 }, { "epoch": 0.16, "grad_norm": 0.8551844011661183, "learning_rate": 6.217616580310881e-06, "loss": 0.0957, "step": 360 }, { "epoch": 0.16, "grad_norm": 0.5218822357688347, "learning_rate": 6.3039723661485315e-06, "loss": 0.077, "step": 365 }, { "epoch": 0.16, "grad_norm": 0.8145206500610358, "learning_rate": 6.390328151986184e-06, "loss": 0.0945, "step": 370 }, { "epoch": 0.16, "grad_norm": 0.8232997561718624, "learning_rate": 6.476683937823834e-06, "loss": 0.0858, "step": 375 }, { "epoch": 0.16, "grad_norm": 0.6907309189014695, "learning_rate": 6.563039723661486e-06, "loss": 0.0886, "step": 380 }, { "epoch": 0.17, "grad_norm": 0.6155827336032126, "learning_rate": 6.649395509499137e-06, "loss": 0.0886, "step": 385 }, { "epoch": 0.17, "grad_norm": 2.121964258128152, "learning_rate": 6.735751295336788e-06, "loss": 0.0964, "step": 390 }, { "epoch": 0.17, "grad_norm": 0.42147856280146334, "learning_rate": 6.82210708117444e-06, "loss": 0.0773, "step": 395 }, { "epoch": 0.17, "grad_norm": 0.7617970675602044, "learning_rate": 6.90846286701209e-06, "loss": 0.088, "step": 400 }, { "epoch": 0.17, "grad_norm": 0.4880498398490212, "learning_rate": 6.994818652849742e-06, "loss": 0.0722, "step": 405 }, { "epoch": 0.18, "grad_norm": 0.5379171086423975, "learning_rate": 7.0811744386873925e-06, "loss": 0.0939, "step": 410 }, { "epoch": 0.18, "grad_norm": 0.8777252382337768, "learning_rate": 7.1675302245250435e-06, "loss": 0.1001, "step": 415 }, { "epoch": 0.18, "grad_norm": 0.8711901075408542, "learning_rate": 7.253886010362695e-06, "loss": 0.0829, "step": 420 }, { "epoch": 0.18, "grad_norm": 0.725690370014714, "learning_rate": 7.340241796200346e-06, "loss": 0.0874, "step": 425 }, { "epoch": 0.19, "grad_norm": 0.8773286754142784, "learning_rate": 7.426597582037998e-06, "loss": 0.0955, "step": 430 }, { "epoch": 0.19, "grad_norm": 0.636866710021352, "learning_rate": 7.512953367875648e-06, "loss": 0.0983, "step": 435 }, { "epoch": 0.19, "grad_norm": 0.6827849545920833, "learning_rate": 7.599309153713299e-06, "loss": 0.0848, "step": 440 }, { "epoch": 0.19, "grad_norm": 0.7079397816443027, "learning_rate": 7.68566493955095e-06, "loss": 0.0965, "step": 445 }, { "epoch": 0.19, "grad_norm": 0.7181170354131496, "learning_rate": 7.772020725388602e-06, "loss": 0.0752, "step": 450 }, { "epoch": 0.2, "grad_norm": 0.322597394817283, "learning_rate": 7.858376511226253e-06, "loss": 0.072, "step": 455 }, { "epoch": 0.2, "grad_norm": 0.5792459466721516, "learning_rate": 7.944732297063904e-06, "loss": 0.0821, "step": 460 }, { "epoch": 0.2, "grad_norm": 0.8786329648976334, "learning_rate": 8.031088082901555e-06, "loss": 0.0988, "step": 465 }, { "epoch": 0.2, "grad_norm": 0.5945920943627181, "learning_rate": 8.117443868739207e-06, "loss": 0.0973, "step": 470 }, { "epoch": 0.21, "grad_norm": 0.8347979512880902, "learning_rate": 8.203799654576856e-06, "loss": 0.101, "step": 475 }, { "epoch": 0.21, "grad_norm": 0.6469239378245417, "learning_rate": 8.290155440414507e-06, "loss": 0.087, "step": 480 }, { "epoch": 0.21, "grad_norm": 0.890031998434686, "learning_rate": 8.37651122625216e-06, "loss": 0.0987, "step": 485 }, { "epoch": 0.21, "grad_norm": 0.8556583399604323, "learning_rate": 8.462867012089811e-06, "loss": 0.0763, "step": 490 }, { "epoch": 0.21, "grad_norm": 0.5748423587848386, "learning_rate": 8.549222797927462e-06, "loss": 0.0939, "step": 495 }, { "epoch": 0.22, "grad_norm": 0.592030133056585, "learning_rate": 8.635578583765113e-06, "loss": 0.0776, "step": 500 }, { "epoch": 0.22, "grad_norm": 0.8496557430119451, "learning_rate": 8.721934369602764e-06, "loss": 0.0834, "step": 505 }, { "epoch": 0.22, "grad_norm": 0.7433725646125416, "learning_rate": 8.808290155440415e-06, "loss": 0.086, "step": 510 }, { "epoch": 0.22, "grad_norm": 0.8962892707404773, "learning_rate": 8.894645941278066e-06, "loss": 0.091, "step": 515 }, { "epoch": 0.22, "grad_norm": 0.7094137576645999, "learning_rate": 8.981001727115718e-06, "loss": 0.0873, "step": 520 }, { "epoch": 0.23, "grad_norm": 1.004889054863807, "learning_rate": 9.06735751295337e-06, "loss": 0.1075, "step": 525 }, { "epoch": 0.23, "grad_norm": 1.0825605461343522, "learning_rate": 9.153713298791019e-06, "loss": 0.0917, "step": 530 }, { "epoch": 0.23, "grad_norm": 0.6138783402804222, "learning_rate": 9.240069084628671e-06, "loss": 0.0886, "step": 535 }, { "epoch": 0.23, "grad_norm": 0.7381864789865002, "learning_rate": 9.326424870466322e-06, "loss": 0.0829, "step": 540 }, { "epoch": 0.24, "grad_norm": 0.6258130636911013, "learning_rate": 9.412780656303973e-06, "loss": 0.0851, "step": 545 }, { "epoch": 0.24, "grad_norm": 0.4788323221052753, "learning_rate": 9.499136442141624e-06, "loss": 0.0804, "step": 550 }, { "epoch": 0.24, "grad_norm": 1.108997196870818, "learning_rate": 9.585492227979275e-06, "loss": 0.0875, "step": 555 }, { "epoch": 0.24, "grad_norm": 0.5979786366623295, "learning_rate": 9.671848013816928e-06, "loss": 0.0784, "step": 560 }, { "epoch": 0.24, "grad_norm": 0.9964613100797841, "learning_rate": 9.758203799654577e-06, "loss": 0.1102, "step": 565 }, { "epoch": 0.25, "grad_norm": 0.6033999040613355, "learning_rate": 9.844559585492228e-06, "loss": 0.0763, "step": 570 }, { "epoch": 0.25, "grad_norm": 0.8903085892847482, "learning_rate": 9.93091537132988e-06, "loss": 0.1059, "step": 575 }, { "epoch": 0.25, "grad_norm": 1.4824096488930525, "learning_rate": 1.0017271157167531e-05, "loss": 0.0894, "step": 580 }, { "epoch": 0.25, "grad_norm": 1.2433951606130729, "learning_rate": 1.0103626943005182e-05, "loss": 0.1024, "step": 585 }, { "epoch": 0.25, "grad_norm": 0.6778057289429442, "learning_rate": 1.0189982728842833e-05, "loss": 0.1009, "step": 590 }, { "epoch": 0.26, "grad_norm": 448.5872376035829, "learning_rate": 1.0276338514680484e-05, "loss": 0.136, "step": 595 }, { "epoch": 0.26, "grad_norm": 8.21806945206516, "learning_rate": 1.0362694300518135e-05, "loss": 0.2339, "step": 600 }, { "epoch": 0.26, "grad_norm": 32.423915481989496, "learning_rate": 1.0449050086355788e-05, "loss": 0.1776, "step": 605 }, { "epoch": 0.26, "grad_norm": 4.773716201573569, "learning_rate": 1.0535405872193439e-05, "loss": 0.1198, "step": 610 }, { "epoch": 0.27, "grad_norm": 2.2737662019398757, "learning_rate": 1.062176165803109e-05, "loss": 0.1325, "step": 615 }, { "epoch": 0.27, "grad_norm": 2.7445938115491577, "learning_rate": 1.0708117443868739e-05, "loss": 0.1243, "step": 620 }, { "epoch": 0.27, "grad_norm": 4.8485979268084956, "learning_rate": 1.079447322970639e-05, "loss": 0.1309, "step": 625 }, { "epoch": 0.27, "grad_norm": 98.21615534763852, "learning_rate": 1.0880829015544042e-05, "loss": 0.6446, "step": 630 }, { "epoch": 0.27, "grad_norm": 20.369120279227072, "learning_rate": 1.0967184801381693e-05, "loss": 0.4051, "step": 635 }, { "epoch": 0.28, "grad_norm": 9.255717401202803, "learning_rate": 1.1053540587219344e-05, "loss": 0.2551, "step": 640 }, { "epoch": 0.28, "grad_norm": 4.584163246732508, "learning_rate": 1.1139896373056995e-05, "loss": 0.1466, "step": 645 }, { "epoch": 0.28, "grad_norm": 2.6563813074260962, "learning_rate": 1.1226252158894646e-05, "loss": 0.1524, "step": 650 }, { "epoch": 0.28, "grad_norm": 11.743230448649134, "learning_rate": 1.1312607944732299e-05, "loss": 0.129, "step": 655 }, { "epoch": 0.29, "grad_norm": 2.210415579371572, "learning_rate": 1.139896373056995e-05, "loss": 0.1433, "step": 660 }, { "epoch": 0.29, "grad_norm": 2.9933290541845463, "learning_rate": 1.14853195164076e-05, "loss": 0.133, "step": 665 }, { "epoch": 0.29, "grad_norm": 3.9237889199038127, "learning_rate": 1.1571675302245252e-05, "loss": 0.1038, "step": 670 }, { "epoch": 0.29, "grad_norm": 1.135124310946562, "learning_rate": 1.1658031088082901e-05, "loss": 0.1103, "step": 675 }, { "epoch": 0.29, "grad_norm": 1.4743333927949402, "learning_rate": 1.1744386873920555e-05, "loss": 0.1268, "step": 680 }, { "epoch": 0.3, "grad_norm": 4.532155326678469, "learning_rate": 1.1830742659758205e-05, "loss": 0.1106, "step": 685 }, { "epoch": 0.3, "grad_norm": 0.9554908116369246, "learning_rate": 1.1917098445595855e-05, "loss": 0.0977, "step": 690 }, { "epoch": 0.3, "grad_norm": 1.6864808315479922, "learning_rate": 1.2003454231433506e-05, "loss": 0.103, "step": 695 }, { "epoch": 0.3, "grad_norm": 0.949196719147114, "learning_rate": 1.2089810017271157e-05, "loss": 0.1181, "step": 700 }, { "epoch": 0.3, "grad_norm": 0.7696240182489283, "learning_rate": 1.217616580310881e-05, "loss": 0.1242, "step": 705 }, { "epoch": 0.31, "grad_norm": 0.8567236220701305, "learning_rate": 1.2262521588946461e-05, "loss": 0.096, "step": 710 }, { "epoch": 0.31, "grad_norm": 0.8225245866733908, "learning_rate": 1.2348877374784112e-05, "loss": 0.1088, "step": 715 }, { "epoch": 0.31, "grad_norm": 0.9565738061517076, "learning_rate": 1.2435233160621763e-05, "loss": 0.0966, "step": 720 }, { "epoch": 0.31, "grad_norm": 0.6439159172681707, "learning_rate": 1.2521588946459414e-05, "loss": 0.098, "step": 725 }, { "epoch": 0.32, "grad_norm": 0.7777338342869954, "learning_rate": 1.2607944732297063e-05, "loss": 0.0934, "step": 730 }, { "epoch": 0.32, "grad_norm": 1.4576043339962497, "learning_rate": 1.2694300518134717e-05, "loss": 0.1199, "step": 735 }, { "epoch": 0.32, "grad_norm": 1.0279834902840417, "learning_rate": 1.2780656303972368e-05, "loss": 0.0885, "step": 740 }, { "epoch": 0.32, "grad_norm": 1.6026067629298502, "learning_rate": 1.2867012089810018e-05, "loss": 0.1264, "step": 745 }, { "epoch": 0.32, "grad_norm": 1.4783691316537515, "learning_rate": 1.2953367875647668e-05, "loss": 0.1059, "step": 750 }, { "epoch": 0.33, "grad_norm": 0.9486933852319395, "learning_rate": 1.303972366148532e-05, "loss": 0.0922, "step": 755 }, { "epoch": 0.33, "grad_norm": 1.4499456879597254, "learning_rate": 1.3126079447322972e-05, "loss": 0.1014, "step": 760 }, { "epoch": 0.33, "grad_norm": 0.9941662211122442, "learning_rate": 1.3212435233160623e-05, "loss": 0.1022, "step": 765 }, { "epoch": 0.33, "grad_norm": 0.8036557979399938, "learning_rate": 1.3298791018998274e-05, "loss": 0.0902, "step": 770 }, { "epoch": 0.33, "grad_norm": 0.7748669920716171, "learning_rate": 1.3385146804835925e-05, "loss": 0.0818, "step": 775 }, { "epoch": 0.34, "grad_norm": 1.657169437820155, "learning_rate": 1.3471502590673576e-05, "loss": 0.1055, "step": 780 }, { "epoch": 0.34, "grad_norm": 0.46710916166028693, "learning_rate": 1.3557858376511228e-05, "loss": 0.1182, "step": 785 }, { "epoch": 0.34, "grad_norm": 0.8598455308727582, "learning_rate": 1.364421416234888e-05, "loss": 0.0931, "step": 790 }, { "epoch": 0.34, "grad_norm": 1.3357712764845688, "learning_rate": 1.373056994818653e-05, "loss": 0.0874, "step": 795 }, { "epoch": 0.35, "grad_norm": 1.0229109182232703, "learning_rate": 1.381692573402418e-05, "loss": 0.0952, "step": 800 }, { "epoch": 0.35, "grad_norm": 0.5623898387210554, "learning_rate": 1.390328151986183e-05, "loss": 0.11, "step": 805 }, { "epoch": 0.35, "grad_norm": 0.7245600666438339, "learning_rate": 1.3989637305699483e-05, "loss": 0.1029, "step": 810 }, { "epoch": 0.35, "grad_norm": 1.510481458435366, "learning_rate": 1.4075993091537134e-05, "loss": 0.0986, "step": 815 }, { "epoch": 0.35, "grad_norm": 0.6791060587785951, "learning_rate": 1.4162348877374785e-05, "loss": 0.1105, "step": 820 }, { "epoch": 0.36, "grad_norm": 0.41798381472993135, "learning_rate": 1.4248704663212436e-05, "loss": 0.1066, "step": 825 }, { "epoch": 0.36, "grad_norm": 0.9469611913330616, "learning_rate": 1.4335060449050087e-05, "loss": 0.0805, "step": 830 }, { "epoch": 0.36, "grad_norm": 0.7479499626670872, "learning_rate": 1.442141623488774e-05, "loss": 0.0987, "step": 835 }, { "epoch": 0.36, "grad_norm": 0.7514723994820159, "learning_rate": 1.450777202072539e-05, "loss": 0.1001, "step": 840 }, { "epoch": 0.37, "grad_norm": 0.7402662681471279, "learning_rate": 1.4594127806563041e-05, "loss": 0.0872, "step": 845 }, { "epoch": 0.37, "grad_norm": 0.7009014324991375, "learning_rate": 1.4680483592400692e-05, "loss": 0.0999, "step": 850 }, { "epoch": 0.37, "grad_norm": 0.5733120839978919, "learning_rate": 1.4766839378238342e-05, "loss": 0.0987, "step": 855 }, { "epoch": 0.37, "grad_norm": 0.6928308586582662, "learning_rate": 1.4853195164075996e-05, "loss": 0.0954, "step": 860 }, { "epoch": 0.37, "grad_norm": 1.1809120617319104, "learning_rate": 1.4939550949913645e-05, "loss": 0.0931, "step": 865 }, { "epoch": 0.38, "grad_norm": 0.5924468807561128, "learning_rate": 1.5025906735751296e-05, "loss": 0.079, "step": 870 }, { "epoch": 0.38, "grad_norm": 1.6862371179811677, "learning_rate": 1.5112262521588947e-05, "loss": 0.0963, "step": 875 }, { "epoch": 0.38, "grad_norm": 0.8000620112908129, "learning_rate": 1.5198618307426598e-05, "loss": 0.1036, "step": 880 }, { "epoch": 0.38, "grad_norm": 1.2520177997722077, "learning_rate": 1.5284974093264252e-05, "loss": 0.1247, "step": 885 }, { "epoch": 0.38, "grad_norm": 0.8857701186569507, "learning_rate": 1.53713298791019e-05, "loss": 0.0829, "step": 890 }, { "epoch": 0.39, "grad_norm": 0.9391358915324642, "learning_rate": 1.545768566493955e-05, "loss": 0.1097, "step": 895 }, { "epoch": 0.39, "grad_norm": 0.867842560800044, "learning_rate": 1.5544041450777204e-05, "loss": 0.0946, "step": 900 }, { "epoch": 0.39, "grad_norm": 0.9400349073231179, "learning_rate": 1.5630397236614853e-05, "loss": 0.1056, "step": 905 }, { "epoch": 0.39, "grad_norm": 0.6224643324430222, "learning_rate": 1.5716753022452505e-05, "loss": 0.0884, "step": 910 }, { "epoch": 0.4, "grad_norm": 0.6375382173467924, "learning_rate": 1.5803108808290158e-05, "loss": 0.1019, "step": 915 }, { "epoch": 0.4, "grad_norm": 0.5289546723303301, "learning_rate": 1.5889464594127807e-05, "loss": 0.1053, "step": 920 }, { "epoch": 0.4, "grad_norm": 0.7834683566163679, "learning_rate": 1.597582037996546e-05, "loss": 0.1037, "step": 925 }, { "epoch": 0.4, "grad_norm": 0.9867612062335633, "learning_rate": 1.606217616580311e-05, "loss": 0.1095, "step": 930 }, { "epoch": 0.4, "grad_norm": 0.8209415616389953, "learning_rate": 1.6148531951640762e-05, "loss": 0.0885, "step": 935 }, { "epoch": 0.41, "grad_norm": 1.1682789140133532, "learning_rate": 1.6234887737478414e-05, "loss": 0.0885, "step": 940 }, { "epoch": 0.41, "grad_norm": 0.9859818071339542, "learning_rate": 1.6321243523316064e-05, "loss": 0.0962, "step": 945 }, { "epoch": 0.41, "grad_norm": 0.7829183817929026, "learning_rate": 1.6407599309153713e-05, "loss": 0.0894, "step": 950 }, { "epoch": 0.41, "grad_norm": 0.9752553349973404, "learning_rate": 1.6493955094991366e-05, "loss": 0.0822, "step": 955 }, { "epoch": 0.41, "grad_norm": 0.7139804233290242, "learning_rate": 1.6580310880829015e-05, "loss": 0.1096, "step": 960 }, { "epoch": 0.42, "grad_norm": 0.9303149577410212, "learning_rate": 1.6666666666666667e-05, "loss": 0.0819, "step": 965 }, { "epoch": 0.42, "grad_norm": 1.2058202041327308, "learning_rate": 1.675302245250432e-05, "loss": 0.0982, "step": 970 }, { "epoch": 0.42, "grad_norm": 1.9759790480560875, "learning_rate": 1.683937823834197e-05, "loss": 0.0834, "step": 975 }, { "epoch": 0.42, "grad_norm": 0.9091397007217376, "learning_rate": 1.6925734024179622e-05, "loss": 0.116, "step": 980 }, { "epoch": 0.43, "grad_norm": 0.5255731686872832, "learning_rate": 1.701208981001727e-05, "loss": 0.0884, "step": 985 }, { "epoch": 0.43, "grad_norm": 0.9767889473211209, "learning_rate": 1.7098445595854924e-05, "loss": 0.0974, "step": 990 }, { "epoch": 0.43, "grad_norm": 1.4010325380143416, "learning_rate": 1.7184801381692577e-05, "loss": 0.1152, "step": 995 }, { "epoch": 0.43, "grad_norm": 0.8767740796336314, "learning_rate": 1.7271157167530226e-05, "loss": 0.0906, "step": 1000 }, { "epoch": 0.43, "grad_norm": 1.8985824020069753, "learning_rate": 1.7357512953367875e-05, "loss": 0.1123, "step": 1005 }, { "epoch": 0.44, "grad_norm": 0.7505161160084586, "learning_rate": 1.7443868739205528e-05, "loss": 0.0849, "step": 1010 }, { "epoch": 0.44, "grad_norm": 1.6840014025213934, "learning_rate": 1.753022452504318e-05, "loss": 0.0988, "step": 1015 }, { "epoch": 0.44, "grad_norm": 0.7035053998018641, "learning_rate": 1.761658031088083e-05, "loss": 0.0886, "step": 1020 }, { "epoch": 0.44, "grad_norm": 0.8977571896188331, "learning_rate": 1.7702936096718482e-05, "loss": 0.0838, "step": 1025 }, { "epoch": 0.44, "grad_norm": 0.5137751559858482, "learning_rate": 1.778929188255613e-05, "loss": 0.0803, "step": 1030 }, { "epoch": 0.45, "grad_norm": 0.5990089687696084, "learning_rate": 1.7875647668393784e-05, "loss": 0.0871, "step": 1035 }, { "epoch": 0.45, "grad_norm": 1.046378770763183, "learning_rate": 1.7962003454231437e-05, "loss": 0.0907, "step": 1040 }, { "epoch": 0.45, "grad_norm": 0.4817479325028605, "learning_rate": 1.8048359240069086e-05, "loss": 0.0858, "step": 1045 }, { "epoch": 0.45, "grad_norm": 0.9294482820288561, "learning_rate": 1.813471502590674e-05, "loss": 0.0748, "step": 1050 }, { "epoch": 0.46, "grad_norm": 0.5354498293460184, "learning_rate": 1.8221070811744388e-05, "loss": 0.0939, "step": 1055 }, { "epoch": 0.46, "grad_norm": 0.5784525074446918, "learning_rate": 1.8307426597582037e-05, "loss": 0.0843, "step": 1060 }, { "epoch": 0.46, "grad_norm": 0.62801785518792, "learning_rate": 1.8393782383419693e-05, "loss": 0.0846, "step": 1065 }, { "epoch": 0.46, "grad_norm": 0.5139295632612221, "learning_rate": 1.8480138169257342e-05, "loss": 0.0801, "step": 1070 }, { "epoch": 0.46, "grad_norm": 0.9194494828553363, "learning_rate": 1.856649395509499e-05, "loss": 0.0819, "step": 1075 }, { "epoch": 0.47, "grad_norm": 0.9742593365295164, "learning_rate": 1.8652849740932644e-05, "loss": 0.1078, "step": 1080 }, { "epoch": 0.47, "grad_norm": 0.7692295674318097, "learning_rate": 1.8739205526770293e-05, "loss": 0.1085, "step": 1085 }, { "epoch": 0.47, "grad_norm": 0.9845970911106678, "learning_rate": 1.8825561312607946e-05, "loss": 0.0884, "step": 1090 }, { "epoch": 0.47, "grad_norm": 2.082820314979471, "learning_rate": 1.89119170984456e-05, "loss": 0.12, "step": 1095 }, { "epoch": 0.48, "grad_norm": 0.8117837080054929, "learning_rate": 1.8998272884283248e-05, "loss": 0.0822, "step": 1100 }, { "epoch": 0.48, "grad_norm": 1.1460092611090402, "learning_rate": 1.90846286701209e-05, "loss": 0.1097, "step": 1105 }, { "epoch": 0.48, "grad_norm": 0.8899569297894988, "learning_rate": 1.917098445595855e-05, "loss": 0.0886, "step": 1110 }, { "epoch": 0.48, "grad_norm": 0.6410464393227897, "learning_rate": 1.9257340241796203e-05, "loss": 0.0789, "step": 1115 }, { "epoch": 0.48, "grad_norm": 0.4505620588270192, "learning_rate": 1.9343696027633855e-05, "loss": 0.0909, "step": 1120 }, { "epoch": 0.49, "grad_norm": 0.852506053614793, "learning_rate": 1.9430051813471504e-05, "loss": 0.0974, "step": 1125 }, { "epoch": 0.49, "grad_norm": 2.6689573293041553, "learning_rate": 1.9516407599309154e-05, "loss": 0.0843, "step": 1130 }, { "epoch": 0.49, "grad_norm": 0.5230054777855191, "learning_rate": 1.9602763385146806e-05, "loss": 0.0928, "step": 1135 }, { "epoch": 0.49, "grad_norm": 0.6142185715888877, "learning_rate": 1.9689119170984456e-05, "loss": 0.0945, "step": 1140 }, { "epoch": 0.49, "grad_norm": 0.5872875788362734, "learning_rate": 1.9775474956822108e-05, "loss": 0.0905, "step": 1145 }, { "epoch": 0.5, "grad_norm": 0.5007585461555959, "learning_rate": 1.986183074265976e-05, "loss": 0.0927, "step": 1150 }, { "epoch": 0.5, "grad_norm": 0.6808047766488587, "learning_rate": 1.994818652849741e-05, "loss": 0.0873, "step": 1155 }, { "epoch": 0.5, "grad_norm": 0.6689238734320209, "learning_rate": 1.9999998180950994e-05, "loss": 0.0806, "step": 1160 }, { "epoch": 0.5, "grad_norm": 0.7121185292021445, "learning_rate": 1.9999977716657265e-05, "loss": 0.1034, "step": 1165 }, { "epoch": 0.51, "grad_norm": 0.791635702401396, "learning_rate": 1.9999934514305234e-05, "loss": 0.1118, "step": 1170 }, { "epoch": 0.51, "grad_norm": 0.7777895778365356, "learning_rate": 1.9999868573993138e-05, "loss": 0.1009, "step": 1175 }, { "epoch": 0.51, "grad_norm": 0.7114168138902796, "learning_rate": 1.999977989587091e-05, "loss": 0.0968, "step": 1180 }, { "epoch": 0.51, "grad_norm": 1.1214052182705263, "learning_rate": 1.999966848014019e-05, "loss": 0.0998, "step": 1185 }, { "epoch": 0.51, "grad_norm": 0.5835670870079697, "learning_rate": 1.999953432705431e-05, "loss": 0.0913, "step": 1190 }, { "epoch": 0.52, "grad_norm": 1.0152760699918206, "learning_rate": 1.9999377436918314e-05, "loss": 0.0982, "step": 1195 }, { "epoch": 0.52, "grad_norm": 0.3391884623808986, "learning_rate": 1.9999197810088943e-05, "loss": 0.0781, "step": 1200 }, { "epoch": 0.52, "grad_norm": 0.5009493361172995, "learning_rate": 1.999899544697463e-05, "loss": 0.092, "step": 1205 }, { "epoch": 0.52, "grad_norm": 0.4674731980979764, "learning_rate": 1.999877034803551e-05, "loss": 0.0895, "step": 1210 }, { "epoch": 0.52, "grad_norm": 1.9768349974485036, "learning_rate": 1.9998522513783415e-05, "loss": 0.0954, "step": 1215 }, { "epoch": 0.53, "grad_norm": 0.6329573907921768, "learning_rate": 1.9998251944781878e-05, "loss": 0.0842, "step": 1220 }, { "epoch": 0.53, "grad_norm": 0.5513670604045542, "learning_rate": 1.9997958641646122e-05, "loss": 0.0898, "step": 1225 }, { "epoch": 0.53, "grad_norm": 1.2570984021594174, "learning_rate": 1.9997642605043056e-05, "loss": 0.0966, "step": 1230 }, { "epoch": 0.53, "grad_norm": 1.0391042314097374, "learning_rate": 1.9997303835691292e-05, "loss": 0.0949, "step": 1235 }, { "epoch": 0.54, "grad_norm": 0.6263951819312368, "learning_rate": 1.9996942334361126e-05, "loss": 0.072, "step": 1240 }, { "epoch": 0.54, "grad_norm": 0.5995161680677363, "learning_rate": 1.9996558101874543e-05, "loss": 0.0907, "step": 1245 }, { "epoch": 0.54, "grad_norm": 0.5405848608014154, "learning_rate": 1.999615113910522e-05, "loss": 0.0907, "step": 1250 }, { "epoch": 0.54, "grad_norm": 0.7354793370663812, "learning_rate": 1.999572144697851e-05, "loss": 0.0825, "step": 1255 }, { "epoch": 0.54, "grad_norm": 0.8097571767018434, "learning_rate": 1.9995269026471447e-05, "loss": 0.0873, "step": 1260 }, { "epoch": 0.55, "grad_norm": 0.7413997346551716, "learning_rate": 1.9994793878612756e-05, "loss": 0.0907, "step": 1265 }, { "epoch": 0.55, "grad_norm": 0.49108968812165604, "learning_rate": 1.9994296004482832e-05, "loss": 0.0775, "step": 1270 }, { "epoch": 0.55, "grad_norm": 1.1407732391312855, "learning_rate": 1.9993775405213748e-05, "loss": 0.0883, "step": 1275 }, { "epoch": 0.55, "grad_norm": 0.5322078982239706, "learning_rate": 1.9993232081989243e-05, "loss": 0.1029, "step": 1280 }, { "epoch": 0.56, "grad_norm": 0.3707481376397464, "learning_rate": 1.9992666036044738e-05, "loss": 0.0879, "step": 1285 }, { "epoch": 0.56, "grad_norm": 0.43162394310399216, "learning_rate": 1.999207726866731e-05, "loss": 0.0823, "step": 1290 }, { "epoch": 0.56, "grad_norm": 0.39430801592296844, "learning_rate": 1.9991465781195706e-05, "loss": 0.0896, "step": 1295 }, { "epoch": 0.56, "grad_norm": 0.5694159677621585, "learning_rate": 1.999083157502033e-05, "loss": 0.0816, "step": 1300 }, { "epoch": 0.56, "grad_norm": 0.5060620019833312, "learning_rate": 1.9990174651583252e-05, "loss": 0.0884, "step": 1305 }, { "epoch": 0.57, "grad_norm": 0.6293129592641865, "learning_rate": 1.998949501237819e-05, "loss": 0.0809, "step": 1310 }, { "epoch": 0.57, "grad_norm": 0.3046083825730037, "learning_rate": 1.998879265895051e-05, "loss": 0.0835, "step": 1315 }, { "epoch": 0.57, "grad_norm": 0.5537324247444009, "learning_rate": 1.998806759289724e-05, "loss": 0.0951, "step": 1320 }, { "epoch": 0.57, "grad_norm": 0.5478670125082655, "learning_rate": 1.9987319815867037e-05, "loss": 0.0845, "step": 1325 }, { "epoch": 0.57, "grad_norm": 0.7964012064137652, "learning_rate": 1.9986549329560204e-05, "loss": 0.1053, "step": 1330 }, { "epoch": 0.58, "grad_norm": 1.8201537024321701, "learning_rate": 1.9985756135728685e-05, "loss": 0.0904, "step": 1335 }, { "epoch": 0.58, "grad_norm": 0.5932463119726163, "learning_rate": 1.998494023617605e-05, "loss": 0.0733, "step": 1340 }, { "epoch": 0.58, "grad_norm": 0.5212259318017795, "learning_rate": 1.99841016327575e-05, "loss": 0.0829, "step": 1345 }, { "epoch": 0.58, "grad_norm": 0.7552207970663853, "learning_rate": 1.9983240327379862e-05, "loss": 0.1056, "step": 1350 }, { "epoch": 0.59, "grad_norm": 0.5644526446704441, "learning_rate": 1.9982356322001584e-05, "loss": 0.0815, "step": 1355 }, { "epoch": 0.59, "grad_norm": 2.312470171279909, "learning_rate": 1.998144961863272e-05, "loss": 0.0992, "step": 1360 }, { "epoch": 0.59, "grad_norm": 1.1405067091990706, "learning_rate": 1.9980520219334947e-05, "loss": 0.1161, "step": 1365 }, { "epoch": 0.59, "grad_norm": 0.33573877882920905, "learning_rate": 1.9979568126221543e-05, "loss": 0.0862, "step": 1370 }, { "epoch": 0.59, "grad_norm": 0.5219279117942052, "learning_rate": 1.997859334145739e-05, "loss": 0.1008, "step": 1375 }, { "epoch": 0.6, "grad_norm": 0.5106260104942524, "learning_rate": 1.997759586725896e-05, "loss": 0.0924, "step": 1380 }, { "epoch": 0.6, "grad_norm": 1.0248504115962325, "learning_rate": 1.997657570589432e-05, "loss": 0.1078, "step": 1385 }, { "epoch": 0.6, "grad_norm": 0.6094328587436685, "learning_rate": 1.9975532859683126e-05, "loss": 0.0772, "step": 1390 }, { "epoch": 0.6, "grad_norm": 1.1002689291796384, "learning_rate": 1.9974467330996616e-05, "loss": 0.1, "step": 1395 }, { "epoch": 0.6, "grad_norm": 0.5136753011629079, "learning_rate": 1.99733791222576e-05, "loss": 0.0947, "step": 1400 }, { "epoch": 0.61, "grad_norm": 0.7928839770448126, "learning_rate": 1.997226823594045e-05, "loss": 0.0899, "step": 1405 }, { "epoch": 0.61, "grad_norm": 0.3194381533981615, "learning_rate": 1.997113467457112e-05, "loss": 0.0908, "step": 1410 }, { "epoch": 0.61, "grad_norm": 0.4640425843189779, "learning_rate": 1.9969978440727116e-05, "loss": 0.0975, "step": 1415 }, { "epoch": 0.61, "grad_norm": 0.8225929364452668, "learning_rate": 1.9968799537037488e-05, "loss": 0.0989, "step": 1420 }, { "epoch": 0.62, "grad_norm": 0.5521492749721465, "learning_rate": 1.9967597966182846e-05, "loss": 0.0976, "step": 1425 }, { "epoch": 0.62, "grad_norm": 0.5472673648386764, "learning_rate": 1.9966373730895333e-05, "loss": 0.0908, "step": 1430 }, { "epoch": 0.62, "grad_norm": 0.9585374769415203, "learning_rate": 1.9965126833958625e-05, "loss": 0.0875, "step": 1435 }, { "epoch": 0.62, "grad_norm": 0.8371145278722188, "learning_rate": 1.9963857278207937e-05, "loss": 0.0934, "step": 1440 }, { "epoch": 0.62, "grad_norm": 0.7680571773730615, "learning_rate": 1.996256506652999e-05, "loss": 0.1097, "step": 1445 }, { "epoch": 0.63, "grad_norm": 1.0592484287359163, "learning_rate": 1.9961250201863037e-05, "loss": 0.0967, "step": 1450 }, { "epoch": 0.63, "grad_norm": 0.44679514300900086, "learning_rate": 1.995991268719683e-05, "loss": 0.0884, "step": 1455 }, { "epoch": 0.63, "grad_norm": 0.6909332703927928, "learning_rate": 1.9958552525572615e-05, "loss": 0.1118, "step": 1460 }, { "epoch": 0.63, "grad_norm": 0.5668123307660959, "learning_rate": 1.9957169720083157e-05, "loss": 0.089, "step": 1465 }, { "epoch": 0.63, "grad_norm": 1.0174208845150305, "learning_rate": 1.9955764273872682e-05, "loss": 0.0996, "step": 1470 }, { "epoch": 0.64, "grad_norm": 0.5429972018826015, "learning_rate": 1.995433619013692e-05, "loss": 0.0876, "step": 1475 }, { "epoch": 0.64, "grad_norm": 0.9880842640326745, "learning_rate": 1.9952885472123055e-05, "loss": 0.1013, "step": 1480 }, { "epoch": 0.64, "grad_norm": 0.7178755350757968, "learning_rate": 1.9951412123129744e-05, "loss": 0.0979, "step": 1485 }, { "epoch": 0.64, "grad_norm": 0.3992707003588622, "learning_rate": 1.9949916146507112e-05, "loss": 0.1148, "step": 1490 }, { "epoch": 0.65, "grad_norm": 0.631851323844705, "learning_rate": 1.9948397545656723e-05, "loss": 0.0842, "step": 1495 }, { "epoch": 0.65, "grad_norm": 0.6374008156020984, "learning_rate": 1.9946856324031583e-05, "loss": 0.0903, "step": 1500 }, { "epoch": 0.65, "grad_norm": 2.6567821637198543, "learning_rate": 1.9945292485136147e-05, "loss": 0.0823, "step": 1505 }, { "epoch": 0.65, "grad_norm": 0.7212264227394842, "learning_rate": 1.9943706032526285e-05, "loss": 0.0873, "step": 1510 }, { "epoch": 0.65, "grad_norm": 0.45023488726069755, "learning_rate": 1.994209696980929e-05, "loss": 0.0826, "step": 1515 }, { "epoch": 0.66, "grad_norm": 0.7119656947598436, "learning_rate": 1.9940465300643873e-05, "loss": 0.0891, "step": 1520 }, { "epoch": 0.66, "grad_norm": 0.5964456186339716, "learning_rate": 1.993881102874013e-05, "loss": 0.0984, "step": 1525 }, { "epoch": 0.66, "grad_norm": 0.6914123199335683, "learning_rate": 1.9937134157859564e-05, "loss": 0.0758, "step": 1530 }, { "epoch": 0.66, "grad_norm": 0.5560481104608749, "learning_rate": 1.993543469181507e-05, "loss": 0.0668, "step": 1535 }, { "epoch": 0.67, "grad_norm": 0.8057082881679304, "learning_rate": 1.9933712634470908e-05, "loss": 0.0887, "step": 1540 }, { "epoch": 0.67, "grad_norm": 0.7710775739750066, "learning_rate": 1.993196798974271e-05, "loss": 0.0996, "step": 1545 }, { "epoch": 0.67, "grad_norm": 0.502429831287691, "learning_rate": 1.993020076159747e-05, "loss": 0.0803, "step": 1550 }, { "epoch": 0.67, "grad_norm": 0.5868894943798167, "learning_rate": 1.992841095405353e-05, "loss": 0.0864, "step": 1555 }, { "epoch": 0.67, "grad_norm": 0.28510438917209935, "learning_rate": 1.9926598571180572e-05, "loss": 0.0682, "step": 1560 }, { "epoch": 0.68, "grad_norm": 0.7590086408954494, "learning_rate": 1.9924763617099618e-05, "loss": 0.0927, "step": 1565 }, { "epoch": 0.68, "grad_norm": 0.5037762336158287, "learning_rate": 1.9922906095982997e-05, "loss": 0.0869, "step": 1570 }, { "epoch": 0.68, "grad_norm": 0.5921698361775859, "learning_rate": 1.992102601205437e-05, "loss": 0.0819, "step": 1575 }, { "epoch": 0.68, "grad_norm": 0.5825079229431567, "learning_rate": 1.9919123369588688e-05, "loss": 0.0961, "step": 1580 }, { "epoch": 0.68, "grad_norm": 0.7608364067381084, "learning_rate": 1.9917198172912195e-05, "loss": 0.1001, "step": 1585 }, { "epoch": 0.69, "grad_norm": 1.1934597870924588, "learning_rate": 1.9915250426402436e-05, "loss": 0.0933, "step": 1590 }, { "epoch": 0.69, "grad_norm": 0.6841623546034824, "learning_rate": 1.991328013448821e-05, "loss": 0.0828, "step": 1595 }, { "epoch": 0.69, "grad_norm": 0.6882329284891476, "learning_rate": 1.991128730164959e-05, "loss": 0.0808, "step": 1600 }, { "epoch": 0.69, "grad_norm": 0.8720753704539425, "learning_rate": 1.99092719324179e-05, "loss": 0.0837, "step": 1605 }, { "epoch": 0.7, "grad_norm": 0.5402610151606684, "learning_rate": 1.990723403137571e-05, "loss": 0.0916, "step": 1610 }, { "epoch": 0.7, "grad_norm": 0.6838808424621337, "learning_rate": 1.9905173603156824e-05, "loss": 0.0923, "step": 1615 }, { "epoch": 0.7, "grad_norm": 0.30202755881155974, "learning_rate": 1.990309065244626e-05, "loss": 0.0989, "step": 1620 }, { "epoch": 0.7, "grad_norm": 0.6086110751842494, "learning_rate": 1.9900985183980256e-05, "loss": 0.0941, "step": 1625 }, { "epoch": 0.7, "grad_norm": 0.6688653967743655, "learning_rate": 1.9898857202546255e-05, "loss": 0.0959, "step": 1630 }, { "epoch": 0.71, "grad_norm": 0.7484858228329726, "learning_rate": 1.9896706712982877e-05, "loss": 0.0922, "step": 1635 }, { "epoch": 0.71, "grad_norm": 0.676025015630956, "learning_rate": 1.9894533720179932e-05, "loss": 0.0994, "step": 1640 }, { "epoch": 0.71, "grad_norm": 0.5491864239928795, "learning_rate": 1.9892338229078397e-05, "loss": 0.0806, "step": 1645 }, { "epoch": 0.71, "grad_norm": 0.725976378150946, "learning_rate": 1.98901202446704e-05, "loss": 0.1067, "step": 1650 }, { "epoch": 0.71, "grad_norm": 0.4913084517466355, "learning_rate": 1.9887879771999214e-05, "loss": 0.097, "step": 1655 }, { "epoch": 0.72, "grad_norm": 0.7471224320617841, "learning_rate": 1.988561681615926e-05, "loss": 0.0966, "step": 1660 }, { "epoch": 0.72, "grad_norm": 0.5014553278723048, "learning_rate": 1.9883331382296064e-05, "loss": 0.1027, "step": 1665 }, { "epoch": 0.72, "grad_norm": 0.6818025963494562, "learning_rate": 1.9881023475606277e-05, "loss": 0.096, "step": 1670 }, { "epoch": 0.72, "grad_norm": 0.960144563702732, "learning_rate": 1.9878693101337637e-05, "loss": 0.0967, "step": 1675 }, { "epoch": 0.73, "grad_norm": 0.5620856263756633, "learning_rate": 1.9876340264788975e-05, "loss": 0.0749, "step": 1680 }, { "epoch": 0.73, "grad_norm": 0.4955492724470792, "learning_rate": 1.9873964971310195e-05, "loss": 0.0903, "step": 1685 }, { "epoch": 0.73, "grad_norm": 1.2069504512746692, "learning_rate": 1.987156722630227e-05, "loss": 0.104, "step": 1690 }, { "epoch": 0.73, "grad_norm": 0.8683018619940261, "learning_rate": 1.986914703521722e-05, "loss": 0.1089, "step": 1695 }, { "epoch": 0.73, "grad_norm": 0.733977762749242, "learning_rate": 1.9866704403558092e-05, "loss": 0.0784, "step": 1700 }, { "epoch": 0.74, "grad_norm": 0.6334701598336082, "learning_rate": 1.9864239336878978e-05, "loss": 0.089, "step": 1705 }, { "epoch": 0.74, "grad_norm": 0.648143567197238, "learning_rate": 1.9861751840784967e-05, "loss": 0.0903, "step": 1710 }, { "epoch": 0.74, "grad_norm": 0.7652710041657063, "learning_rate": 1.9859241920932164e-05, "loss": 0.0942, "step": 1715 }, { "epoch": 0.74, "grad_norm": 0.7305568688315708, "learning_rate": 1.9856709583027642e-05, "loss": 0.1115, "step": 1720 }, { "epoch": 0.75, "grad_norm": 1.7278653299987823, "learning_rate": 1.985415483282946e-05, "loss": 0.0809, "step": 1725 }, { "epoch": 0.75, "grad_norm": 0.6529801862227353, "learning_rate": 1.9851577676146644e-05, "loss": 0.0755, "step": 1730 }, { "epoch": 0.75, "grad_norm": 0.4144692070134487, "learning_rate": 1.9848978118839155e-05, "loss": 0.0943, "step": 1735 }, { "epoch": 0.75, "grad_norm": 1.9734791631021336, "learning_rate": 1.9846356166817894e-05, "loss": 0.0898, "step": 1740 }, { "epoch": 0.75, "grad_norm": 0.5459253898472208, "learning_rate": 1.984371182604469e-05, "loss": 0.0804, "step": 1745 }, { "epoch": 0.76, "grad_norm": 0.5382475774373716, "learning_rate": 1.9841045102532263e-05, "loss": 0.0851, "step": 1750 }, { "epoch": 0.76, "grad_norm": 1.551104458429542, "learning_rate": 1.983835600234425e-05, "loss": 0.1059, "step": 1755 }, { "epoch": 0.76, "grad_norm": 0.7868898688507436, "learning_rate": 1.983564453159515e-05, "loss": 0.1054, "step": 1760 }, { "epoch": 0.76, "grad_norm": 1.020652098910767, "learning_rate": 1.983291069645034e-05, "loss": 0.0964, "step": 1765 }, { "epoch": 0.76, "grad_norm": 0.7107879271932049, "learning_rate": 1.9830154503126034e-05, "loss": 0.0903, "step": 1770 }, { "epoch": 0.77, "grad_norm": 0.6594309834743366, "learning_rate": 1.9827375957889303e-05, "loss": 0.0846, "step": 1775 }, { "epoch": 0.77, "grad_norm": 0.5263342809036547, "learning_rate": 1.9824575067058028e-05, "loss": 0.0695, "step": 1780 }, { "epoch": 0.77, "grad_norm": 1.0348679313164832, "learning_rate": 1.9821751837000912e-05, "loss": 0.0856, "step": 1785 }, { "epoch": 0.77, "grad_norm": 1.117064780483444, "learning_rate": 1.981890627413744e-05, "loss": 0.1036, "step": 1790 }, { "epoch": 0.78, "grad_norm": 0.4180917277551505, "learning_rate": 1.981603838493789e-05, "loss": 0.0916, "step": 1795 }, { "epoch": 0.78, "grad_norm": 2.292965557154143, "learning_rate": 1.9813148175923295e-05, "loss": 0.0974, "step": 1800 }, { "epoch": 0.78, "grad_norm": 1.0931436805632515, "learning_rate": 1.981023565366544e-05, "loss": 0.0804, "step": 1805 }, { "epoch": 0.78, "grad_norm": 1.1197940353971019, "learning_rate": 1.980730082478686e-05, "loss": 0.0931, "step": 1810 }, { "epoch": 0.78, "grad_norm": 0.8241163830027554, "learning_rate": 1.980434369596079e-05, "loss": 0.0837, "step": 1815 }, { "epoch": 0.79, "grad_norm": 1.5779394216611302, "learning_rate": 1.980136427391119e-05, "loss": 0.0959, "step": 1820 }, { "epoch": 0.79, "grad_norm": 1.5568696619522093, "learning_rate": 1.97983625654127e-05, "loss": 0.1035, "step": 1825 }, { "epoch": 0.79, "grad_norm": 5.119739750277216, "learning_rate": 1.979533857729064e-05, "loss": 0.1234, "step": 1830 }, { "epoch": 0.79, "grad_norm": 1.223109478908416, "learning_rate": 1.9792292316420977e-05, "loss": 0.0979, "step": 1835 }, { "epoch": 0.79, "grad_norm": 1.4446616152884881, "learning_rate": 1.978922378973034e-05, "loss": 0.106, "step": 1840 }, { "epoch": 0.8, "grad_norm": 1.214455605272822, "learning_rate": 1.9786133004195985e-05, "loss": 0.0913, "step": 1845 }, { "epoch": 0.8, "grad_norm": 1.5457682654468834, "learning_rate": 1.978301996684576e-05, "loss": 0.0979, "step": 1850 }, { "epoch": 0.8, "grad_norm": 2.3570341758823226, "learning_rate": 1.9779884684758134e-05, "loss": 0.0886, "step": 1855 }, { "epoch": 0.8, "grad_norm": 1.7709673245293045, "learning_rate": 1.9776727165062138e-05, "loss": 0.0846, "step": 1860 }, { "epoch": 0.81, "grad_norm": 1.7004550493706148, "learning_rate": 1.9773547414937377e-05, "loss": 0.1061, "step": 1865 }, { "epoch": 0.81, "grad_norm": 0.8904480634107864, "learning_rate": 1.9770345441614006e-05, "loss": 0.0898, "step": 1870 }, { "epoch": 0.81, "grad_norm": 1.0038580798349137, "learning_rate": 1.97671212523727e-05, "loss": 0.1083, "step": 1875 }, { "epoch": 0.81, "grad_norm": 0.7913132768445936, "learning_rate": 1.976387485454466e-05, "loss": 0.0821, "step": 1880 }, { "epoch": 0.81, "grad_norm": 0.8873095461861253, "learning_rate": 1.976060625551158e-05, "loss": 0.0971, "step": 1885 }, { "epoch": 0.82, "grad_norm": 0.7699631611759321, "learning_rate": 1.9757315462705638e-05, "loss": 0.1138, "step": 1890 }, { "epoch": 0.82, "grad_norm": 1.1036033723581375, "learning_rate": 1.975400248360947e-05, "loss": 0.1006, "step": 1895 }, { "epoch": 0.82, "grad_norm": 2.026065926826909, "learning_rate": 1.9750667325756166e-05, "loss": 0.114, "step": 1900 }, { "epoch": 0.82, "grad_norm": 0.7998712377145, "learning_rate": 1.9747309996729247e-05, "loss": 0.1032, "step": 1905 }, { "epoch": 0.83, "grad_norm": 1.05768089227519, "learning_rate": 1.9743930504162638e-05, "loss": 0.0856, "step": 1910 }, { "epoch": 0.83, "grad_norm": 1.2658434011585944, "learning_rate": 1.9740528855740675e-05, "loss": 0.0867, "step": 1915 }, { "epoch": 0.83, "grad_norm": 0.923144746542567, "learning_rate": 1.9737105059198053e-05, "loss": 0.1023, "step": 1920 }, { "epoch": 0.83, "grad_norm": 12.069080174223988, "learning_rate": 1.9733659122319847e-05, "loss": 0.0918, "step": 1925 }, { "epoch": 0.83, "grad_norm": 0.8369351356561107, "learning_rate": 1.9730191052941458e-05, "loss": 0.0779, "step": 1930 }, { "epoch": 0.84, "grad_norm": 0.8064785728299022, "learning_rate": 1.972670085894863e-05, "loss": 0.0819, "step": 1935 }, { "epoch": 0.84, "grad_norm": 0.700096547854496, "learning_rate": 1.972318854827739e-05, "loss": 0.0884, "step": 1940 }, { "epoch": 0.84, "grad_norm": 1.403023986764824, "learning_rate": 1.9719654128914082e-05, "loss": 0.0849, "step": 1945 }, { "epoch": 0.84, "grad_norm": 0.8097419704135963, "learning_rate": 1.97160976088953e-05, "loss": 0.0842, "step": 1950 }, { "epoch": 0.84, "grad_norm": 2.1665359212528466, "learning_rate": 1.9712518996307902e-05, "loss": 0.1007, "step": 1955 }, { "epoch": 0.85, "grad_norm": 1.4963106562167054, "learning_rate": 1.9708918299288968e-05, "loss": 0.0959, "step": 1960 }, { "epoch": 0.85, "grad_norm": 1.3607731524123874, "learning_rate": 1.9705295526025813e-05, "loss": 0.1083, "step": 1965 }, { "epoch": 0.85, "grad_norm": 0.6737531185490563, "learning_rate": 1.970165068475593e-05, "loss": 0.1055, "step": 1970 }, { "epoch": 0.85, "grad_norm": 0.5919103617849452, "learning_rate": 1.9697983783767003e-05, "loss": 0.0932, "step": 1975 }, { "epoch": 0.86, "grad_norm": 0.6377311219864845, "learning_rate": 1.969429483139687e-05, "loss": 0.0758, "step": 1980 }, { "epoch": 0.86, "grad_norm": 0.9478830645476063, "learning_rate": 1.9690583836033514e-05, "loss": 0.093, "step": 1985 }, { "epoch": 0.86, "grad_norm": 6.8762210911874675, "learning_rate": 1.968685080611503e-05, "loss": 0.0913, "step": 1990 }, { "epoch": 0.86, "grad_norm": 0.7519885869151743, "learning_rate": 1.968309575012963e-05, "loss": 0.0942, "step": 1995 }, { "epoch": 0.86, "grad_norm": 1.3317465601556326, "learning_rate": 1.9679318676615596e-05, "loss": 0.0933, "step": 2000 }, { "epoch": 0.87, "grad_norm": 0.6132759065494825, "learning_rate": 1.9675519594161283e-05, "loss": 0.082, "step": 2005 }, { "epoch": 0.87, "grad_norm": 0.7537614600148012, "learning_rate": 1.967169851140508e-05, "loss": 0.0951, "step": 2010 }, { "epoch": 0.87, "grad_norm": 0.7735155086030167, "learning_rate": 1.9667855437035413e-05, "loss": 0.0938, "step": 2015 }, { "epoch": 0.87, "grad_norm": 0.6692324440774802, "learning_rate": 1.9663990379790704e-05, "loss": 0.0735, "step": 2020 }, { "epoch": 0.87, "grad_norm": 0.9080167579059008, "learning_rate": 1.966010334845936e-05, "loss": 0.0811, "step": 2025 }, { "epoch": 0.88, "grad_norm": 0.4884667463437012, "learning_rate": 1.9656194351879757e-05, "loss": 0.0866, "step": 2030 }, { "epoch": 0.88, "grad_norm": 0.3515686523041155, "learning_rate": 1.9652263398940216e-05, "loss": 0.0849, "step": 2035 }, { "epoch": 0.88, "grad_norm": 0.5009655704966927, "learning_rate": 1.9648310498578972e-05, "loss": 0.083, "step": 2040 }, { "epoch": 0.88, "grad_norm": 0.7143483676761784, "learning_rate": 1.9644335659784185e-05, "loss": 0.0764, "step": 2045 }, { "epoch": 0.89, "grad_norm": 1.8204976491891331, "learning_rate": 1.9640338891593883e-05, "loss": 0.1264, "step": 2050 }, { "epoch": 0.89, "grad_norm": 0.6807437366900461, "learning_rate": 1.9636320203095954e-05, "loss": 0.1116, "step": 2055 }, { "epoch": 0.89, "grad_norm": 1.9341021097885769, "learning_rate": 1.9632279603428146e-05, "loss": 0.1026, "step": 2060 }, { "epoch": 0.89, "grad_norm": 1.179956292587945, "learning_rate": 1.9628217101778015e-05, "loss": 0.1183, "step": 2065 }, { "epoch": 0.89, "grad_norm": 0.8899768937827349, "learning_rate": 1.9624132707382918e-05, "loss": 0.0977, "step": 2070 }, { "epoch": 0.9, "grad_norm": 1.3166443426125516, "learning_rate": 1.9620026429529994e-05, "loss": 0.1012, "step": 2075 }, { "epoch": 0.9, "grad_norm": 0.5339079016472141, "learning_rate": 1.961589827755615e-05, "loss": 0.1082, "step": 2080 }, { "epoch": 0.9, "grad_norm": 0.4596990220083425, "learning_rate": 1.961174826084802e-05, "loss": 0.0912, "step": 2085 }, { "epoch": 0.9, "grad_norm": 0.3915418119448165, "learning_rate": 1.9607576388841953e-05, "loss": 0.0783, "step": 2090 }, { "epoch": 0.9, "grad_norm": 0.6327168049132509, "learning_rate": 1.9603382671024e-05, "loss": 0.0822, "step": 2095 }, { "epoch": 0.91, "grad_norm": 0.6150588801181179, "learning_rate": 1.9599167116929886e-05, "loss": 0.0871, "step": 2100 }, { "epoch": 0.91, "grad_norm": 0.6436422597157241, "learning_rate": 1.9594929736144978e-05, "loss": 0.093, "step": 2105 }, { "epoch": 0.91, "grad_norm": 0.8254699958514967, "learning_rate": 1.9590670538304278e-05, "loss": 0.096, "step": 2110 }, { "epoch": 0.91, "grad_norm": 0.5197543328388274, "learning_rate": 1.9586389533092402e-05, "loss": 0.0918, "step": 2115 }, { "epoch": 0.92, "grad_norm": 0.9557781768242266, "learning_rate": 1.9582086730243545e-05, "loss": 0.099, "step": 2120 }, { "epoch": 0.92, "grad_norm": 0.5427166899498477, "learning_rate": 1.9577762139541465e-05, "loss": 0.0811, "step": 2125 }, { "epoch": 0.92, "grad_norm": 1.8756961308130387, "learning_rate": 1.9573415770819463e-05, "loss": 0.0945, "step": 2130 }, { "epoch": 0.92, "grad_norm": 0.6779111478511379, "learning_rate": 1.956904763396036e-05, "loss": 0.0986, "step": 2135 }, { "epoch": 0.92, "grad_norm": 0.5791588172751216, "learning_rate": 1.9564657738896478e-05, "loss": 0.0884, "step": 2140 }, { "epoch": 0.93, "grad_norm": 1.1065008243272856, "learning_rate": 1.95602460956096e-05, "loss": 0.1007, "step": 2145 }, { "epoch": 0.93, "grad_norm": 0.7131509483548544, "learning_rate": 1.9555812714130974e-05, "loss": 0.0866, "step": 2150 }, { "epoch": 0.93, "grad_norm": 0.4799981904596929, "learning_rate": 1.955135760454127e-05, "loss": 0.0657, "step": 2155 }, { "epoch": 0.93, "grad_norm": 0.5250069540649991, "learning_rate": 1.9546880776970566e-05, "loss": 0.083, "step": 2160 }, { "epoch": 0.94, "grad_norm": 0.7836812084717903, "learning_rate": 1.954238224159832e-05, "loss": 0.0882, "step": 2165 }, { "epoch": 0.94, "grad_norm": 0.560112589183633, "learning_rate": 1.9537862008653347e-05, "loss": 0.0913, "step": 2170 }, { "epoch": 0.94, "grad_norm": 0.6335189752801311, "learning_rate": 1.9533320088413807e-05, "loss": 0.0812, "step": 2175 }, { "epoch": 0.94, "grad_norm": 0.8026969319535522, "learning_rate": 1.9528756491207167e-05, "loss": 0.1014, "step": 2180 }, { "epoch": 0.94, "grad_norm": 0.5958206176680662, "learning_rate": 1.9524171227410185e-05, "loss": 0.0819, "step": 2185 }, { "epoch": 0.95, "grad_norm": 3.8784625039711, "learning_rate": 1.951956430744888e-05, "loss": 0.0782, "step": 2190 }, { "epoch": 0.95, "grad_norm": 0.7051759938172582, "learning_rate": 1.951493574179852e-05, "loss": 0.1002, "step": 2195 }, { "epoch": 0.95, "grad_norm": 0.9979595896719976, "learning_rate": 1.9510285540983583e-05, "loss": 0.0888, "step": 2200 }, { "epoch": 0.95, "grad_norm": 1.8283276654029694, "learning_rate": 1.9505613715577754e-05, "loss": 0.1212, "step": 2205 }, { "epoch": 0.95, "grad_norm": 1.0278569665206916, "learning_rate": 1.9500920276203878e-05, "loss": 0.1009, "step": 2210 }, { "epoch": 0.96, "grad_norm": 0.7532073985765524, "learning_rate": 1.9496205233533945e-05, "loss": 0.1139, "step": 2215 }, { "epoch": 0.96, "grad_norm": 0.6704633875384296, "learning_rate": 1.949146859828907e-05, "loss": 0.0977, "step": 2220 }, { "epoch": 0.96, "grad_norm": 0.6270491613257371, "learning_rate": 1.9486710381239472e-05, "loss": 0.0874, "step": 2225 }, { "epoch": 0.96, "grad_norm": 0.5665665710542613, "learning_rate": 1.9481930593204437e-05, "loss": 0.1056, "step": 2230 }, { "epoch": 0.97, "grad_norm": 0.8350923101831028, "learning_rate": 1.947712924505229e-05, "loss": 0.0782, "step": 2235 }, { "epoch": 0.97, "grad_norm": 0.8029288115147665, "learning_rate": 1.9472306347700397e-05, "loss": 0.08, "step": 2240 }, { "epoch": 0.97, "grad_norm": 0.7177515615744224, "learning_rate": 1.946746191211511e-05, "loss": 0.0937, "step": 2245 }, { "epoch": 0.97, "grad_norm": 0.9012213341405209, "learning_rate": 1.946259594931176e-05, "loss": 0.0954, "step": 2250 }, { "epoch": 0.97, "grad_norm": 0.619578875231581, "learning_rate": 1.945770847035463e-05, "loss": 0.0877, "step": 2255 }, { "epoch": 0.98, "grad_norm": 0.7866371900368929, "learning_rate": 1.9452799486356918e-05, "loss": 0.094, "step": 2260 }, { "epoch": 0.98, "grad_norm": 0.9258671786779609, "learning_rate": 1.944786900848073e-05, "loss": 0.1004, "step": 2265 }, { "epoch": 0.98, "grad_norm": 0.48317941437781525, "learning_rate": 1.9442917047937036e-05, "loss": 0.0789, "step": 2270 }, { "epoch": 0.98, "grad_norm": 0.704690320595154, "learning_rate": 1.9437943615985663e-05, "loss": 0.1044, "step": 2275 }, { "epoch": 0.98, "grad_norm": 0.7643215375132546, "learning_rate": 1.943294872393525e-05, "loss": 0.0903, "step": 2280 }, { "epoch": 0.99, "grad_norm": 0.6086609450905198, "learning_rate": 1.9427932383143236e-05, "loss": 0.0775, "step": 2285 }, { "epoch": 0.99, "grad_norm": 0.5769438697299223, "learning_rate": 1.9422894605015838e-05, "loss": 0.093, "step": 2290 }, { "epoch": 0.99, "grad_norm": 0.5140997819366792, "learning_rate": 1.9417835401008004e-05, "loss": 0.0825, "step": 2295 }, { "epoch": 0.99, "grad_norm": 0.5132532878495304, "learning_rate": 1.9412754782623412e-05, "loss": 0.0851, "step": 2300 }, { "epoch": 1.0, "grad_norm": 0.8932621214196316, "learning_rate": 1.9407652761414422e-05, "loss": 0.0905, "step": 2305 }, { "epoch": 1.0, "grad_norm": 0.6069639942137219, "learning_rate": 1.9402529348982072e-05, "loss": 0.0877, "step": 2310 }, { "epoch": 1.0, "grad_norm": 0.6366735477424228, "learning_rate": 1.9397384556976026e-05, "loss": 0.0936, "step": 2315 }, { "epoch": 1.0, "eval_loss": 0.9145181179046631, "eval_runtime": 352.5269, "eval_samples_per_second": 21.295, "eval_steps_per_second": 0.335, "step": 2315 }, { "epoch": 1.0, "grad_norm": 0.5616275078056484, "learning_rate": 1.9392218397094575e-05, "loss": 0.0657, "step": 2320 }, { "epoch": 1.0, "grad_norm": 0.6596828032006968, "learning_rate": 1.9387030881084588e-05, "loss": 0.0697, "step": 2325 }, { "epoch": 1.01, "grad_norm": 0.43755967023151665, "learning_rate": 1.938182202074149e-05, "loss": 0.0594, "step": 2330 }, { "epoch": 1.01, "grad_norm": 0.6234667120691315, "learning_rate": 1.9376591827909253e-05, "loss": 0.0645, "step": 2335 }, { "epoch": 1.01, "grad_norm": 1.3794470677309096, "learning_rate": 1.937134031448034e-05, "loss": 0.0696, "step": 2340 }, { "epoch": 1.01, "grad_norm": 0.5849723108496292, "learning_rate": 1.93660674923957e-05, "loss": 0.059, "step": 2345 }, { "epoch": 1.02, "grad_norm": 0.48370007210601973, "learning_rate": 1.9360773373644734e-05, "loss": 0.0711, "step": 2350 }, { "epoch": 1.02, "grad_norm": 1.1325120023666992, "learning_rate": 1.935545797026527e-05, "loss": 0.0547, "step": 2355 }, { "epoch": 1.02, "grad_norm": 0.28494184406753825, "learning_rate": 1.9350121294343526e-05, "loss": 0.059, "step": 2360 }, { "epoch": 1.02, "grad_norm": 0.6700705334846775, "learning_rate": 1.9344763358014095e-05, "loss": 0.0755, "step": 2365 }, { "epoch": 1.02, "grad_norm": 0.485478426214818, "learning_rate": 1.9339384173459913e-05, "loss": 0.0697, "step": 2370 }, { "epoch": 1.03, "grad_norm": 0.8524933638521246, "learning_rate": 1.9333983752912227e-05, "loss": 0.0583, "step": 2375 }, { "epoch": 1.03, "grad_norm": 0.4269504257394539, "learning_rate": 1.9328562108650572e-05, "loss": 0.0582, "step": 2380 }, { "epoch": 1.03, "grad_norm": 0.6480515153844685, "learning_rate": 1.932311925300274e-05, "loss": 0.0688, "step": 2385 }, { "epoch": 1.03, "grad_norm": 0.34585135913386694, "learning_rate": 1.931765519834476e-05, "loss": 0.0646, "step": 2390 }, { "epoch": 1.03, "grad_norm": 0.6680263661426055, "learning_rate": 1.9312169957100857e-05, "loss": 0.065, "step": 2395 }, { "epoch": 1.04, "grad_norm": 0.9926336730464236, "learning_rate": 1.930666354174343e-05, "loss": 0.0697, "step": 2400 }, { "epoch": 1.04, "grad_norm": 1.0773333832561918, "learning_rate": 1.9301135964793024e-05, "loss": 0.0701, "step": 2405 }, { "epoch": 1.04, "grad_norm": 0.8220974394525326, "learning_rate": 1.9295587238818313e-05, "loss": 0.0814, "step": 2410 }, { "epoch": 1.04, "grad_norm": 0.2662772500245477, "learning_rate": 1.9290017376436044e-05, "loss": 0.0592, "step": 2415 }, { "epoch": 1.05, "grad_norm": 6.222299959444174, "learning_rate": 1.9284426390311033e-05, "loss": 0.0702, "step": 2420 }, { "epoch": 1.05, "grad_norm": 0.6830827737400599, "learning_rate": 1.9278814293156122e-05, "loss": 0.0626, "step": 2425 }, { "epoch": 1.05, "grad_norm": 0.6648648902311897, "learning_rate": 1.927318109773216e-05, "loss": 0.0596, "step": 2430 }, { "epoch": 1.05, "grad_norm": 15.184086990125708, "learning_rate": 1.9267526816847972e-05, "loss": 0.0944, "step": 2435 }, { "epoch": 1.05, "grad_norm": 13.086549805442004, "learning_rate": 1.9261851463360323e-05, "loss": 1.138, "step": 2440 }, { "epoch": 1.06, "grad_norm": 12.903910620618289, "learning_rate": 1.9256155050173885e-05, "loss": 0.9527, "step": 2445 }, { "epoch": 1.06, "grad_norm": 5.01343033739344, "learning_rate": 1.9250437590241234e-05, "loss": 0.9946, "step": 2450 }, { "epoch": 1.06, "grad_norm": 4.334664635242735, "learning_rate": 1.9244699096562785e-05, "loss": 0.9971, "step": 2455 }, { "epoch": 1.06, "grad_norm": 3.744318496627235, "learning_rate": 1.9238939582186798e-05, "loss": 0.9953, "step": 2460 }, { "epoch": 1.06, "grad_norm": 3.0516984634704736, "learning_rate": 1.923315906020931e-05, "loss": 0.9043, "step": 2465 }, { "epoch": 1.07, "grad_norm": 4.056313270501341, "learning_rate": 1.9227357543774136e-05, "loss": 1.3543, "step": 2470 }, { "epoch": 1.07, "grad_norm": 1.9651974681311575, "learning_rate": 1.922153504607283e-05, "loss": 1.0212, "step": 2475 }, { "epoch": 1.07, "grad_norm": 4.101849169363717, "learning_rate": 1.9215691580344653e-05, "loss": 0.9517, "step": 2480 }, { "epoch": 1.07, "grad_norm": 3.1937357498817387, "learning_rate": 1.9209827159876538e-05, "loss": 1.0283, "step": 2485 }, { "epoch": 1.08, "grad_norm": 1.9804787062964042, "learning_rate": 1.920394179800306e-05, "loss": 1.17, "step": 2490 }, { "epoch": 1.08, "grad_norm": 3.9385750059156934, "learning_rate": 1.919803550810644e-05, "loss": 1.1176, "step": 2495 }, { "epoch": 1.08, "grad_norm": 3.3729979129811953, "learning_rate": 1.9192108303616443e-05, "loss": 1.1851, "step": 2500 }, { "epoch": 1.08, "grad_norm": 9.464224848094316, "learning_rate": 1.9186160198010424e-05, "loss": 1.2127, "step": 2505 }, { "epoch": 1.08, "grad_norm": 6.691608094833035, "learning_rate": 1.9180191204813243e-05, "loss": 0.9996, "step": 2510 }, { "epoch": 1.09, "grad_norm": 2.989401318952922, "learning_rate": 1.9174201337597266e-05, "loss": 1.1232, "step": 2515 }, { "epoch": 1.09, "grad_norm": 2.1641301879625052, "learning_rate": 1.9168190609982317e-05, "loss": 0.9688, "step": 2520 }, { "epoch": 1.09, "grad_norm": 2.7269265589356824, "learning_rate": 1.9162159035635654e-05, "loss": 1.1601, "step": 2525 }, { "epoch": 1.09, "grad_norm": 1.7533229665086245, "learning_rate": 1.9156106628271938e-05, "loss": 1.0813, "step": 2530 }, { "epoch": 1.1, "grad_norm": 1.1593285241535218, "learning_rate": 1.9150033401653193e-05, "loss": 1.1281, "step": 2535 }, { "epoch": 1.1, "grad_norm": 1.7658481049626809, "learning_rate": 1.9143939369588794e-05, "loss": 1.111, "step": 2540 }, { "epoch": 1.1, "grad_norm": 3.3407448124691763, "learning_rate": 1.913782454593542e-05, "loss": 1.3092, "step": 2545 }, { "epoch": 1.1, "grad_norm": 2.6070550190431905, "learning_rate": 1.9131688944597017e-05, "loss": 1.0744, "step": 2550 }, { "epoch": 1.1, "grad_norm": 2.7601896393598007, "learning_rate": 1.9125532579524783e-05, "loss": 1.2218, "step": 2555 }, { "epoch": 1.11, "grad_norm": 7.3856240027677815, "learning_rate": 1.911935546471713e-05, "loss": 1.1591, "step": 2560 }, { "epoch": 1.11, "grad_norm": 4.24382439098411, "learning_rate": 1.911315761421965e-05, "loss": 1.2513, "step": 2565 }, { "epoch": 1.11, "grad_norm": 1.8814096184126177, "learning_rate": 1.9106939042125085e-05, "loss": 1.037, "step": 2570 }, { "epoch": 1.11, "grad_norm": 1.8500629691055899, "learning_rate": 1.9100699762573282e-05, "loss": 1.0205, "step": 2575 }, { "epoch": 1.11, "grad_norm": 2.2506993444324785, "learning_rate": 1.909443978975119e-05, "loss": 1.1285, "step": 2580 }, { "epoch": 1.12, "grad_norm": 2.160603867389095, "learning_rate": 1.9088159137892806e-05, "loss": 1.2168, "step": 2585 }, { "epoch": 1.12, "grad_norm": 2.4521010037166753, "learning_rate": 1.908185782127914e-05, "loss": 0.9641, "step": 2590 }, { "epoch": 1.12, "grad_norm": 3.3405574242520215, "learning_rate": 1.9075535854238195e-05, "loss": 1.1123, "step": 2595 }, { "epoch": 1.12, "grad_norm": 1.719202294463501, "learning_rate": 1.906919325114493e-05, "loss": 1.0601, "step": 2600 }, { "epoch": 1.13, "grad_norm": 1.0275191129064691, "learning_rate": 1.9062830026421226e-05, "loss": 1.067, "step": 2605 }, { "epoch": 1.13, "grad_norm": 1.9723618781686103, "learning_rate": 1.905644619453585e-05, "loss": 1.0872, "step": 2610 }, { "epoch": 1.13, "grad_norm": 0.8049997370300347, "learning_rate": 1.905004177000443e-05, "loss": 1.132, "step": 2615 }, { "epoch": 1.13, "grad_norm": 2.8051709497363646, "learning_rate": 1.9043616767389416e-05, "loss": 1.1224, "step": 2620 }, { "epoch": 1.13, "grad_norm": 2.039428292133686, "learning_rate": 1.9037171201300045e-05, "loss": 0.7828, "step": 2625 }, { "epoch": 1.14, "grad_norm": 1.6162927800964706, "learning_rate": 1.9030705086392323e-05, "loss": 1.1454, "step": 2630 }, { "epoch": 1.14, "grad_norm": 1.512216353436176, "learning_rate": 1.9024218437368968e-05, "loss": 1.0732, "step": 2635 }, { "epoch": 1.14, "grad_norm": 1.9741080273381615, "learning_rate": 1.9017711268979394e-05, "loss": 1.1747, "step": 2640 }, { "epoch": 1.14, "grad_norm": 1.745899658973244, "learning_rate": 1.9011183596019672e-05, "loss": 1.0929, "step": 2645 }, { "epoch": 1.14, "grad_norm": 3.1272153267838902, "learning_rate": 1.9004635433332498e-05, "loss": 1.0864, "step": 2650 }, { "epoch": 1.15, "grad_norm": 2.923091983001033, "learning_rate": 1.899806679580715e-05, "loss": 1.0525, "step": 2655 }, { "epoch": 1.15, "grad_norm": 4.984968727888495, "learning_rate": 1.899147769837947e-05, "loss": 1.0726, "step": 2660 }, { "epoch": 1.15, "grad_norm": 1.8217132402018963, "learning_rate": 1.898486815603182e-05, "loss": 1.1832, "step": 2665 }, { "epoch": 1.15, "grad_norm": 1.769890651202349, "learning_rate": 1.8978238183793052e-05, "loss": 1.0143, "step": 2670 }, { "epoch": 1.16, "grad_norm": 2.878970152426382, "learning_rate": 1.8971587796738463e-05, "loss": 1.3317, "step": 2675 }, { "epoch": 1.16, "grad_norm": 3.107994186560129, "learning_rate": 1.8964917009989783e-05, "loss": 1.1506, "step": 2680 }, { "epoch": 1.16, "grad_norm": 2.115211771464286, "learning_rate": 1.8958225838715113e-05, "loss": 1.0785, "step": 2685 }, { "epoch": 1.16, "grad_norm": 1.85498934986892, "learning_rate": 1.8951514298128914e-05, "loss": 0.9929, "step": 2690 }, { "epoch": 1.16, "grad_norm": 1.0586931662100967, "learning_rate": 1.8944782403491964e-05, "loss": 0.8947, "step": 2695 }, { "epoch": 1.17, "grad_norm": 2.9650657595211563, "learning_rate": 1.893803017011131e-05, "loss": 0.9875, "step": 2700 }, { "epoch": 1.17, "grad_norm": 2.278562489218499, "learning_rate": 1.8931257613340262e-05, "loss": 1.0959, "step": 2705 }, { "epoch": 1.17, "grad_norm": 2.9520728930198263, "learning_rate": 1.892446474857833e-05, "loss": 1.1596, "step": 2710 }, { "epoch": 1.17, "grad_norm": 6.671374376765014, "learning_rate": 1.89176515912712e-05, "loss": 1.179, "step": 2715 }, { "epoch": 1.17, "grad_norm": 2.510105971294028, "learning_rate": 1.8910818156910707e-05, "loss": 1.0341, "step": 2720 }, { "epoch": 1.18, "grad_norm": 3.358943297796002, "learning_rate": 1.890396446103479e-05, "loss": 1.209, "step": 2725 }, { "epoch": 1.18, "grad_norm": 0.6924599517955392, "learning_rate": 1.8897090519227458e-05, "loss": 1.0079, "step": 2730 }, { "epoch": 1.18, "grad_norm": 1.2144311754717434, "learning_rate": 1.8890196347118747e-05, "loss": 0.95, "step": 2735 }, { "epoch": 1.18, "grad_norm": 1.628799029535916, "learning_rate": 1.888328196038471e-05, "loss": 0.999, "step": 2740 }, { "epoch": 1.19, "grad_norm": 1.6323476110763768, "learning_rate": 1.887634737474735e-05, "loss": 1.1144, "step": 2745 }, { "epoch": 1.19, "grad_norm": 0.8318865630869238, "learning_rate": 1.8869392605974605e-05, "loss": 0.8806, "step": 2750 }, { "epoch": 1.19, "grad_norm": 2.292543263234408, "learning_rate": 1.8862417669880307e-05, "loss": 1.0152, "step": 2755 }, { "epoch": 1.19, "grad_norm": 1.1718305039079837, "learning_rate": 1.8855422582324136e-05, "loss": 0.9284, "step": 2760 }, { "epoch": 1.19, "grad_norm": 1.8315532803193473, "learning_rate": 1.88484073592116e-05, "loss": 0.9595, "step": 2765 }, { "epoch": 1.2, "grad_norm": 1.0603422444623511, "learning_rate": 1.8841372016493987e-05, "loss": 1.099, "step": 2770 }, { "epoch": 1.2, "grad_norm": 1.1059393746288249, "learning_rate": 1.8834316570168344e-05, "loss": 0.9097, "step": 2775 }, { "epoch": 1.2, "grad_norm": 9.01735421988145, "learning_rate": 1.8827241036277415e-05, "loss": 1.2295, "step": 2780 }, { "epoch": 1.2, "grad_norm": 3.797225020197302, "learning_rate": 1.8820145430909627e-05, "loss": 0.9622, "step": 2785 }, { "epoch": 1.21, "grad_norm": 3.0587273399955777, "learning_rate": 1.881302977019905e-05, "loss": 1.101, "step": 2790 }, { "epoch": 1.21, "grad_norm": 2.6716610674679675, "learning_rate": 1.8805894070325343e-05, "loss": 1.0955, "step": 2795 }, { "epoch": 1.21, "grad_norm": 4.6190166161920905, "learning_rate": 1.8798738347513743e-05, "loss": 1.1255, "step": 2800 }, { "epoch": 1.21, "grad_norm": 1.2516203303283377, "learning_rate": 1.879156261803501e-05, "loss": 1.2917, "step": 2805 }, { "epoch": 1.21, "grad_norm": 1.8669010681781146, "learning_rate": 1.8784366898205395e-05, "loss": 1.0387, "step": 2810 }, { "epoch": 1.22, "grad_norm": 3.3989695616319953, "learning_rate": 1.8777151204386604e-05, "loss": 1.1687, "step": 2815 }, { "epoch": 1.22, "grad_norm": 3.9839501096580054, "learning_rate": 1.876991555298576e-05, "loss": 1.3672, "step": 2820 }, { "epoch": 1.22, "grad_norm": 2.3359598799774464, "learning_rate": 1.876265996045537e-05, "loss": 1.1501, "step": 2825 }, { "epoch": 1.22, "grad_norm": 1.7595264569997804, "learning_rate": 1.8755384443293273e-05, "loss": 0.9142, "step": 2830 }, { "epoch": 1.22, "grad_norm": 1.4577221871495234, "learning_rate": 1.8748089018042623e-05, "loss": 0.9255, "step": 2835 }, { "epoch": 1.23, "grad_norm": 1.5275145635538994, "learning_rate": 1.8740773701291836e-05, "loss": 0.8818, "step": 2840 }, { "epoch": 1.23, "grad_norm": 1.1264301878987903, "learning_rate": 1.873343850967456e-05, "loss": 0.984, "step": 2845 }, { "epoch": 1.23, "grad_norm": 2.863318146720118, "learning_rate": 1.8726083459869634e-05, "loss": 0.9505, "step": 2850 }, { "epoch": 1.23, "grad_norm": 2.8435565960567706, "learning_rate": 1.8718708568601047e-05, "loss": 1.0112, "step": 2855 }, { "epoch": 1.24, "grad_norm": 2.6724494819760025, "learning_rate": 1.8711313852637914e-05, "loss": 1.194, "step": 2860 }, { "epoch": 1.24, "grad_norm": 1.755170687903758, "learning_rate": 1.8703899328794412e-05, "loss": 1.2048, "step": 2865 }, { "epoch": 1.24, "grad_norm": 2.0242885846033074, "learning_rate": 1.8696465013929776e-05, "loss": 1.0102, "step": 2870 }, { "epoch": 1.24, "grad_norm": 2.095574086345825, "learning_rate": 1.8689010924948222e-05, "loss": 0.9868, "step": 2875 }, { "epoch": 1.24, "grad_norm": 2.166749589787262, "learning_rate": 1.8681537078798945e-05, "loss": 1.0925, "step": 2880 }, { "epoch": 1.25, "grad_norm": 3.339195291028359, "learning_rate": 1.8674043492476057e-05, "loss": 0.8562, "step": 2885 }, { "epoch": 1.25, "grad_norm": 4.246668581883667, "learning_rate": 1.8666530183018555e-05, "loss": 1.0551, "step": 2890 }, { "epoch": 1.25, "grad_norm": 1.8753108057969599, "learning_rate": 1.8658997167510287e-05, "loss": 0.811, "step": 2895 }, { "epoch": 1.25, "grad_norm": 2.6262621213201798, "learning_rate": 1.8651444463079898e-05, "loss": 0.9255, "step": 2900 }, { "epoch": 1.25, "grad_norm": 5.271501553303465, "learning_rate": 1.864387208690082e-05, "loss": 1.1943, "step": 2905 }, { "epoch": 1.26, "grad_norm": 8.306222298118406, "learning_rate": 1.8636280056191195e-05, "loss": 1.1878, "step": 2910 }, { "epoch": 1.26, "grad_norm": 1.1170321181806435, "learning_rate": 1.862866838821387e-05, "loss": 1.0038, "step": 2915 }, { "epoch": 1.26, "grad_norm": 2.2026475395614034, "learning_rate": 1.8621037100276342e-05, "loss": 1.1979, "step": 2920 }, { "epoch": 1.26, "grad_norm": 5.019325813476353, "learning_rate": 1.861338620973071e-05, "loss": 1.1805, "step": 2925 }, { "epoch": 1.27, "grad_norm": 4.975982418870631, "learning_rate": 1.8605715733973654e-05, "loss": 1.1462, "step": 2930 }, { "epoch": 1.27, "grad_norm": 2.1055702133905627, "learning_rate": 1.859802569044639e-05, "loss": 1.1622, "step": 2935 }, { "epoch": 1.27, "grad_norm": 2.2270636877747805, "learning_rate": 1.8590316096634615e-05, "loss": 1.0749, "step": 2940 }, { "epoch": 1.27, "grad_norm": 2.0870628552313883, "learning_rate": 1.8582586970068493e-05, "loss": 0.882, "step": 2945 }, { "epoch": 1.27, "grad_norm": 1.436943410024318, "learning_rate": 1.85748383283226e-05, "loss": 0.9564, "step": 2950 }, { "epoch": 1.28, "grad_norm": 2.8749289552957404, "learning_rate": 1.856707018901587e-05, "loss": 1.0634, "step": 2955 }, { "epoch": 1.28, "grad_norm": 0.9614834633177424, "learning_rate": 1.855928256981159e-05, "loss": 0.87, "step": 2960 }, { "epoch": 1.28, "grad_norm": 1.2686571202532422, "learning_rate": 1.8551475488417336e-05, "loss": 1.0995, "step": 2965 }, { "epoch": 1.28, "grad_norm": 1.5286350406981015, "learning_rate": 1.8543648962584932e-05, "loss": 1.0547, "step": 2970 }, { "epoch": 1.29, "grad_norm": 3.0835167841300257, "learning_rate": 1.8535803010110417e-05, "loss": 1.2377, "step": 2975 }, { "epoch": 1.29, "grad_norm": 2.391017656781517, "learning_rate": 1.8527937648834e-05, "loss": 1.2353, "step": 2980 }, { "epoch": 1.29, "grad_norm": 3.6789551586548352, "learning_rate": 1.8520052896640036e-05, "loss": 0.9409, "step": 2985 }, { "epoch": 1.29, "grad_norm": 1.180581592436138, "learning_rate": 1.8512148771456947e-05, "loss": 1.0151, "step": 2990 }, { "epoch": 1.29, "grad_norm": 2.9530924499547395, "learning_rate": 1.850422529125723e-05, "loss": 1.1419, "step": 2995 }, { "epoch": 1.3, "grad_norm": 2.4122900275395645, "learning_rate": 1.8496282474057373e-05, "loss": 1.0736, "step": 3000 }, { "epoch": 1.3, "grad_norm": 1.1589553877211933, "learning_rate": 1.8488320337917843e-05, "loss": 1.1795, "step": 3005 }, { "epoch": 1.3, "grad_norm": 1.6530700975476218, "learning_rate": 1.8480338900943034e-05, "loss": 1.304, "step": 3010 }, { "epoch": 1.3, "grad_norm": 2.517021072232722, "learning_rate": 1.8472338181281222e-05, "loss": 0.9507, "step": 3015 }, { "epoch": 1.3, "grad_norm": 2.220531755504523, "learning_rate": 1.846431819712453e-05, "loss": 0.9046, "step": 3020 }, { "epoch": 1.31, "grad_norm": 2.764629699311416, "learning_rate": 1.845627896670889e-05, "loss": 1.009, "step": 3025 }, { "epoch": 1.31, "grad_norm": 2.0766834198659314, "learning_rate": 1.8448220508313984e-05, "loss": 1.1998, "step": 3030 }, { "epoch": 1.31, "grad_norm": 2.9669489382279193, "learning_rate": 1.844014284026323e-05, "loss": 1.307, "step": 3035 }, { "epoch": 1.31, "grad_norm": 2.548666722268394, "learning_rate": 1.843204598092371e-05, "loss": 0.9256, "step": 3040 }, { "epoch": 1.32, "grad_norm": 2.0021583660995392, "learning_rate": 1.8423929948706162e-05, "loss": 1.2201, "step": 3045 }, { "epoch": 1.32, "grad_norm": 4.905029779245296, "learning_rate": 1.8415794762064898e-05, "loss": 0.9246, "step": 3050 }, { "epoch": 1.32, "grad_norm": 3.9342941603758885, "learning_rate": 1.8407640439497796e-05, "loss": 1.1058, "step": 3055 }, { "epoch": 1.32, "grad_norm": 3.051956807415952, "learning_rate": 1.839946699954625e-05, "loss": 0.9203, "step": 3060 }, { "epoch": 1.32, "grad_norm": 1.2845686784682366, "learning_rate": 1.839127446079511e-05, "loss": 1.1921, "step": 3065 }, { "epoch": 1.33, "grad_norm": 2.6222870108950374, "learning_rate": 1.838306284187266e-05, "loss": 0.9654, "step": 3070 }, { "epoch": 1.33, "grad_norm": 1.5701543762068435, "learning_rate": 1.837483216145057e-05, "loss": 1.1834, "step": 3075 }, { "epoch": 1.33, "grad_norm": 5.1520937052207545, "learning_rate": 1.8366582438243858e-05, "loss": 1.0738, "step": 3080 }, { "epoch": 1.33, "grad_norm": 4.697802363483906, "learning_rate": 1.835831369101082e-05, "loss": 0.9288, "step": 3085 }, { "epoch": 1.33, "grad_norm": 5.385202446007638, "learning_rate": 1.8350025938553035e-05, "loss": 0.9979, "step": 3090 }, { "epoch": 1.34, "grad_norm": 2.7177333591313473, "learning_rate": 1.8341719199715285e-05, "loss": 1.2136, "step": 3095 }, { "epoch": 1.34, "grad_norm": 3.4538389386304797, "learning_rate": 1.8333393493385515e-05, "loss": 1.0905, "step": 3100 }, { "epoch": 1.34, "grad_norm": 3.3661278616114965, "learning_rate": 1.8325048838494814e-05, "loss": 1.2895, "step": 3105 }, { "epoch": 1.34, "grad_norm": 1.2397004844541852, "learning_rate": 1.8316685254017347e-05, "loss": 1.0898, "step": 3110 }, { "epoch": 1.35, "grad_norm": 1.2760755063636564, "learning_rate": 1.830830275897032e-05, "loss": 1.0004, "step": 3115 }, { "epoch": 1.35, "grad_norm": 1.336182841450121, "learning_rate": 1.829990137241395e-05, "loss": 0.9718, "step": 3120 }, { "epoch": 1.35, "grad_norm": 1.6532051159468562, "learning_rate": 1.8291481113451394e-05, "loss": 1.2108, "step": 3125 }, { "epoch": 1.35, "grad_norm": 1.831500868110592, "learning_rate": 1.828304200122873e-05, "loss": 1.1943, "step": 3130 }, { "epoch": 1.35, "grad_norm": 5.118380579560918, "learning_rate": 1.8274584054934904e-05, "loss": 1.0709, "step": 3135 }, { "epoch": 1.36, "grad_norm": 2.64507819444403, "learning_rate": 1.8266107293801686e-05, "loss": 0.9887, "step": 3140 }, { "epoch": 1.36, "grad_norm": 2.369535703148114, "learning_rate": 1.8257611737103632e-05, "loss": 1.141, "step": 3145 }, { "epoch": 1.36, "grad_norm": 1.105441798485061, "learning_rate": 1.8249097404158023e-05, "loss": 0.9434, "step": 3150 }, { "epoch": 1.36, "grad_norm": 1.1850093992099566, "learning_rate": 1.8240564314324848e-05, "loss": 1.1414, "step": 3155 }, { "epoch": 1.37, "grad_norm": 3.0187844829993997, "learning_rate": 1.823201248700674e-05, "loss": 1.0408, "step": 3160 }, { "epoch": 1.37, "grad_norm": 2.46409735317593, "learning_rate": 1.8223441941648934e-05, "loss": 1.1907, "step": 3165 }, { "epoch": 1.37, "grad_norm": 1.1050394893702846, "learning_rate": 1.8214852697739232e-05, "loss": 1.1438, "step": 3170 }, { "epoch": 1.37, "grad_norm": 1.926128045528065, "learning_rate": 1.820624477480795e-05, "loss": 1.1118, "step": 3175 }, { "epoch": 1.37, "grad_norm": 1.9044939702214854, "learning_rate": 1.8197618192427874e-05, "loss": 1.2493, "step": 3180 }, { "epoch": 1.38, "grad_norm": 6.721570039073311, "learning_rate": 1.8188972970214226e-05, "loss": 1.1755, "step": 3185 }, { "epoch": 1.38, "grad_norm": 2.19822656617475, "learning_rate": 1.81803091278246e-05, "loss": 1.0238, "step": 3190 }, { "epoch": 1.38, "grad_norm": 2.4481511003429963, "learning_rate": 1.8171626684958944e-05, "loss": 1.0708, "step": 3195 }, { "epoch": 1.38, "grad_norm": 1.0407019264410284, "learning_rate": 1.8162925661359487e-05, "loss": 0.9963, "step": 3200 }, { "epoch": 1.38, "grad_norm": 1.0870254854331745, "learning_rate": 1.8154206076810706e-05, "loss": 1.1444, "step": 3205 }, { "epoch": 1.39, "grad_norm": 1.008606553113266, "learning_rate": 1.8145467951139294e-05, "loss": 1.0423, "step": 3210 }, { "epoch": 1.39, "grad_norm": 2.7404259988281, "learning_rate": 1.813671130421409e-05, "loss": 1.0753, "step": 3215 }, { "epoch": 1.39, "grad_norm": 3.822856198318866, "learning_rate": 1.8127936155946062e-05, "loss": 1.1881, "step": 3220 }, { "epoch": 1.39, "grad_norm": 1.763286636104949, "learning_rate": 1.8119142526288237e-05, "loss": 1.1203, "step": 3225 }, { "epoch": 1.4, "grad_norm": 1.3741982296746016, "learning_rate": 1.8110330435235656e-05, "loss": 1.1767, "step": 3230 }, { "epoch": 1.4, "grad_norm": 3.5252778102399747, "learning_rate": 1.8101499902825354e-05, "loss": 1.0326, "step": 3235 }, { "epoch": 1.4, "grad_norm": 0.6361941017976659, "learning_rate": 1.8092650949136295e-05, "loss": 1.2773, "step": 3240 }, { "epoch": 1.4, "grad_norm": 1.0642655331813502, "learning_rate": 1.808378359428932e-05, "loss": 1.0846, "step": 3245 }, { "epoch": 1.4, "grad_norm": 1.300245514729484, "learning_rate": 1.8074897858447123e-05, "loss": 0.9965, "step": 3250 }, { "epoch": 1.41, "grad_norm": 1.8676058682990833, "learning_rate": 1.806599376181418e-05, "loss": 1.0315, "step": 3255 }, { "epoch": 1.41, "grad_norm": 1.8478689368260564, "learning_rate": 1.805707132463673e-05, "loss": 1.0647, "step": 3260 }, { "epoch": 1.41, "grad_norm": 1.5380958946700751, "learning_rate": 1.8048130567202707e-05, "loss": 1.204, "step": 3265 }, { "epoch": 1.41, "grad_norm": 1.1114063704861679, "learning_rate": 1.8039171509841697e-05, "loss": 1.0207, "step": 3270 }, { "epoch": 1.41, "grad_norm": 1.8248625661555662, "learning_rate": 1.8030194172924906e-05, "loss": 1.3113, "step": 3275 }, { "epoch": 1.42, "grad_norm": 2.4034069263852764, "learning_rate": 1.80211985768651e-05, "loss": 1.1165, "step": 3280 }, { "epoch": 1.42, "grad_norm": 1.6590028307791285, "learning_rate": 1.8012184742116566e-05, "loss": 1.1675, "step": 3285 }, { "epoch": 1.42, "grad_norm": 2.4344348218555933, "learning_rate": 1.8003152689175055e-05, "loss": 0.9994, "step": 3290 }, { "epoch": 1.42, "grad_norm": 2.0011634617126592, "learning_rate": 1.7994102438577753e-05, "loss": 1.1049, "step": 3295 }, { "epoch": 1.43, "grad_norm": 1.8236411685650236, "learning_rate": 1.7985034010903212e-05, "loss": 1.0944, "step": 3300 }, { "epoch": 1.43, "grad_norm": 1.4601573929509397, "learning_rate": 1.7975947426771327e-05, "loss": 1.2179, "step": 3305 }, { "epoch": 1.43, "grad_norm": 4.6685454697583015, "learning_rate": 1.796684270684327e-05, "loss": 1.3198, "step": 3310 }, { "epoch": 1.43, "grad_norm": 5.895936325447172, "learning_rate": 1.795771987182145e-05, "loss": 1.085, "step": 3315 }, { "epoch": 1.43, "grad_norm": 3.0349834958019843, "learning_rate": 1.794857894244947e-05, "loss": 1.243, "step": 3320 }, { "epoch": 1.44, "grad_norm": 2.4051043441438447, "learning_rate": 1.793941993951208e-05, "loss": 1.0862, "step": 3325 }, { "epoch": 1.44, "grad_norm": 5.537290266543172, "learning_rate": 1.7930242883835114e-05, "loss": 0.9081, "step": 3330 }, { "epoch": 1.44, "grad_norm": 2.000136679317919, "learning_rate": 1.7921047796285464e-05, "loss": 0.9509, "step": 3335 }, { "epoch": 1.44, "grad_norm": 2.0237555621206975, "learning_rate": 1.7911834697771017e-05, "loss": 1.0043, "step": 3340 }, { "epoch": 1.44, "grad_norm": 2.876113888412484, "learning_rate": 1.7902603609240623e-05, "loss": 0.9543, "step": 3345 }, { "epoch": 1.45, "grad_norm": 6.833172198802181, "learning_rate": 1.789335455168403e-05, "loss": 1.1886, "step": 3350 }, { "epoch": 1.45, "grad_norm": 2.8740987465913173, "learning_rate": 1.788408754613184e-05, "loss": 1.016, "step": 3355 }, { "epoch": 1.45, "grad_norm": 1.9149073164585646, "learning_rate": 1.7874802613655478e-05, "loss": 0.9419, "step": 3360 }, { "epoch": 1.45, "grad_norm": 4.205668899674492, "learning_rate": 1.786549977536712e-05, "loss": 1.0141, "step": 3365 }, { "epoch": 1.46, "grad_norm": 3.7286171999927227, "learning_rate": 1.7856179052419667e-05, "loss": 1.0132, "step": 3370 }, { "epoch": 1.46, "grad_norm": 3.555982799960603, "learning_rate": 1.7846840466006675e-05, "loss": 0.9953, "step": 3375 }, { "epoch": 1.46, "grad_norm": 2.7729455636026015, "learning_rate": 1.783748403736232e-05, "loss": 0.9252, "step": 3380 }, { "epoch": 1.46, "grad_norm": 5.2333848411684905, "learning_rate": 1.7828109787761364e-05, "loss": 1.03, "step": 3385 }, { "epoch": 1.46, "grad_norm": 4.649862211067886, "learning_rate": 1.7818717738519064e-05, "loss": 1.1632, "step": 3390 }, { "epoch": 1.47, "grad_norm": 2.4148563124642726, "learning_rate": 1.7809307910991177e-05, "loss": 0.8583, "step": 3395 }, { "epoch": 1.47, "grad_norm": 3.7308969676385573, "learning_rate": 1.779988032657386e-05, "loss": 0.921, "step": 3400 }, { "epoch": 1.47, "grad_norm": 10.233259023305683, "learning_rate": 1.7790435006703663e-05, "loss": 1.216, "step": 3405 }, { "epoch": 1.47, "grad_norm": 4.750028602895991, "learning_rate": 1.7780971972857454e-05, "loss": 1.3289, "step": 3410 }, { "epoch": 1.48, "grad_norm": 1.605503970424356, "learning_rate": 1.7771491246552388e-05, "loss": 0.8949, "step": 3415 }, { "epoch": 1.48, "grad_norm": 1.7687292828558763, "learning_rate": 1.776199284934584e-05, "loss": 1.0494, "step": 3420 }, { "epoch": 1.48, "grad_norm": 5.172907684395909, "learning_rate": 1.775247680283537e-05, "loss": 0.8951, "step": 3425 }, { "epoch": 1.48, "grad_norm": 1.759596035216593, "learning_rate": 1.7742943128658664e-05, "loss": 0.9194, "step": 3430 }, { "epoch": 1.48, "grad_norm": 2.40491220347799, "learning_rate": 1.77333918484935e-05, "loss": 0.9912, "step": 3435 }, { "epoch": 1.49, "grad_norm": 3.0396454651265197, "learning_rate": 1.7723822984057682e-05, "loss": 0.9934, "step": 3440 }, { "epoch": 1.49, "grad_norm": 5.964381710275646, "learning_rate": 1.7714236557108998e-05, "loss": 0.9603, "step": 3445 }, { "epoch": 1.49, "grad_norm": 3.0976228058993627, "learning_rate": 1.7704632589445167e-05, "loss": 0.9246, "step": 3450 }, { "epoch": 1.49, "grad_norm": 22.29026429234738, "learning_rate": 1.7695011102903796e-05, "loss": 1.0157, "step": 3455 }, { "epoch": 1.49, "grad_norm": 10.692379917639276, "learning_rate": 1.768537211936233e-05, "loss": 0.9716, "step": 3460 }, { "epoch": 1.5, "grad_norm": 3.9612716203667016, "learning_rate": 1.767571566073799e-05, "loss": 0.8151, "step": 3465 }, { "epoch": 1.5, "grad_norm": 1.464022924418229, "learning_rate": 1.7666041748987744e-05, "loss": 0.9119, "step": 3470 }, { "epoch": 1.5, "grad_norm": 1.2077797246084108, "learning_rate": 1.7656350406108228e-05, "loss": 0.7946, "step": 3475 }, { "epoch": 1.5, "grad_norm": 1.296998157972292, "learning_rate": 1.764664165413573e-05, "loss": 0.905, "step": 3480 }, { "epoch": 1.51, "grad_norm": 1.6098965454363554, "learning_rate": 1.763691551514611e-05, "loss": 0.8778, "step": 3485 }, { "epoch": 1.51, "grad_norm": 3.6816917985635995, "learning_rate": 1.7627172011254777e-05, "loss": 1.0083, "step": 3490 }, { "epoch": 1.51, "grad_norm": 3.8188571824489963, "learning_rate": 1.7617411164616613e-05, "loss": 0.9896, "step": 3495 }, { "epoch": 1.51, "grad_norm": 4.920995898630756, "learning_rate": 1.7607632997425936e-05, "loss": 1.0074, "step": 3500 }, { "epoch": 1.51, "grad_norm": 4.921413022519996, "learning_rate": 1.7597837531916447e-05, "loss": 1.1118, "step": 3505 }, { "epoch": 1.52, "grad_norm": 7.1316673838198925, "learning_rate": 1.758802479036119e-05, "loss": 1.0156, "step": 3510 }, { "epoch": 1.52, "grad_norm": 3.939674963752455, "learning_rate": 1.7578194795072474e-05, "loss": 0.9165, "step": 3515 }, { "epoch": 1.52, "grad_norm": 2.5775877484888063, "learning_rate": 1.7568347568401857e-05, "loss": 1.0156, "step": 3520 }, { "epoch": 1.52, "grad_norm": 1.5703752754794065, "learning_rate": 1.7558483132740066e-05, "loss": 0.9154, "step": 3525 }, { "epoch": 1.52, "grad_norm": 1.4765467973691762, "learning_rate": 1.7548601510516965e-05, "loss": 0.9683, "step": 3530 }, { "epoch": 1.53, "grad_norm": 1.4821344651743134, "learning_rate": 1.7538702724201492e-05, "loss": 0.8375, "step": 3535 }, { "epoch": 1.53, "grad_norm": 1.0318729292387396, "learning_rate": 1.7528786796301615e-05, "loss": 0.8248, "step": 3540 }, { "epoch": 1.53, "grad_norm": 2.2167517550488425, "learning_rate": 1.7518853749364283e-05, "loss": 0.9371, "step": 3545 }, { "epoch": 1.53, "grad_norm": 2.876447434863973, "learning_rate": 1.750890360597536e-05, "loss": 0.9319, "step": 3550 }, { "epoch": 1.54, "grad_norm": 4.626600470000748, "learning_rate": 1.7498936388759596e-05, "loss": 0.8646, "step": 3555 }, { "epoch": 1.54, "grad_norm": 3.0238441858852942, "learning_rate": 1.7488952120380556e-05, "loss": 0.9372, "step": 3560 }, { "epoch": 1.54, "grad_norm": 3.3222120957614663, "learning_rate": 1.747895082354058e-05, "loss": 0.8487, "step": 3565 }, { "epoch": 1.54, "grad_norm": 8.28429504970897, "learning_rate": 1.7468932520980722e-05, "loss": 0.9472, "step": 3570 }, { "epoch": 1.54, "grad_norm": 2.807509123378574, "learning_rate": 1.745889723548071e-05, "loss": 1.0485, "step": 3575 }, { "epoch": 1.55, "grad_norm": 5.081306727094978, "learning_rate": 1.7448844989858886e-05, "loss": 0.7305, "step": 3580 }, { "epoch": 1.55, "grad_norm": 3.429895318349554, "learning_rate": 1.7438775806972153e-05, "loss": 0.8123, "step": 3585 }, { "epoch": 1.55, "grad_norm": 3.4459765311838515, "learning_rate": 1.7428689709715928e-05, "loss": 0.8061, "step": 3590 }, { "epoch": 1.55, "grad_norm": 4.417656520258387, "learning_rate": 1.7418586721024096e-05, "loss": 0.8281, "step": 3595 }, { "epoch": 1.56, "grad_norm": 3.502368126389993, "learning_rate": 1.7408466863868935e-05, "loss": 0.8285, "step": 3600 }, { "epoch": 1.56, "grad_norm": 2.1996075307453475, "learning_rate": 1.739833016126109e-05, "loss": 0.7089, "step": 3605 }, { "epoch": 1.56, "grad_norm": 1.5582767433616693, "learning_rate": 1.7388176636249508e-05, "loss": 0.7903, "step": 3610 }, { "epoch": 1.56, "grad_norm": 1.9699367067357896, "learning_rate": 1.737800631192138e-05, "loss": 0.8573, "step": 3615 }, { "epoch": 1.56, "grad_norm": 5.25889815456181, "learning_rate": 1.7367819211402102e-05, "loss": 0.9748, "step": 3620 }, { "epoch": 1.57, "grad_norm": 4.646179346178722, "learning_rate": 1.7357615357855214e-05, "loss": 0.9594, "step": 3625 }, { "epoch": 1.57, "grad_norm": 6.136196120838206, "learning_rate": 1.7347394774482352e-05, "loss": 0.8544, "step": 3630 }, { "epoch": 1.57, "grad_norm": 0.8175188488490657, "learning_rate": 1.733715748452319e-05, "loss": 0.806, "step": 3635 }, { "epoch": 1.57, "grad_norm": 3.460932815967537, "learning_rate": 1.7326903511255385e-05, "loss": 0.7996, "step": 3640 }, { "epoch": 1.57, "grad_norm": 2.878479232664424, "learning_rate": 1.7316632877994536e-05, "loss": 1.0603, "step": 3645 }, { "epoch": 1.58, "grad_norm": 2.8505217948290817, "learning_rate": 1.7306345608094118e-05, "loss": 0.7371, "step": 3650 }, { "epoch": 1.58, "grad_norm": 4.88241799283637, "learning_rate": 1.7296041724945444e-05, "loss": 0.8434, "step": 3655 }, { "epoch": 1.58, "grad_norm": 1.741185669970214, "learning_rate": 1.7285721251977587e-05, "loss": 1.04, "step": 3660 }, { "epoch": 1.58, "grad_norm": 1.3676240280021945, "learning_rate": 1.727538421265736e-05, "loss": 0.9171, "step": 3665 }, { "epoch": 1.59, "grad_norm": 2.8829408528987894, "learning_rate": 1.7265030630489225e-05, "loss": 0.8408, "step": 3670 }, { "epoch": 1.59, "grad_norm": 2.4303674261915633, "learning_rate": 1.725466052901528e-05, "loss": 0.7715, "step": 3675 }, { "epoch": 1.59, "grad_norm": 3.0475458834050135, "learning_rate": 1.724427393181517e-05, "loss": 0.9401, "step": 3680 }, { "epoch": 1.59, "grad_norm": 7.430831583314274, "learning_rate": 1.7233870862506056e-05, "loss": 0.8857, "step": 3685 }, { "epoch": 1.59, "grad_norm": 2.4942060149245893, "learning_rate": 1.7223451344742543e-05, "loss": 0.8022, "step": 3690 }, { "epoch": 1.6, "grad_norm": 3.6192672645199626, "learning_rate": 1.721301540221665e-05, "loss": 0.7113, "step": 3695 }, { "epoch": 1.6, "grad_norm": 5.739159713290128, "learning_rate": 1.7202563058657742e-05, "loss": 0.8907, "step": 3700 }, { "epoch": 1.6, "grad_norm": 3.707239332984955, "learning_rate": 1.719209433783246e-05, "loss": 0.9802, "step": 3705 }, { "epoch": 1.6, "grad_norm": 4.992264989017978, "learning_rate": 1.7181609263544706e-05, "loss": 0.7683, "step": 3710 }, { "epoch": 1.6, "grad_norm": 2.2368251184952026, "learning_rate": 1.717110785963555e-05, "loss": 0.7963, "step": 3715 }, { "epoch": 1.61, "grad_norm": 4.286388500117645, "learning_rate": 1.7160590149983195e-05, "loss": 0.9671, "step": 3720 }, { "epoch": 1.61, "grad_norm": 2.6902602240723628, "learning_rate": 1.715005615850293e-05, "loss": 0.8457, "step": 3725 }, { "epoch": 1.61, "grad_norm": 2.476741017909578, "learning_rate": 1.713950590914706e-05, "loss": 0.8622, "step": 3730 }, { "epoch": 1.61, "grad_norm": 11.302682992388872, "learning_rate": 1.7128939425904857e-05, "loss": 0.8139, "step": 3735 }, { "epoch": 1.62, "grad_norm": 5.760632967816922, "learning_rate": 1.7118356732802502e-05, "loss": 0.9834, "step": 3740 }, { "epoch": 1.62, "grad_norm": 4.72617092213642, "learning_rate": 1.710775785390304e-05, "loss": 0.8057, "step": 3745 }, { "epoch": 1.62, "grad_norm": 2.867135354009946, "learning_rate": 1.709714281330632e-05, "loss": 0.7903, "step": 3750 }, { "epoch": 1.62, "grad_norm": 5.584505123973735, "learning_rate": 1.7086511635148926e-05, "loss": 1.0098, "step": 3755 }, { "epoch": 1.62, "grad_norm": 1.3507238105944468, "learning_rate": 1.7075864343604162e-05, "loss": 0.6761, "step": 3760 }, { "epoch": 1.63, "grad_norm": 5.062844299889445, "learning_rate": 1.7065200962881946e-05, "loss": 0.9111, "step": 3765 }, { "epoch": 1.63, "grad_norm": 1.420206926325427, "learning_rate": 1.7054521517228792e-05, "loss": 0.9262, "step": 3770 }, { "epoch": 1.63, "grad_norm": 3.400362820427134, "learning_rate": 1.704382603092774e-05, "loss": 0.7034, "step": 3775 }, { "epoch": 1.63, "grad_norm": 13.410473444763928, "learning_rate": 1.7033114528298304e-05, "loss": 0.8797, "step": 3780 }, { "epoch": 1.63, "grad_norm": 4.227659804805431, "learning_rate": 1.7022387033696413e-05, "loss": 0.8266, "step": 3785 }, { "epoch": 1.64, "grad_norm": 11.123371302509794, "learning_rate": 1.7011643571514365e-05, "loss": 0.8825, "step": 3790 }, { "epoch": 1.64, "grad_norm": 7.0109121600981466, "learning_rate": 1.7000884166180754e-05, "loss": 1.0255, "step": 3795 }, { "epoch": 1.64, "grad_norm": 3.233226584132477, "learning_rate": 1.6990108842160445e-05, "loss": 0.8432, "step": 3800 }, { "epoch": 1.64, "grad_norm": 2.6243800886514235, "learning_rate": 1.6979317623954476e-05, "loss": 0.7495, "step": 3805 }, { "epoch": 1.65, "grad_norm": 4.804997476595591, "learning_rate": 1.6968510536100037e-05, "loss": 0.7711, "step": 3810 }, { "epoch": 1.65, "grad_norm": 3.084026561599956, "learning_rate": 1.6957687603170413e-05, "loss": 0.8088, "step": 3815 }, { "epoch": 1.65, "grad_norm": 8.949623608113457, "learning_rate": 1.6946848849774894e-05, "loss": 0.8532, "step": 3820 }, { "epoch": 1.65, "grad_norm": 4.778283870889809, "learning_rate": 1.693599430055876e-05, "loss": 0.9558, "step": 3825 }, { "epoch": 1.65, "grad_norm": 4.587210475944901, "learning_rate": 1.692512398020321e-05, "loss": 0.7157, "step": 3830 }, { "epoch": 1.66, "grad_norm": 1.7434908938272295, "learning_rate": 1.691423791342529e-05, "loss": 0.877, "step": 3835 }, { "epoch": 1.66, "grad_norm": 1.6581151680906558, "learning_rate": 1.6903336124977858e-05, "loss": 0.8128, "step": 3840 }, { "epoch": 1.66, "grad_norm": 2.8079967808156883, "learning_rate": 1.689241863964952e-05, "loss": 0.9081, "step": 3845 }, { "epoch": 1.66, "grad_norm": 0.7986015484972644, "learning_rate": 1.688148548226457e-05, "loss": 0.7597, "step": 3850 }, { "epoch": 1.67, "grad_norm": 4.498669451011208, "learning_rate": 1.687053667768295e-05, "loss": 0.7931, "step": 3855 }, { "epoch": 1.67, "grad_norm": 3.124000175896055, "learning_rate": 1.685957225080016e-05, "loss": 0.7858, "step": 3860 }, { "epoch": 1.67, "grad_norm": 4.364340657244522, "learning_rate": 1.684859222654724e-05, "loss": 0.9062, "step": 3865 }, { "epoch": 1.67, "grad_norm": 7.350243693509682, "learning_rate": 1.6837596629890683e-05, "loss": 1.05, "step": 3870 }, { "epoch": 1.67, "grad_norm": 3.2882320910324405, "learning_rate": 1.6826585485832408e-05, "loss": 0.8192, "step": 3875 }, { "epoch": 1.68, "grad_norm": 4.778392575192595, "learning_rate": 1.6815558819409662e-05, "loss": 0.9379, "step": 3880 }, { "epoch": 1.68, "grad_norm": 3.318104844054457, "learning_rate": 1.6804516655695003e-05, "loss": 0.8725, "step": 3885 }, { "epoch": 1.68, "grad_norm": 2.264270858290099, "learning_rate": 1.6793459019796226e-05, "loss": 0.6963, "step": 3890 }, { "epoch": 1.68, "grad_norm": 4.01933074861301, "learning_rate": 1.67823859368563e-05, "loss": 0.805, "step": 3895 }, { "epoch": 1.68, "grad_norm": 2.9667750183356283, "learning_rate": 1.6771297432053324e-05, "loss": 0.7698, "step": 3900 }, { "epoch": 1.69, "grad_norm": 2.8413147859622034, "learning_rate": 1.6760193530600463e-05, "loss": 0.8624, "step": 3905 }, { "epoch": 1.69, "grad_norm": 0.8762946124497302, "learning_rate": 1.6749074257745886e-05, "loss": 0.7549, "step": 3910 }, { "epoch": 1.69, "grad_norm": 1.516106158568189, "learning_rate": 1.6737939638772717e-05, "loss": 1.0617, "step": 3915 }, { "epoch": 1.69, "grad_norm": 2.6633739389265387, "learning_rate": 1.6726789698998975e-05, "loss": 0.7438, "step": 3920 }, { "epoch": 1.7, "grad_norm": 5.437903737579578, "learning_rate": 1.6715624463777517e-05, "loss": 0.879, "step": 3925 }, { "epoch": 1.7, "grad_norm": 1.5278118733471453, "learning_rate": 1.6704443958495968e-05, "loss": 0.9221, "step": 3930 }, { "epoch": 1.7, "grad_norm": 1.463363203352676, "learning_rate": 1.6693248208576692e-05, "loss": 0.847, "step": 3935 }, { "epoch": 1.7, "grad_norm": 1.8308055767789655, "learning_rate": 1.6682037239476704e-05, "loss": 0.715, "step": 3940 }, { "epoch": 1.7, "grad_norm": 3.1489574436454206, "learning_rate": 1.6670811076687625e-05, "loss": 0.8082, "step": 3945 }, { "epoch": 1.71, "grad_norm": 1.7250411190465171, "learning_rate": 1.665956974573563e-05, "loss": 0.781, "step": 3950 }, { "epoch": 1.71, "grad_norm": 1.3382497564513038, "learning_rate": 1.6648313272181376e-05, "loss": 0.7784, "step": 3955 }, { "epoch": 1.71, "grad_norm": 3.455075057480329, "learning_rate": 1.6637041681619965e-05, "loss": 0.8104, "step": 3960 }, { "epoch": 1.71, "grad_norm": 5.418598141306543, "learning_rate": 1.6625754999680848e-05, "loss": 0.7615, "step": 3965 }, { "epoch": 1.71, "grad_norm": 4.440826447631677, "learning_rate": 1.6614453252027812e-05, "loss": 0.8031, "step": 3970 }, { "epoch": 1.72, "grad_norm": 3.70035610129939, "learning_rate": 1.6603136464358904e-05, "loss": 0.773, "step": 3975 }, { "epoch": 1.72, "grad_norm": 5.716173129679144, "learning_rate": 1.6591804662406338e-05, "loss": 0.7824, "step": 3980 }, { "epoch": 1.72, "grad_norm": 1.5729883041830204, "learning_rate": 1.6580457871936506e-05, "loss": 0.8418, "step": 3985 }, { "epoch": 1.72, "grad_norm": 6.022112190463788, "learning_rate": 1.6569096118749858e-05, "loss": 0.7688, "step": 3990 }, { "epoch": 1.73, "grad_norm": 1.3221927220586283, "learning_rate": 1.6557719428680872e-05, "loss": 0.758, "step": 3995 }, { "epoch": 1.73, "grad_norm": 3.799747978987853, "learning_rate": 1.654632782759799e-05, "loss": 0.8551, "step": 4000 }, { "epoch": 1.73, "grad_norm": 1.2942585118978511, "learning_rate": 1.653492134140356e-05, "loss": 0.7883, "step": 4005 }, { "epoch": 1.73, "grad_norm": 2.7212654259408304, "learning_rate": 1.652349999603377e-05, "loss": 0.8271, "step": 4010 }, { "epoch": 1.73, "grad_norm": 2.5262239960585866, "learning_rate": 1.6512063817458605e-05, "loss": 0.8504, "step": 4015 }, { "epoch": 1.74, "grad_norm": 3.1183416250542306, "learning_rate": 1.650061283168177e-05, "loss": 0.6711, "step": 4020 }, { "epoch": 1.74, "grad_norm": 2.397589020478298, "learning_rate": 1.6489147064740642e-05, "loss": 0.7684, "step": 4025 }, { "epoch": 1.74, "grad_norm": 3.656025571054232, "learning_rate": 1.6477666542706207e-05, "loss": 0.8718, "step": 4030 }, { "epoch": 1.74, "grad_norm": 2.168095980113938, "learning_rate": 1.6466171291683e-05, "loss": 0.821, "step": 4035 }, { "epoch": 1.75, "grad_norm": 3.574224637226931, "learning_rate": 1.6454661337809055e-05, "loss": 0.7481, "step": 4040 }, { "epoch": 1.75, "grad_norm": 5.950088811281826, "learning_rate": 1.6443136707255818e-05, "loss": 0.7502, "step": 4045 }, { "epoch": 1.75, "grad_norm": 3.655860963651843, "learning_rate": 1.6431597426228126e-05, "loss": 0.8137, "step": 4050 }, { "epoch": 1.75, "grad_norm": 3.999879119916261, "learning_rate": 1.642004352096412e-05, "loss": 0.863, "step": 4055 }, { "epoch": 1.75, "grad_norm": 2.3109457640682876, "learning_rate": 1.6408475017735202e-05, "loss": 0.7669, "step": 4060 }, { "epoch": 1.76, "grad_norm": 3.0774638339199347, "learning_rate": 1.6396891942845954e-05, "loss": 0.7902, "step": 4065 }, { "epoch": 1.76, "grad_norm": 3.286986353132523, "learning_rate": 1.6385294322634098e-05, "loss": 0.9422, "step": 4070 }, { "epoch": 1.76, "grad_norm": 3.0032940152432426, "learning_rate": 1.6373682183470433e-05, "loss": 0.8383, "step": 4075 }, { "epoch": 1.76, "grad_norm": 4.1268268918520565, "learning_rate": 1.6362055551758767e-05, "loss": 0.741, "step": 4080 }, { "epoch": 1.76, "grad_norm": 3.36178597516033, "learning_rate": 1.635041445393586e-05, "loss": 0.8094, "step": 4085 }, { "epoch": 1.77, "grad_norm": 2.9084683387629076, "learning_rate": 1.633875891647137e-05, "loss": 0.8543, "step": 4090 }, { "epoch": 1.77, "grad_norm": 3.0795495163167526, "learning_rate": 1.6327088965867776e-05, "loss": 0.868, "step": 4095 }, { "epoch": 1.77, "grad_norm": 1.6954478345935486, "learning_rate": 1.6315404628660353e-05, "loss": 0.8003, "step": 4100 }, { "epoch": 1.77, "grad_norm": 2.630623426076288, "learning_rate": 1.630370593141706e-05, "loss": 0.851, "step": 4105 }, { "epoch": 1.78, "grad_norm": 8.6511753159283, "learning_rate": 1.6291992900738533e-05, "loss": 0.6028, "step": 4110 }, { "epoch": 1.78, "grad_norm": 1.8881802149607876, "learning_rate": 1.6280265563257985e-05, "loss": 0.9412, "step": 4115 }, { "epoch": 1.78, "grad_norm": 1.4693485910253388, "learning_rate": 1.6268523945641167e-05, "loss": 0.6772, "step": 4120 }, { "epoch": 1.78, "grad_norm": 2.9003838100116686, "learning_rate": 1.625676807458629e-05, "loss": 0.8457, "step": 4125 }, { "epoch": 1.78, "grad_norm": 2.589692655837404, "learning_rate": 1.6244997976823985e-05, "loss": 0.6244, "step": 4130 }, { "epoch": 1.79, "grad_norm": 2.956900769234369, "learning_rate": 1.6233213679117232e-05, "loss": 0.6138, "step": 4135 }, { "epoch": 1.79, "grad_norm": 4.024754567975696, "learning_rate": 1.6221415208261293e-05, "loss": 0.7175, "step": 4140 }, { "epoch": 1.79, "grad_norm": 2.4529650232289715, "learning_rate": 1.6209602591083657e-05, "loss": 0.6782, "step": 4145 }, { "epoch": 1.79, "grad_norm": 4.58521235942626, "learning_rate": 1.6197775854443985e-05, "loss": 0.6483, "step": 4150 }, { "epoch": 1.79, "grad_norm": 4.733052959198119, "learning_rate": 1.6185935025234036e-05, "loss": 0.7645, "step": 4155 }, { "epoch": 1.8, "grad_norm": 7.068612856829745, "learning_rate": 1.6174080130377622e-05, "loss": 0.9781, "step": 4160 }, { "epoch": 1.8, "grad_norm": 4.417592764702344, "learning_rate": 1.6162211196830527e-05, "loss": 0.9102, "step": 4165 }, { "epoch": 1.8, "grad_norm": 3.6303747370484167, "learning_rate": 1.6150328251580468e-05, "loss": 0.8287, "step": 4170 }, { "epoch": 1.8, "grad_norm": 7.138484240792349, "learning_rate": 1.613843132164701e-05, "loss": 0.872, "step": 4175 }, { "epoch": 1.81, "grad_norm": 3.918036791068465, "learning_rate": 1.6126520434081516e-05, "loss": 0.7506, "step": 4180 }, { "epoch": 1.81, "grad_norm": 5.0901783852976425, "learning_rate": 1.6114595615967106e-05, "loss": 0.9017, "step": 4185 }, { "epoch": 1.81, "grad_norm": 2.045410905898512, "learning_rate": 1.6102656894418553e-05, "loss": 0.7683, "step": 4190 }, { "epoch": 1.81, "grad_norm": 2.6632339628142594, "learning_rate": 1.609070429658225e-05, "loss": 0.7586, "step": 4195 }, { "epoch": 1.81, "grad_norm": 1.3827299405985718, "learning_rate": 1.607873784963615e-05, "loss": 0.8765, "step": 4200 }, { "epoch": 1.82, "grad_norm": 3.0920671391775727, "learning_rate": 1.6066757580789688e-05, "loss": 0.7257, "step": 4205 }, { "epoch": 1.82, "grad_norm": 2.3255694701334635, "learning_rate": 1.6054763517283732e-05, "loss": 0.6743, "step": 4210 }, { "epoch": 1.82, "grad_norm": 2.2854911864107286, "learning_rate": 1.604275568639051e-05, "loss": 0.6999, "step": 4215 }, { "epoch": 1.82, "grad_norm": 2.6973111281931277, "learning_rate": 1.603073411541356e-05, "loss": 0.6559, "step": 4220 }, { "epoch": 1.83, "grad_norm": 1.711774755331081, "learning_rate": 1.6018698831687666e-05, "loss": 0.7051, "step": 4225 }, { "epoch": 1.83, "grad_norm": 1.7553915583220734, "learning_rate": 1.6006649862578778e-05, "loss": 0.8272, "step": 4230 }, { "epoch": 1.83, "grad_norm": 2.6007853775805665, "learning_rate": 1.599458723548398e-05, "loss": 0.7484, "step": 4235 }, { "epoch": 1.83, "grad_norm": 2.5226516379121358, "learning_rate": 1.5982510977831408e-05, "loss": 0.6877, "step": 4240 }, { "epoch": 1.83, "grad_norm": 3.130361730803545, "learning_rate": 1.5970421117080177e-05, "loss": 0.6707, "step": 4245 }, { "epoch": 1.84, "grad_norm": 1.8182316559095062, "learning_rate": 1.5958317680720355e-05, "loss": 0.6941, "step": 4250 }, { "epoch": 1.84, "grad_norm": 2.8174603247256926, "learning_rate": 1.5946200696272863e-05, "loss": 0.9454, "step": 4255 }, { "epoch": 1.84, "grad_norm": 1.5718684552163051, "learning_rate": 1.5934070191289433e-05, "loss": 0.8102, "step": 4260 }, { "epoch": 1.84, "grad_norm": 7.559817790858044, "learning_rate": 1.5921926193352538e-05, "loss": 0.8674, "step": 4265 }, { "epoch": 1.84, "grad_norm": 3.9279347722065308, "learning_rate": 1.5909768730075333e-05, "loss": 0.8646, "step": 4270 }, { "epoch": 1.85, "grad_norm": 7.256637087042067, "learning_rate": 1.5897597829101595e-05, "loss": 0.7771, "step": 4275 }, { "epoch": 1.85, "grad_norm": 6.071624777714126, "learning_rate": 1.588541351810564e-05, "loss": 0.619, "step": 4280 }, { "epoch": 1.85, "grad_norm": 11.2771501825943, "learning_rate": 1.58732158247923e-05, "loss": 0.7805, "step": 4285 }, { "epoch": 1.85, "grad_norm": 13.152048939078416, "learning_rate": 1.586100477689682e-05, "loss": 0.9266, "step": 4290 }, { "epoch": 1.86, "grad_norm": 13.753922000544033, "learning_rate": 1.5848780402184808e-05, "loss": 0.7702, "step": 4295 }, { "epoch": 1.86, "grad_norm": 7.27610899912322, "learning_rate": 1.5836542728452185e-05, "loss": 0.7946, "step": 4300 }, { "epoch": 1.86, "grad_norm": 2.368126946549542, "learning_rate": 1.5824291783525106e-05, "loss": 0.6348, "step": 4305 }, { "epoch": 1.86, "grad_norm": 1.146122972648381, "learning_rate": 1.5812027595259906e-05, "loss": 0.7269, "step": 4310 }, { "epoch": 1.86, "grad_norm": 1.8289636619528542, "learning_rate": 1.579975019154302e-05, "loss": 0.7495, "step": 4315 }, { "epoch": 1.87, "grad_norm": 2.165288148545283, "learning_rate": 1.5787459600290954e-05, "loss": 0.7305, "step": 4320 }, { "epoch": 1.87, "grad_norm": 4.155506709529135, "learning_rate": 1.577515584945018e-05, "loss": 0.8006, "step": 4325 }, { "epoch": 1.87, "grad_norm": 2.6227326930585053, "learning_rate": 1.5762838966997107e-05, "loss": 0.7165, "step": 4330 }, { "epoch": 1.87, "grad_norm": 7.036639513352053, "learning_rate": 1.5750508980937993e-05, "loss": 0.777, "step": 4335 }, { "epoch": 1.87, "grad_norm": 6.766310299818976, "learning_rate": 1.5738165919308895e-05, "loss": 0.8287, "step": 4340 }, { "epoch": 1.88, "grad_norm": 1.2420623522486884, "learning_rate": 1.5725809810175602e-05, "loss": 0.7338, "step": 4345 }, { "epoch": 1.88, "grad_norm": 6.564640372437988, "learning_rate": 1.5713440681633566e-05, "loss": 0.767, "step": 4350 }, { "epoch": 1.88, "grad_norm": 3.0083078851125267, "learning_rate": 1.5701058561807853e-05, "loss": 0.7465, "step": 4355 }, { "epoch": 1.88, "grad_norm": 1.366516155826671, "learning_rate": 1.5688663478853058e-05, "loss": 0.7986, "step": 4360 }, { "epoch": 1.89, "grad_norm": 1.5525097970638504, "learning_rate": 1.5676255460953254e-05, "loss": 0.8143, "step": 4365 }, { "epoch": 1.89, "grad_norm": 2.4683161535080473, "learning_rate": 1.566383453632193e-05, "loss": 0.7335, "step": 4370 }, { "epoch": 1.89, "grad_norm": 1.852877416756737, "learning_rate": 1.565140073320192e-05, "loss": 0.7425, "step": 4375 }, { "epoch": 1.89, "grad_norm": 1.4269521394757336, "learning_rate": 1.5638954079865342e-05, "loss": 0.7322, "step": 4380 }, { "epoch": 1.89, "grad_norm": 2.338580529714504, "learning_rate": 1.5626494604613528e-05, "loss": 0.9325, "step": 4385 }, { "epoch": 1.9, "grad_norm": 2.708983569930317, "learning_rate": 1.561402233577697e-05, "loss": 0.8415, "step": 4390 }, { "epoch": 1.9, "grad_norm": 2.8397511266914304, "learning_rate": 1.5601537301715247e-05, "loss": 0.7672, "step": 4395 }, { "epoch": 1.9, "grad_norm": 1.4728688538265324, "learning_rate": 1.558903953081697e-05, "loss": 0.6873, "step": 4400 }, { "epoch": 1.9, "grad_norm": 2.4185636195682747, "learning_rate": 1.5576529051499707e-05, "loss": 0.7151, "step": 4405 }, { "epoch": 1.9, "grad_norm": 3.931984808710276, "learning_rate": 1.5564005892209916e-05, "loss": 0.7446, "step": 4410 }, { "epoch": 1.91, "grad_norm": 3.9161564917715075, "learning_rate": 1.55514700814229e-05, "loss": 0.8391, "step": 4415 }, { "epoch": 1.91, "grad_norm": 1.3389597797447057, "learning_rate": 1.5538921647642718e-05, "loss": 0.7746, "step": 4420 }, { "epoch": 1.91, "grad_norm": 2.1550095910375564, "learning_rate": 1.5526360619402138e-05, "loss": 0.6443, "step": 4425 }, { "epoch": 1.91, "grad_norm": 5.377982371644991, "learning_rate": 1.5513787025262556e-05, "loss": 0.7172, "step": 4430 }, { "epoch": 1.92, "grad_norm": 3.4585542128723183, "learning_rate": 1.550120089381396e-05, "loss": 0.8732, "step": 4435 }, { "epoch": 1.92, "grad_norm": 15.135963545385831, "learning_rate": 1.548860225367482e-05, "loss": 0.8467, "step": 4440 }, { "epoch": 1.92, "grad_norm": 4.608444491060904, "learning_rate": 1.5475991133492068e-05, "loss": 0.6725, "step": 4445 }, { "epoch": 1.92, "grad_norm": 15.522224892151614, "learning_rate": 1.5463367561941008e-05, "loss": 0.6184, "step": 4450 }, { "epoch": 1.92, "grad_norm": 13.838682827729304, "learning_rate": 1.5450731567725248e-05, "loss": 0.8473, "step": 4455 }, { "epoch": 1.93, "grad_norm": 10.190353817674348, "learning_rate": 1.5438083179576654e-05, "loss": 0.8967, "step": 4460 }, { "epoch": 1.93, "grad_norm": 4.046631043939276, "learning_rate": 1.542542242625527e-05, "loss": 0.8575, "step": 4465 }, { "epoch": 1.93, "grad_norm": 9.244099006352315, "learning_rate": 1.541274933654925e-05, "loss": 0.783, "step": 4470 }, { "epoch": 1.93, "grad_norm": 5.994363890660864, "learning_rate": 1.540006393927481e-05, "loss": 0.8754, "step": 4475 }, { "epoch": 1.94, "grad_norm": 16.806360328638917, "learning_rate": 1.538736626327614e-05, "loss": 0.7075, "step": 4480 }, { "epoch": 1.94, "grad_norm": 3.7191893623583687, "learning_rate": 1.5374656337425356e-05, "loss": 0.6038, "step": 4485 }, { "epoch": 1.94, "grad_norm": 2.6278813426794176, "learning_rate": 1.536193419062242e-05, "loss": 0.8328, "step": 4490 }, { "epoch": 1.94, "grad_norm": 3.1733988195669207, "learning_rate": 1.5349199851795095e-05, "loss": 0.8071, "step": 4495 }, { "epoch": 1.94, "grad_norm": 4.189600230957689, "learning_rate": 1.533645334989886e-05, "loss": 0.8508, "step": 4500 }, { "epoch": 1.95, "grad_norm": 10.22818859017594, "learning_rate": 1.5323694713916845e-05, "loss": 0.7205, "step": 4505 }, { "epoch": 1.95, "grad_norm": 1.9802329865191948, "learning_rate": 1.531092397285978e-05, "loss": 0.7696, "step": 4510 }, { "epoch": 1.95, "grad_norm": 6.172727439763833, "learning_rate": 1.5298141155765907e-05, "loss": 0.7767, "step": 4515 }, { "epoch": 1.95, "grad_norm": 3.088104692698883, "learning_rate": 1.5285346291700945e-05, "loss": 0.7553, "step": 4520 }, { "epoch": 1.95, "grad_norm": 1.83864815532713, "learning_rate": 1.5272539409757992e-05, "loss": 0.6233, "step": 4525 }, { "epoch": 1.96, "grad_norm": 3.214823490347084, "learning_rate": 1.5259720539057474e-05, "loss": 0.6767, "step": 4530 }, { "epoch": 1.96, "grad_norm": 3.844831063170245, "learning_rate": 1.524688970874708e-05, "loss": 0.7351, "step": 4535 }, { "epoch": 1.96, "grad_norm": 11.119244612077713, "learning_rate": 1.5234046948001688e-05, "loss": 0.7056, "step": 4540 }, { "epoch": 1.96, "grad_norm": 2.0176072517741206, "learning_rate": 1.5221192286023311e-05, "loss": 0.6993, "step": 4545 }, { "epoch": 1.97, "grad_norm": 2.6533601248298404, "learning_rate": 1.5208325752041023e-05, "loss": 0.7078, "step": 4550 }, { "epoch": 1.97, "grad_norm": 4.065798443051646, "learning_rate": 1.5195447375310884e-05, "loss": 0.6986, "step": 4555 }, { "epoch": 1.97, "grad_norm": 3.744211055171865, "learning_rate": 1.518255718511589e-05, "loss": 0.6592, "step": 4560 }, { "epoch": 1.97, "grad_norm": 3.9609503029691746, "learning_rate": 1.516965521076589e-05, "loss": 0.8221, "step": 4565 }, { "epoch": 1.97, "grad_norm": 1.6094015126459182, "learning_rate": 1.515674148159754e-05, "loss": 0.6243, "step": 4570 }, { "epoch": 1.98, "grad_norm": 3.4156081882227625, "learning_rate": 1.5143816026974221e-05, "loss": 0.7139, "step": 4575 }, { "epoch": 1.98, "grad_norm": 1.9510998473531977, "learning_rate": 1.5130878876285965e-05, "loss": 0.7618, "step": 4580 }, { "epoch": 1.98, "grad_norm": 1.668314185972532, "learning_rate": 1.511793005894941e-05, "loss": 0.7907, "step": 4585 }, { "epoch": 1.98, "grad_norm": 1.6058117505109508, "learning_rate": 1.5104969604407716e-05, "loss": 0.659, "step": 4590 }, { "epoch": 1.98, "grad_norm": 3.5132785483692635, "learning_rate": 1.5091997542130506e-05, "loss": 0.786, "step": 4595 }, { "epoch": 1.99, "grad_norm": 6.732887645085844, "learning_rate": 1.5079013901613802e-05, "loss": 0.7016, "step": 4600 }, { "epoch": 1.99, "grad_norm": 1.423445894358691, "learning_rate": 1.5066018712379939e-05, "loss": 0.7545, "step": 4605 }, { "epoch": 1.99, "grad_norm": 1.5043065712268024, "learning_rate": 1.5053012003977527e-05, "loss": 0.8126, "step": 4610 }, { "epoch": 1.99, "grad_norm": 3.0586231775629615, "learning_rate": 1.5039993805981357e-05, "loss": 0.7461, "step": 4615 }, { "epoch": 2.0, "grad_norm": 6.052595972323311, "learning_rate": 1.5026964147992348e-05, "loss": 0.7427, "step": 4620 }, { "epoch": 2.0, "grad_norm": 2.0733693886967393, "learning_rate": 1.5013923059637486e-05, "loss": 0.6624, "step": 4625 }, { "epoch": 2.0, "grad_norm": 4.687342403320873, "learning_rate": 1.500087057056973e-05, "loss": 0.7793, "step": 4630 }, { "epoch": 2.0, "eval_loss": 5.413473606109619, "eval_runtime": 351.2973, "eval_samples_per_second": 21.369, "eval_steps_per_second": 0.336, "step": 4630 }, { "epoch": 2.0, "grad_norm": 3.6681478113794004, "learning_rate": 1.498780671046798e-05, "loss": 0.734, "step": 4635 }, { "epoch": 2.0, "grad_norm": 7.802830523737377, "learning_rate": 1.497473150903698e-05, "loss": 0.7815, "step": 4640 }, { "epoch": 2.01, "grad_norm": 8.109033409317203, "learning_rate": 1.496164499600726e-05, "loss": 0.829, "step": 4645 }, { "epoch": 2.01, "grad_norm": 8.074131689515943, "learning_rate": 1.4948547201135088e-05, "loss": 0.6535, "step": 4650 }, { "epoch": 2.01, "grad_norm": 6.596520551200253, "learning_rate": 1.4935438154202363e-05, "loss": 0.7058, "step": 4655 }, { "epoch": 2.01, "grad_norm": 1.6424859881344187, "learning_rate": 1.4922317885016584e-05, "loss": 0.643, "step": 4660 }, { "epoch": 2.02, "grad_norm": 4.977709318530804, "learning_rate": 1.490918642341076e-05, "loss": 0.731, "step": 4665 }, { "epoch": 2.02, "grad_norm": 2.795003746798207, "learning_rate": 1.4896043799243349e-05, "loss": 0.7434, "step": 4670 }, { "epoch": 2.02, "grad_norm": 2.106674558037682, "learning_rate": 1.4882890042398197e-05, "loss": 0.6729, "step": 4675 }, { "epoch": 2.02, "grad_norm": 3.4589623366999875, "learning_rate": 1.4869725182784458e-05, "loss": 0.8524, "step": 4680 }, { "epoch": 2.02, "grad_norm": 2.702534222166854, "learning_rate": 1.4856549250336537e-05, "loss": 0.6632, "step": 4685 }, { "epoch": 2.03, "grad_norm": 2.8843345731755354, "learning_rate": 1.4843362275014004e-05, "loss": 0.8314, "step": 4690 }, { "epoch": 2.03, "grad_norm": 2.2974327677286053, "learning_rate": 1.4830164286801552e-05, "loss": 0.6472, "step": 4695 }, { "epoch": 2.03, "grad_norm": 2.6153862975813094, "learning_rate": 1.4816955315708916e-05, "loss": 0.6175, "step": 4700 }, { "epoch": 2.03, "grad_norm": 1.812689881534979, "learning_rate": 1.4803735391770788e-05, "loss": 0.6033, "step": 4705 }, { "epoch": 2.03, "grad_norm": 8.657508696336997, "learning_rate": 1.4790504545046785e-05, "loss": 0.8608, "step": 4710 }, { "epoch": 2.04, "grad_norm": 10.64182267161339, "learning_rate": 1.4777262805621341e-05, "loss": 0.686, "step": 4715 }, { "epoch": 2.04, "grad_norm": 10.917597635768443, "learning_rate": 1.4764010203603673e-05, "loss": 0.6776, "step": 4720 }, { "epoch": 2.04, "grad_norm": 4.894550754609609, "learning_rate": 1.4750746769127695e-05, "loss": 0.6753, "step": 4725 }, { "epoch": 2.04, "grad_norm": 7.90131501061075, "learning_rate": 1.4737472532351941e-05, "loss": 0.6693, "step": 4730 }, { "epoch": 2.05, "grad_norm": 7.234315155347728, "learning_rate": 1.4724187523459524e-05, "loss": 0.8296, "step": 4735 }, { "epoch": 2.05, "grad_norm": 2.621839934063872, "learning_rate": 1.4710891772658034e-05, "loss": 0.8339, "step": 4740 }, { "epoch": 2.05, "grad_norm": 4.132144800302082, "learning_rate": 1.4697585310179495e-05, "loss": 0.7128, "step": 4745 }, { "epoch": 2.05, "grad_norm": 1.392978638244078, "learning_rate": 1.4684268166280293e-05, "loss": 0.5983, "step": 4750 }, { "epoch": 2.05, "grad_norm": 2.346021627010998, "learning_rate": 1.4670940371241086e-05, "loss": 0.6504, "step": 4755 }, { "epoch": 2.06, "grad_norm": 1.6569506181595035, "learning_rate": 1.4657601955366767e-05, "loss": 0.693, "step": 4760 }, { "epoch": 2.06, "grad_norm": 2.2965091368781776, "learning_rate": 1.4644252948986368e-05, "loss": 0.6942, "step": 4765 }, { "epoch": 2.06, "grad_norm": 1.6884518445524517, "learning_rate": 1.4630893382453004e-05, "loss": 0.7618, "step": 4770 }, { "epoch": 2.06, "grad_norm": 3.0291222685763852, "learning_rate": 1.4617523286143806e-05, "loss": 0.7276, "step": 4775 }, { "epoch": 2.06, "grad_norm": 2.752424933714392, "learning_rate": 1.4604142690459843e-05, "loss": 0.6047, "step": 4780 }, { "epoch": 2.07, "grad_norm": 2.913663764157102, "learning_rate": 1.459075162582606e-05, "loss": 0.7256, "step": 4785 }, { "epoch": 2.07, "grad_norm": 2.2389568862285567, "learning_rate": 1.4577350122691205e-05, "loss": 0.7098, "step": 4790 }, { "epoch": 2.07, "grad_norm": 3.8588853245465953, "learning_rate": 1.4563938211527761e-05, "loss": 0.8396, "step": 4795 }, { "epoch": 2.07, "grad_norm": 1.8326704667975264, "learning_rate": 1.4550515922831881e-05, "loss": 0.6518, "step": 4800 }, { "epoch": 2.08, "grad_norm": 2.01601811375239, "learning_rate": 1.4537083287123309e-05, "loss": 0.7681, "step": 4805 }, { "epoch": 2.08, "grad_norm": 3.4581655103666966, "learning_rate": 1.4523640334945317e-05, "loss": 0.6857, "step": 4810 }, { "epoch": 2.08, "grad_norm": 2.5903440925338317, "learning_rate": 1.4510187096864638e-05, "loss": 0.6613, "step": 4815 }, { "epoch": 2.08, "grad_norm": 1.8100171699807792, "learning_rate": 1.449672360347139e-05, "loss": 0.6798, "step": 4820 }, { "epoch": 2.08, "grad_norm": 2.1781013189873946, "learning_rate": 1.4483249885379014e-05, "loss": 0.6552, "step": 4825 }, { "epoch": 2.09, "grad_norm": 6.251914333658113, "learning_rate": 1.4469765973224194e-05, "loss": 0.7461, "step": 4830 }, { "epoch": 2.09, "grad_norm": 5.817878468854903, "learning_rate": 1.4456271897666798e-05, "loss": 0.6437, "step": 4835 }, { "epoch": 2.09, "grad_norm": 5.662860653064379, "learning_rate": 1.44427676893898e-05, "loss": 0.7011, "step": 4840 }, { "epoch": 2.09, "grad_norm": 4.510980150786451, "learning_rate": 1.442925337909922e-05, "loss": 0.5923, "step": 4845 }, { "epoch": 2.1, "grad_norm": 1.9814598174187572, "learning_rate": 1.441572899752404e-05, "loss": 0.7218, "step": 4850 }, { "epoch": 2.1, "grad_norm": 1.8871473249191737, "learning_rate": 1.4402194575416148e-05, "loss": 0.7841, "step": 4855 }, { "epoch": 2.1, "grad_norm": 2.729798404906885, "learning_rate": 1.438865014355026e-05, "loss": 0.5294, "step": 4860 }, { "epoch": 2.1, "grad_norm": 3.1101404087816444, "learning_rate": 1.4375095732723852e-05, "loss": 0.6459, "step": 4865 }, { "epoch": 2.1, "grad_norm": 2.5343679096514107, "learning_rate": 1.4361531373757092e-05, "loss": 0.5827, "step": 4870 }, { "epoch": 2.11, "grad_norm": 2.4084831856604536, "learning_rate": 1.434795709749277e-05, "loss": 0.6763, "step": 4875 }, { "epoch": 2.11, "grad_norm": 2.6059335460748896, "learning_rate": 1.4334372934796218e-05, "loss": 0.5522, "step": 4880 }, { "epoch": 2.11, "grad_norm": 4.261437272764679, "learning_rate": 1.4320778916555255e-05, "loss": 0.6038, "step": 4885 }, { "epoch": 2.11, "grad_norm": 1.2672251951057814, "learning_rate": 1.430717507368011e-05, "loss": 0.688, "step": 4890 }, { "epoch": 2.11, "grad_norm": 2.1511319225380094, "learning_rate": 1.4293561437103348e-05, "loss": 0.7069, "step": 4895 }, { "epoch": 2.12, "grad_norm": 4.212127220928629, "learning_rate": 1.4279938037779801e-05, "loss": 0.6505, "step": 4900 }, { "epoch": 2.12, "grad_norm": 2.1893645392849868, "learning_rate": 1.4266304906686508e-05, "loss": 0.5228, "step": 4905 }, { "epoch": 2.12, "grad_norm": 3.537460380045355, "learning_rate": 1.425266207482263e-05, "loss": 0.718, "step": 4910 }, { "epoch": 2.12, "grad_norm": 3.404220405341098, "learning_rate": 1.4239009573209381e-05, "loss": 0.7759, "step": 4915 }, { "epoch": 2.13, "grad_norm": 3.159580780643254, "learning_rate": 1.4225347432889978e-05, "loss": 0.6648, "step": 4920 }, { "epoch": 2.13, "grad_norm": 3.037253480076717, "learning_rate": 1.4211675684929533e-05, "loss": 0.6586, "step": 4925 }, { "epoch": 2.13, "grad_norm": 3.869361730397183, "learning_rate": 1.4197994360415023e-05, "loss": 0.6963, "step": 4930 }, { "epoch": 2.13, "grad_norm": 2.2272003374579468, "learning_rate": 1.4184303490455194e-05, "loss": 0.8277, "step": 4935 }, { "epoch": 2.13, "grad_norm": 3.034166829374501, "learning_rate": 1.4170603106180489e-05, "loss": 0.7326, "step": 4940 }, { "epoch": 2.14, "grad_norm": 3.0837813829439136, "learning_rate": 1.4156893238743e-05, "loss": 0.6207, "step": 4945 }, { "epoch": 2.14, "grad_norm": 4.059850226368533, "learning_rate": 1.414317391931636e-05, "loss": 0.7122, "step": 4950 }, { "epoch": 2.14, "grad_norm": 3.4181474527577977, "learning_rate": 1.4129445179095718e-05, "loss": 0.6949, "step": 4955 }, { "epoch": 2.14, "grad_norm": 2.331066437043077, "learning_rate": 1.4115707049297631e-05, "loss": 0.5983, "step": 4960 }, { "epoch": 2.14, "grad_norm": 3.0238435031005064, "learning_rate": 1.4101959561160003e-05, "loss": 0.6895, "step": 4965 }, { "epoch": 2.15, "grad_norm": 2.761852205828301, "learning_rate": 1.408820274594203e-05, "loss": 0.7393, "step": 4970 }, { "epoch": 2.15, "grad_norm": 2.544915824724106, "learning_rate": 1.4074436634924102e-05, "loss": 0.6316, "step": 4975 }, { "epoch": 2.15, "grad_norm": 1.3244544600527366, "learning_rate": 1.4060661259407758e-05, "loss": 0.5658, "step": 4980 }, { "epoch": 2.15, "grad_norm": 2.772971136679998, "learning_rate": 1.4046876650715591e-05, "loss": 0.6778, "step": 4985 }, { "epoch": 2.16, "grad_norm": 2.5012677940913757, "learning_rate": 1.4033082840191196e-05, "loss": 0.7587, "step": 4990 }, { "epoch": 2.16, "grad_norm": 3.64540541690908, "learning_rate": 1.4019279859199096e-05, "loss": 0.7228, "step": 4995 }, { "epoch": 2.16, "grad_norm": 8.366734997146574, "learning_rate": 1.400546773912465e-05, "loss": 0.7256, "step": 5000 }, { "epoch": 2.16, "grad_norm": 2.142213618690858, "learning_rate": 1.3991646511374011e-05, "loss": 0.7008, "step": 5005 }, { "epoch": 2.16, "grad_norm": 4.019248315380808, "learning_rate": 1.3977816207374039e-05, "loss": 0.7018, "step": 5010 }, { "epoch": 2.17, "grad_norm": 5.554098532914148, "learning_rate": 1.396397685857223e-05, "loss": 0.7446, "step": 5015 }, { "epoch": 2.17, "grad_norm": 3.359959733126847, "learning_rate": 1.3950128496436646e-05, "loss": 0.601, "step": 5020 }, { "epoch": 2.17, "grad_norm": 2.569653753156473, "learning_rate": 1.393627115245584e-05, "loss": 0.6059, "step": 5025 }, { "epoch": 2.17, "grad_norm": 2.9822193273204873, "learning_rate": 1.3922404858138794e-05, "loss": 0.6179, "step": 5030 }, { "epoch": 2.17, "grad_norm": 2.0489495833913782, "learning_rate": 1.390852964501484e-05, "loss": 0.6472, "step": 5035 }, { "epoch": 2.18, "grad_norm": 1.4261825333330067, "learning_rate": 1.389464554463359e-05, "loss": 0.5776, "step": 5040 }, { "epoch": 2.18, "grad_norm": 2.400830227229986, "learning_rate": 1.3880752588564861e-05, "loss": 0.7632, "step": 5045 }, { "epoch": 2.18, "grad_norm": 2.9766721821975106, "learning_rate": 1.3866850808398607e-05, "loss": 0.6854, "step": 5050 }, { "epoch": 2.18, "grad_norm": 5.204975829961181, "learning_rate": 1.3852940235744848e-05, "loss": 0.6408, "step": 5055 }, { "epoch": 2.19, "grad_norm": 12.80490051664339, "learning_rate": 1.3839020902233595e-05, "loss": 0.6775, "step": 5060 }, { "epoch": 2.19, "grad_norm": 2.8672289829708397, "learning_rate": 1.3825092839514782e-05, "loss": 0.6438, "step": 5065 }, { "epoch": 2.19, "grad_norm": 4.051389466862681, "learning_rate": 1.381115607925819e-05, "loss": 0.6045, "step": 5070 }, { "epoch": 2.19, "grad_norm": 6.952904080521865, "learning_rate": 1.3797210653153372e-05, "loss": 0.7735, "step": 5075 }, { "epoch": 2.19, "grad_norm": 2.94230699295151, "learning_rate": 1.3783256592909596e-05, "loss": 0.6059, "step": 5080 }, { "epoch": 2.2, "grad_norm": 2.9411457261542724, "learning_rate": 1.3769293930255752e-05, "loss": 0.6946, "step": 5085 }, { "epoch": 2.2, "grad_norm": 3.4557539077990724, "learning_rate": 1.3755322696940293e-05, "loss": 0.5876, "step": 5090 }, { "epoch": 2.2, "grad_norm": 1.856220373216668, "learning_rate": 1.3741342924731171e-05, "loss": 0.8175, "step": 5095 }, { "epoch": 2.2, "grad_norm": 5.806548304083525, "learning_rate": 1.3727354645415734e-05, "loss": 0.6969, "step": 5100 }, { "epoch": 2.21, "grad_norm": 1.9514309269566108, "learning_rate": 1.3713357890800685e-05, "loss": 0.585, "step": 5105 }, { "epoch": 2.21, "grad_norm": 1.495350677319604, "learning_rate": 1.3699352692712004e-05, "loss": 0.5571, "step": 5110 }, { "epoch": 2.21, "grad_norm": 2.6614436176053577, "learning_rate": 1.3685339082994856e-05, "loss": 0.6419, "step": 5115 }, { "epoch": 2.21, "grad_norm": 2.707576709000901, "learning_rate": 1.3671317093513546e-05, "loss": 0.7347, "step": 5120 }, { "epoch": 2.21, "grad_norm": 2.2162068090827214, "learning_rate": 1.3657286756151418e-05, "loss": 0.6805, "step": 5125 }, { "epoch": 2.22, "grad_norm": 2.995075044694853, "learning_rate": 1.3643248102810812e-05, "loss": 0.6363, "step": 5130 }, { "epoch": 2.22, "grad_norm": 2.685871623205705, "learning_rate": 1.362920116541297e-05, "loss": 0.6295, "step": 5135 }, { "epoch": 2.22, "grad_norm": 3.174472117863071, "learning_rate": 1.3615145975897973e-05, "loss": 0.7393, "step": 5140 }, { "epoch": 2.22, "grad_norm": 2.0902726071931075, "learning_rate": 1.360108256622466e-05, "loss": 0.6473, "step": 5145 }, { "epoch": 2.22, "grad_norm": 1.9372938517889469, "learning_rate": 1.3587010968370569e-05, "loss": 0.6455, "step": 5150 }, { "epoch": 2.23, "grad_norm": 2.3024147029017534, "learning_rate": 1.3572931214331848e-05, "loss": 0.6314, "step": 5155 }, { "epoch": 2.23, "grad_norm": 4.320602900604779, "learning_rate": 1.3558843336123201e-05, "loss": 0.7002, "step": 5160 }, { "epoch": 2.23, "grad_norm": 1.6210451039283706, "learning_rate": 1.3544747365777795e-05, "loss": 0.7742, "step": 5165 }, { "epoch": 2.23, "grad_norm": 1.3075788822230048, "learning_rate": 1.3530643335347203e-05, "loss": 0.5962, "step": 5170 }, { "epoch": 2.24, "grad_norm": 1.7855687906302533, "learning_rate": 1.351653127690132e-05, "loss": 0.5874, "step": 5175 }, { "epoch": 2.24, "grad_norm": 2.3951063501074583, "learning_rate": 1.35024112225283e-05, "loss": 0.5974, "step": 5180 }, { "epoch": 2.24, "grad_norm": 2.1622174835372188, "learning_rate": 1.3488283204334479e-05, "loss": 0.6406, "step": 5185 }, { "epoch": 2.24, "grad_norm": 2.132938791139148, "learning_rate": 1.3474147254444293e-05, "loss": 0.6773, "step": 5190 }, { "epoch": 2.24, "grad_norm": 4.669387578737708, "learning_rate": 1.3460003405000226e-05, "loss": 0.58, "step": 5195 }, { "epoch": 2.25, "grad_norm": 4.131229331144581, "learning_rate": 1.3445851688162713e-05, "loss": 0.5259, "step": 5200 }, { "epoch": 2.25, "grad_norm": 2.101898168571244, "learning_rate": 1.3431692136110079e-05, "loss": 0.5822, "step": 5205 }, { "epoch": 2.25, "grad_norm": 2.9631752802509537, "learning_rate": 1.3417524781038473e-05, "loss": 0.7066, "step": 5210 }, { "epoch": 2.25, "grad_norm": 2.8623277308886537, "learning_rate": 1.3403349655161782e-05, "loss": 0.6715, "step": 5215 }, { "epoch": 2.25, "grad_norm": 3.3350327819908356, "learning_rate": 1.3389166790711556e-05, "loss": 0.7183, "step": 5220 }, { "epoch": 2.26, "grad_norm": 3.295562949758625, "learning_rate": 1.3374976219936952e-05, "loss": 0.7453, "step": 5225 }, { "epoch": 2.26, "grad_norm": 1.5495019986962577, "learning_rate": 1.336077797510464e-05, "loss": 0.6477, "step": 5230 }, { "epoch": 2.26, "grad_norm": 3.244632762508368, "learning_rate": 1.3346572088498746e-05, "loss": 0.7053, "step": 5235 }, { "epoch": 2.26, "grad_norm": 10.458889203896769, "learning_rate": 1.333235859242077e-05, "loss": 0.6152, "step": 5240 }, { "epoch": 2.27, "grad_norm": 2.817571457186845, "learning_rate": 1.3318137519189518e-05, "loss": 0.5173, "step": 5245 }, { "epoch": 2.27, "grad_norm": 2.2804314138434525, "learning_rate": 1.3303908901141016e-05, "loss": 0.6432, "step": 5250 }, { "epoch": 2.27, "grad_norm": 2.6539794325977764, "learning_rate": 1.3289672770628449e-05, "loss": 0.727, "step": 5255 }, { "epoch": 2.27, "grad_norm": 3.0261414160956615, "learning_rate": 1.327542916002209e-05, "loss": 0.64, "step": 5260 }, { "epoch": 2.27, "grad_norm": 2.6157623856222334, "learning_rate": 1.3261178101709215e-05, "loss": 0.4689, "step": 5265 }, { "epoch": 2.28, "grad_norm": 2.2558817062570116, "learning_rate": 1.3246919628094037e-05, "loss": 0.7602, "step": 5270 }, { "epoch": 2.28, "grad_norm": 3.70735926592346, "learning_rate": 1.3232653771597627e-05, "loss": 0.6818, "step": 5275 }, { "epoch": 2.28, "grad_norm": 3.63267852687161, "learning_rate": 1.3218380564657842e-05, "loss": 0.7698, "step": 5280 }, { "epoch": 2.28, "grad_norm": 3.5578281169494503, "learning_rate": 1.3204100039729263e-05, "loss": 0.6531, "step": 5285 }, { "epoch": 2.29, "grad_norm": 1.5663803805551082, "learning_rate": 1.3189812229283096e-05, "loss": 0.6545, "step": 5290 }, { "epoch": 2.29, "grad_norm": 3.6169189509060873, "learning_rate": 1.3175517165807126e-05, "loss": 0.671, "step": 5295 }, { "epoch": 2.29, "grad_norm": 3.164169667424972, "learning_rate": 1.3161214881805622e-05, "loss": 0.6619, "step": 5300 }, { "epoch": 2.29, "grad_norm": 2.7729255955211984, "learning_rate": 1.3146905409799266e-05, "loss": 0.6725, "step": 5305 }, { "epoch": 2.29, "grad_norm": 2.0512913530098693, "learning_rate": 1.3132588782325103e-05, "loss": 0.658, "step": 5310 }, { "epoch": 2.3, "grad_norm": 1.8793852520626917, "learning_rate": 1.3118265031936427e-05, "loss": 0.6442, "step": 5315 }, { "epoch": 2.3, "grad_norm": 4.26316521105751, "learning_rate": 1.3103934191202746e-05, "loss": 0.651, "step": 5320 }, { "epoch": 2.3, "grad_norm": 2.02079446252766, "learning_rate": 1.3089596292709677e-05, "loss": 0.6715, "step": 5325 }, { "epoch": 2.3, "grad_norm": 1.3143490394654598, "learning_rate": 1.307525136905889e-05, "loss": 0.5558, "step": 5330 }, { "epoch": 2.3, "grad_norm": 3.9706238434807273, "learning_rate": 1.3060899452868028e-05, "loss": 0.6195, "step": 5335 }, { "epoch": 2.31, "grad_norm": 4.653747012803783, "learning_rate": 1.3046540576770635e-05, "loss": 0.7183, "step": 5340 }, { "epoch": 2.31, "grad_norm": 6.238182045888077, "learning_rate": 1.3032174773416085e-05, "loss": 0.6659, "step": 5345 }, { "epoch": 2.31, "grad_norm": 4.559310932480753, "learning_rate": 1.3017802075469492e-05, "loss": 0.6809, "step": 5350 }, { "epoch": 2.31, "grad_norm": 1.467398670493438, "learning_rate": 1.3003422515611654e-05, "loss": 0.6352, "step": 5355 }, { "epoch": 2.32, "grad_norm": 4.642235893269442, "learning_rate": 1.2989036126538973e-05, "loss": 0.5664, "step": 5360 }, { "epoch": 2.32, "grad_norm": 3.938217402793309, "learning_rate": 1.2974642940963373e-05, "loss": 0.6056, "step": 5365 }, { "epoch": 2.32, "grad_norm": 6.885204245370707, "learning_rate": 1.2960242991612244e-05, "loss": 0.6308, "step": 5370 }, { "epoch": 2.32, "grad_norm": 4.5212483786095765, "learning_rate": 1.2945836311228342e-05, "loss": 0.5929, "step": 5375 }, { "epoch": 2.32, "grad_norm": 4.641571097379382, "learning_rate": 1.2931422932569732e-05, "loss": 0.5578, "step": 5380 }, { "epoch": 2.33, "grad_norm": 2.8833078407731842, "learning_rate": 1.2917002888409718e-05, "loss": 0.5554, "step": 5385 }, { "epoch": 2.33, "grad_norm": 1.173901145308014, "learning_rate": 1.2902576211536748e-05, "loss": 0.5722, "step": 5390 }, { "epoch": 2.33, "grad_norm": 0.9358893996361893, "learning_rate": 1.288814293475436e-05, "loss": 0.7776, "step": 5395 }, { "epoch": 2.33, "grad_norm": 1.4558280571808577, "learning_rate": 1.2873703090881096e-05, "loss": 0.4994, "step": 5400 }, { "epoch": 2.33, "grad_norm": 4.079557487260838, "learning_rate": 1.2859256712750428e-05, "loss": 0.6432, "step": 5405 }, { "epoch": 2.34, "grad_norm": 2.551436632509885, "learning_rate": 1.284480383321069e-05, "loss": 0.539, "step": 5410 }, { "epoch": 2.34, "grad_norm": 4.073997211595631, "learning_rate": 1.2830344485124995e-05, "loss": 0.6014, "step": 5415 }, { "epoch": 2.34, "grad_norm": 1.547482223872806, "learning_rate": 1.2815878701371172e-05, "loss": 0.6883, "step": 5420 }, { "epoch": 2.34, "grad_norm": 7.302871222039995, "learning_rate": 1.280140651484167e-05, "loss": 0.6608, "step": 5425 }, { "epoch": 2.35, "grad_norm": 4.71763130175225, "learning_rate": 1.278692795844351e-05, "loss": 0.6053, "step": 5430 }, { "epoch": 2.35, "grad_norm": 2.376185014669388, "learning_rate": 1.2772443065098186e-05, "loss": 0.5509, "step": 5435 }, { "epoch": 2.35, "grad_norm": 2.8825193085694707, "learning_rate": 1.275795186774161e-05, "loss": 0.8276, "step": 5440 }, { "epoch": 2.35, "grad_norm": 3.2274170276467036, "learning_rate": 1.2743454399324027e-05, "loss": 0.6181, "step": 5445 }, { "epoch": 2.35, "grad_norm": 2.543424038374083, "learning_rate": 1.2728950692809936e-05, "loss": 0.5948, "step": 5450 }, { "epoch": 2.36, "grad_norm": 3.9232792043148637, "learning_rate": 1.2714440781178019e-05, "loss": 0.6813, "step": 5455 }, { "epoch": 2.36, "grad_norm": 4.311356998187637, "learning_rate": 1.2699924697421075e-05, "loss": 0.599, "step": 5460 }, { "epoch": 2.36, "grad_norm": 2.394891057498207, "learning_rate": 1.2685402474545934e-05, "loss": 0.5836, "step": 5465 }, { "epoch": 2.36, "grad_norm": 4.232798939575906, "learning_rate": 1.2670874145573384e-05, "loss": 0.7191, "step": 5470 }, { "epoch": 2.37, "grad_norm": 2.546588455428622, "learning_rate": 1.2656339743538099e-05, "loss": 0.6068, "step": 5475 }, { "epoch": 2.37, "grad_norm": 2.397110488213129, "learning_rate": 1.2641799301488557e-05, "loss": 0.6745, "step": 5480 }, { "epoch": 2.37, "grad_norm": 2.3290923862939388, "learning_rate": 1.2627252852486978e-05, "loss": 0.5364, "step": 5485 }, { "epoch": 2.37, "grad_norm": 2.320282415146021, "learning_rate": 1.2612700429609231e-05, "loss": 0.5121, "step": 5490 }, { "epoch": 2.37, "grad_norm": 2.865501228371246, "learning_rate": 1.259814206594478e-05, "loss": 0.656, "step": 5495 }, { "epoch": 2.38, "grad_norm": 2.929289607937329, "learning_rate": 1.2583577794596588e-05, "loss": 0.6746, "step": 5500 }, { "epoch": 2.38, "grad_norm": 1.0123923944249917, "learning_rate": 1.256900764868105e-05, "loss": 0.7035, "step": 5505 }, { "epoch": 2.38, "grad_norm": 2.5671713727059498, "learning_rate": 1.2554431661327929e-05, "loss": 0.5534, "step": 5510 }, { "epoch": 2.38, "grad_norm": 2.713465275447893, "learning_rate": 1.253984986568026e-05, "loss": 0.6707, "step": 5515 }, { "epoch": 2.38, "grad_norm": 3.2388981875859666, "learning_rate": 1.2525262294894291e-05, "loss": 0.5429, "step": 5520 }, { "epoch": 2.39, "grad_norm": 4.429034301281724, "learning_rate": 1.25106689821394e-05, "loss": 0.634, "step": 5525 }, { "epoch": 2.39, "grad_norm": 3.527254640727882, "learning_rate": 1.2496069960598015e-05, "loss": 0.5531, "step": 5530 }, { "epoch": 2.39, "grad_norm": 3.9778021787359807, "learning_rate": 1.2481465263465556e-05, "loss": 0.5832, "step": 5535 }, { "epoch": 2.39, "grad_norm": 5.7570845677942994, "learning_rate": 1.2466854923950337e-05, "loss": 0.6605, "step": 5540 }, { "epoch": 2.4, "grad_norm": 2.5136660861714533, "learning_rate": 1.2452238975273517e-05, "loss": 0.6726, "step": 5545 }, { "epoch": 2.4, "grad_norm": 4.778955739958259, "learning_rate": 1.2437617450668991e-05, "loss": 0.6528, "step": 5550 }, { "epoch": 2.4, "grad_norm": 2.822118632646264, "learning_rate": 1.2422990383383342e-05, "loss": 0.7309, "step": 5555 }, { "epoch": 2.4, "grad_norm": 3.101702662145458, "learning_rate": 1.2408357806675755e-05, "loss": 0.6144, "step": 5560 }, { "epoch": 2.4, "grad_norm": 2.7283728602937067, "learning_rate": 1.2393719753817945e-05, "loss": 0.6402, "step": 5565 }, { "epoch": 2.41, "grad_norm": 3.941793534144611, "learning_rate": 1.2379076258094076e-05, "loss": 0.5565, "step": 5570 }, { "epoch": 2.41, "grad_norm": 2.0507521443057612, "learning_rate": 1.2364427352800688e-05, "loss": 0.5296, "step": 5575 }, { "epoch": 2.41, "grad_norm": 2.2791441959795167, "learning_rate": 1.2349773071246615e-05, "loss": 0.6071, "step": 5580 }, { "epoch": 2.41, "grad_norm": 1.4181544900587406, "learning_rate": 1.233511344675293e-05, "loss": 0.5631, "step": 5585 }, { "epoch": 2.41, "grad_norm": 3.0642106256181467, "learning_rate": 1.2320448512652845e-05, "loss": 0.5379, "step": 5590 }, { "epoch": 2.42, "grad_norm": 3.0825089074185787, "learning_rate": 1.2305778302291647e-05, "loss": 0.6778, "step": 5595 }, { "epoch": 2.42, "grad_norm": 2.3649728516693553, "learning_rate": 1.229110284902662e-05, "loss": 0.7069, "step": 5600 }, { "epoch": 2.42, "grad_norm": 4.418260582924241, "learning_rate": 1.2276422186226965e-05, "loss": 0.6248, "step": 5605 }, { "epoch": 2.42, "grad_norm": 3.393194032854879, "learning_rate": 1.2261736347273739e-05, "loss": 0.5673, "step": 5610 }, { "epoch": 2.43, "grad_norm": 3.256965708075714, "learning_rate": 1.2247045365559758e-05, "loss": 0.6426, "step": 5615 }, { "epoch": 2.43, "grad_norm": 1.9498668554848662, "learning_rate": 1.2232349274489543e-05, "loss": 0.6806, "step": 5620 }, { "epoch": 2.43, "grad_norm": 3.8443153243697936, "learning_rate": 1.2217648107479222e-05, "loss": 0.6157, "step": 5625 }, { "epoch": 2.43, "grad_norm": 2.4099977548359326, "learning_rate": 1.2202941897956468e-05, "loss": 0.5046, "step": 5630 }, { "epoch": 2.43, "grad_norm": 3.164543987009733, "learning_rate": 1.2188230679360417e-05, "loss": 0.6559, "step": 5635 }, { "epoch": 2.44, "grad_norm": 1.3937539752455785, "learning_rate": 1.2173514485141606e-05, "loss": 0.524, "step": 5640 }, { "epoch": 2.44, "grad_norm": 1.0846844450463282, "learning_rate": 1.2158793348761878e-05, "loss": 0.494, "step": 5645 }, { "epoch": 2.44, "grad_norm": 2.2885642706729197, "learning_rate": 1.2144067303694304e-05, "loss": 0.5825, "step": 5650 }, { "epoch": 2.44, "grad_norm": 1.3565551486811531, "learning_rate": 1.212933638342313e-05, "loss": 0.6171, "step": 5655 }, { "epoch": 2.44, "grad_norm": 1.328702556248931, "learning_rate": 1.211460062144369e-05, "loss": 0.5537, "step": 5660 }, { "epoch": 2.45, "grad_norm": 2.2535405004738798, "learning_rate": 1.2099860051262308e-05, "loss": 0.5915, "step": 5665 }, { "epoch": 2.45, "grad_norm": 2.2859547603183534, "learning_rate": 1.2085114706396264e-05, "loss": 0.5191, "step": 5670 }, { "epoch": 2.45, "grad_norm": 2.386753821740332, "learning_rate": 1.207036462037368e-05, "loss": 0.5119, "step": 5675 }, { "epoch": 2.45, "grad_norm": 0.8861234521427775, "learning_rate": 1.2055609826733459e-05, "loss": 0.5889, "step": 5680 }, { "epoch": 2.46, "grad_norm": 5.447653168410484, "learning_rate": 1.2040850359025216e-05, "loss": 0.656, "step": 5685 }, { "epoch": 2.46, "grad_norm": 2.874911271848961, "learning_rate": 1.2026086250809181e-05, "loss": 0.6001, "step": 5690 }, { "epoch": 2.46, "grad_norm": 3.116810605679269, "learning_rate": 1.201131753565616e-05, "loss": 0.6166, "step": 5695 }, { "epoch": 2.46, "grad_norm": 2.638413391139646, "learning_rate": 1.1996544247147405e-05, "loss": 0.5621, "step": 5700 }, { "epoch": 2.46, "grad_norm": 1.6687401902988719, "learning_rate": 1.1981766418874585e-05, "loss": 0.6026, "step": 5705 }, { "epoch": 2.47, "grad_norm": 2.3214904478487557, "learning_rate": 1.1966984084439688e-05, "loss": 0.6314, "step": 5710 }, { "epoch": 2.47, "grad_norm": 1.9982906558239586, "learning_rate": 1.1952197277454943e-05, "loss": 0.5246, "step": 5715 }, { "epoch": 2.47, "grad_norm": 1.2589374703878966, "learning_rate": 1.1937406031542758e-05, "loss": 0.4738, "step": 5720 }, { "epoch": 2.47, "grad_norm": 1.6703887329525249, "learning_rate": 1.1922610380335626e-05, "loss": 0.6611, "step": 5725 }, { "epoch": 2.48, "grad_norm": 1.9141396640048154, "learning_rate": 1.190781035747606e-05, "loss": 0.4246, "step": 5730 }, { "epoch": 2.48, "grad_norm": 1.7078230496055034, "learning_rate": 1.1893005996616516e-05, "loss": 0.637, "step": 5735 }, { "epoch": 2.48, "grad_norm": 1.8474307055974233, "learning_rate": 1.1878197331419306e-05, "loss": 0.6485, "step": 5740 }, { "epoch": 2.48, "grad_norm": 2.153158902463925, "learning_rate": 1.1863384395556538e-05, "loss": 0.5999, "step": 5745 }, { "epoch": 2.48, "grad_norm": 1.803005771677984, "learning_rate": 1.1848567222710027e-05, "loss": 0.5679, "step": 5750 }, { "epoch": 2.49, "grad_norm": 3.335477853835864, "learning_rate": 1.1833745846571222e-05, "loss": 0.6034, "step": 5755 }, { "epoch": 2.49, "grad_norm": 1.878366420821759, "learning_rate": 1.181892030084113e-05, "loss": 0.5744, "step": 5760 }, { "epoch": 2.49, "grad_norm": 2.0165345116465745, "learning_rate": 1.1804090619230235e-05, "loss": 0.4466, "step": 5765 }, { "epoch": 2.49, "grad_norm": 3.5504172356816466, "learning_rate": 1.1789256835458429e-05, "loss": 0.608, "step": 5770 }, { "epoch": 2.49, "grad_norm": 2.245792656102741, "learning_rate": 1.1774418983254939e-05, "loss": 0.5836, "step": 5775 }, { "epoch": 2.5, "grad_norm": 2.5643770754859148, "learning_rate": 1.1759577096358223e-05, "loss": 0.5998, "step": 5780 }, { "epoch": 2.5, "grad_norm": 3.401996030820411, "learning_rate": 1.1744731208515936e-05, "loss": 0.5699, "step": 5785 }, { "epoch": 2.5, "grad_norm": 2.1812346079873346, "learning_rate": 1.1729881353484813e-05, "loss": 0.6498, "step": 5790 }, { "epoch": 2.5, "grad_norm": 2.899854876371358, "learning_rate": 1.1715027565030618e-05, "loss": 0.5407, "step": 5795 }, { "epoch": 2.51, "grad_norm": 5.374546420936496, "learning_rate": 1.1700169876928057e-05, "loss": 0.7177, "step": 5800 }, { "epoch": 2.51, "grad_norm": 2.4212010959195664, "learning_rate": 1.1685308322960705e-05, "loss": 0.5105, "step": 5805 }, { "epoch": 2.51, "grad_norm": 0.9555119992317347, "learning_rate": 1.1670442936920923e-05, "loss": 0.5637, "step": 5810 }, { "epoch": 2.51, "grad_norm": 2.7380695984695365, "learning_rate": 1.1655573752609789e-05, "loss": 0.5801, "step": 5815 }, { "epoch": 2.51, "grad_norm": 2.5075837508881804, "learning_rate": 1.164070080383701e-05, "loss": 0.533, "step": 5820 }, { "epoch": 2.52, "grad_norm": 5.6592126518142765, "learning_rate": 1.1625824124420873e-05, "loss": 0.6799, "step": 5825 }, { "epoch": 2.52, "grad_norm": 2.6552120212255543, "learning_rate": 1.161094374818812e-05, "loss": 0.5297, "step": 5830 }, { "epoch": 2.52, "grad_norm": 2.2459110733361736, "learning_rate": 1.159605970897392e-05, "loss": 0.6522, "step": 5835 }, { "epoch": 2.52, "grad_norm": 2.446676539882337, "learning_rate": 1.1581172040621759e-05, "loss": 0.5082, "step": 5840 }, { "epoch": 2.52, "grad_norm": 2.9109242525611707, "learning_rate": 1.1566280776983385e-05, "loss": 0.5508, "step": 5845 }, { "epoch": 2.53, "grad_norm": 1.65087084427715, "learning_rate": 1.1551385951918714e-05, "loss": 0.5428, "step": 5850 }, { "epoch": 2.53, "grad_norm": 2.6075258277791344, "learning_rate": 1.1536487599295758e-05, "loss": 0.5509, "step": 5855 }, { "epoch": 2.53, "grad_norm": 3.7449485712262245, "learning_rate": 1.1521585752990556e-05, "loss": 0.5438, "step": 5860 }, { "epoch": 2.53, "grad_norm": 2.412438766415541, "learning_rate": 1.1506680446887088e-05, "loss": 0.5831, "step": 5865 }, { "epoch": 2.54, "grad_norm": 7.446052091029832, "learning_rate": 1.14917717148772e-05, "loss": 0.6561, "step": 5870 }, { "epoch": 2.54, "grad_norm": 4.023471716380351, "learning_rate": 1.1476859590860534e-05, "loss": 0.5676, "step": 5875 }, { "epoch": 2.54, "grad_norm": 1.1134623043453562, "learning_rate": 1.1461944108744436e-05, "loss": 0.6136, "step": 5880 }, { "epoch": 2.54, "grad_norm": 2.956812319516977, "learning_rate": 1.1447025302443896e-05, "loss": 0.5827, "step": 5885 }, { "epoch": 2.54, "grad_norm": 1.9327663678979348, "learning_rate": 1.1432103205881451e-05, "loss": 0.5183, "step": 5890 }, { "epoch": 2.55, "grad_norm": 1.8084939065396937, "learning_rate": 1.1417177852987132e-05, "loss": 0.4991, "step": 5895 }, { "epoch": 2.55, "grad_norm": 1.9430549423028185, "learning_rate": 1.1402249277698367e-05, "loss": 0.4841, "step": 5900 }, { "epoch": 2.55, "grad_norm": 2.8708341115461233, "learning_rate": 1.1387317513959914e-05, "loss": 0.6687, "step": 5905 }, { "epoch": 2.55, "grad_norm": 2.529911695150623, "learning_rate": 1.1372382595723783e-05, "loss": 0.5151, "step": 5910 }, { "epoch": 2.56, "grad_norm": 2.4366426081266983, "learning_rate": 1.1357444556949151e-05, "loss": 0.5137, "step": 5915 }, { "epoch": 2.56, "grad_norm": 3.2681063789494766, "learning_rate": 1.134250343160229e-05, "loss": 0.5975, "step": 5920 }, { "epoch": 2.56, "grad_norm": 4.32348306671456, "learning_rate": 1.1327559253656502e-05, "loss": 0.561, "step": 5925 }, { "epoch": 2.56, "grad_norm": 3.027049309126319, "learning_rate": 1.1312612057092011e-05, "loss": 0.5233, "step": 5930 }, { "epoch": 2.56, "grad_norm": 3.013990620653381, "learning_rate": 1.1297661875895927e-05, "loss": 0.5997, "step": 5935 }, { "epoch": 2.57, "grad_norm": 3.0287001852233413, "learning_rate": 1.1282708744062125e-05, "loss": 0.509, "step": 5940 }, { "epoch": 2.57, "grad_norm": 1.79953582506735, "learning_rate": 1.1267752695591203e-05, "loss": 0.622, "step": 5945 }, { "epoch": 2.57, "grad_norm": 2.5864541268100023, "learning_rate": 1.125279376449039e-05, "loss": 0.4962, "step": 5950 }, { "epoch": 2.57, "grad_norm": 2.8771122683468864, "learning_rate": 1.1237831984773462e-05, "loss": 0.5531, "step": 5955 }, { "epoch": 2.57, "grad_norm": 2.0367422844037546, "learning_rate": 1.1222867390460681e-05, "loss": 0.5465, "step": 5960 }, { "epoch": 2.58, "grad_norm": 1.267993602452498, "learning_rate": 1.12079000155787e-05, "loss": 0.4367, "step": 5965 }, { "epoch": 2.58, "grad_norm": 2.135027107867463, "learning_rate": 1.11929298941605e-05, "loss": 0.4705, "step": 5970 }, { "epoch": 2.58, "grad_norm": 1.7457748728023093, "learning_rate": 1.1177957060245311e-05, "loss": 0.5437, "step": 5975 }, { "epoch": 2.58, "grad_norm": 1.912347529458289, "learning_rate": 1.1162981547878518e-05, "loss": 0.5349, "step": 5980 }, { "epoch": 2.59, "grad_norm": 4.50751108797099, "learning_rate": 1.1148003391111611e-05, "loss": 0.6474, "step": 5985 }, { "epoch": 2.59, "grad_norm": 1.5938058439774518, "learning_rate": 1.1133022624002083e-05, "loss": 0.6542, "step": 5990 }, { "epoch": 2.59, "grad_norm": 4.147373275718441, "learning_rate": 1.1118039280613365e-05, "loss": 0.5725, "step": 5995 }, { "epoch": 2.59, "grad_norm": 2.6356352190755605, "learning_rate": 1.1103053395014749e-05, "loss": 0.553, "step": 6000 }, { "epoch": 2.59, "grad_norm": 2.498630347261751, "learning_rate": 1.1088065001281302e-05, "loss": 0.5169, "step": 6005 }, { "epoch": 2.6, "grad_norm": 6.7469795499587555, "learning_rate": 1.1073074133493802e-05, "loss": 0.6078, "step": 6010 }, { "epoch": 2.6, "grad_norm": 5.1539900000529775, "learning_rate": 1.1058080825738643e-05, "loss": 0.5672, "step": 6015 }, { "epoch": 2.6, "grad_norm": 2.8156989728257864, "learning_rate": 1.104308511210777e-05, "loss": 0.4806, "step": 6020 }, { "epoch": 2.6, "grad_norm": 3.3213185297521712, "learning_rate": 1.1028087026698604e-05, "loss": 0.4861, "step": 6025 }, { "epoch": 2.6, "grad_norm": 3.5440873009748146, "learning_rate": 1.1013086603613953e-05, "loss": 0.4882, "step": 6030 }, { "epoch": 2.61, "grad_norm": 2.6503633600511955, "learning_rate": 1.0998083876961948e-05, "loss": 0.5048, "step": 6035 }, { "epoch": 2.61, "grad_norm": 1.159740466993807, "learning_rate": 1.0983078880855942e-05, "loss": 0.4544, "step": 6040 }, { "epoch": 2.61, "grad_norm": 3.28865816392566, "learning_rate": 1.0968071649414465e-05, "loss": 0.509, "step": 6045 }, { "epoch": 2.61, "grad_norm": 2.0646591256328293, "learning_rate": 1.0953062216761123e-05, "loss": 0.5019, "step": 6050 }, { "epoch": 2.62, "grad_norm": 2.352642135677423, "learning_rate": 1.0938050617024528e-05, "loss": 0.5987, "step": 6055 }, { "epoch": 2.62, "grad_norm": 2.457968428431282, "learning_rate": 1.0923036884338218e-05, "loss": 0.5434, "step": 6060 }, { "epoch": 2.62, "grad_norm": 4.850215654720314, "learning_rate": 1.0908021052840582e-05, "loss": 0.5991, "step": 6065 }, { "epoch": 2.62, "grad_norm": 1.6414624184793067, "learning_rate": 1.089300315667478e-05, "loss": 0.467, "step": 6070 }, { "epoch": 2.62, "grad_norm": 1.8565846208125867, "learning_rate": 1.0877983229988671e-05, "loss": 0.4879, "step": 6075 }, { "epoch": 2.63, "grad_norm": 2.57669455439607, "learning_rate": 1.0862961306934727e-05, "loss": 0.5034, "step": 6080 }, { "epoch": 2.63, "grad_norm": 2.1781249624833516, "learning_rate": 1.0847937421669958e-05, "loss": 0.5489, "step": 6085 }, { "epoch": 2.63, "grad_norm": 3.2642079291697113, "learning_rate": 1.083291160835584e-05, "loss": 0.5428, "step": 6090 }, { "epoch": 2.63, "grad_norm": 1.520362141386093, "learning_rate": 1.0817883901158233e-05, "loss": 0.527, "step": 6095 }, { "epoch": 2.63, "grad_norm": 1.7772580788569998, "learning_rate": 1.0802854334247297e-05, "loss": 0.5083, "step": 6100 }, { "epoch": 2.64, "grad_norm": 1.996720547533441, "learning_rate": 1.0787822941797429e-05, "loss": 0.5448, "step": 6105 }, { "epoch": 2.64, "grad_norm": 1.523628857374489, "learning_rate": 1.0772789757987168e-05, "loss": 0.5466, "step": 6110 }, { "epoch": 2.64, "grad_norm": 1.242041010323239, "learning_rate": 1.0757754816999136e-05, "loss": 0.5819, "step": 6115 }, { "epoch": 2.64, "grad_norm": 2.1655405402416563, "learning_rate": 1.0742718153019945e-05, "loss": 0.453, "step": 6120 }, { "epoch": 2.65, "grad_norm": 2.6841517683072853, "learning_rate": 1.0727679800240123e-05, "loss": 0.434, "step": 6125 }, { "epoch": 2.65, "grad_norm": 2.7477547553589803, "learning_rate": 1.071263979285404e-05, "loss": 0.5315, "step": 6130 }, { "epoch": 2.65, "grad_norm": 2.485348126111469, "learning_rate": 1.0697598165059833e-05, "loss": 0.5192, "step": 6135 }, { "epoch": 2.65, "grad_norm": 3.066169512052378, "learning_rate": 1.0682554951059316e-05, "loss": 0.4982, "step": 6140 }, { "epoch": 2.65, "grad_norm": 1.4248750822788414, "learning_rate": 1.0667510185057915e-05, "loss": 0.5432, "step": 6145 }, { "epoch": 2.66, "grad_norm": 2.0714398287890092, "learning_rate": 1.0652463901264578e-05, "loss": 0.6539, "step": 6150 }, { "epoch": 2.66, "grad_norm": 5.174378433465227, "learning_rate": 1.0637416133891714e-05, "loss": 0.6034, "step": 6155 }, { "epoch": 2.66, "grad_norm": 3.170144848595313, "learning_rate": 1.06223669171551e-05, "loss": 0.4693, "step": 6160 }, { "epoch": 2.66, "grad_norm": 2.301582029494036, "learning_rate": 1.0607316285273808e-05, "loss": 0.5155, "step": 6165 }, { "epoch": 2.67, "grad_norm": 2.455215341523393, "learning_rate": 1.0592264272470128e-05, "loss": 0.5141, "step": 6170 }, { "epoch": 2.67, "grad_norm": 3.0805989635867594, "learning_rate": 1.0577210912969492e-05, "loss": 0.4961, "step": 6175 }, { "epoch": 2.67, "grad_norm": 4.048856790949565, "learning_rate": 1.0562156241000389e-05, "loss": 0.6041, "step": 6180 }, { "epoch": 2.67, "grad_norm": 3.2536278909993346, "learning_rate": 1.0547100290794303e-05, "loss": 0.4917, "step": 6185 }, { "epoch": 2.67, "grad_norm": 5.798717038594271, "learning_rate": 1.0532043096585613e-05, "loss": 0.5321, "step": 6190 }, { "epoch": 2.68, "grad_norm": 4.3837410681270805, "learning_rate": 1.0516984692611533e-05, "loss": 0.6015, "step": 6195 }, { "epoch": 2.68, "grad_norm": 4.576176396461095, "learning_rate": 1.0501925113112024e-05, "loss": 0.6082, "step": 6200 }, { "epoch": 2.68, "grad_norm": 1.707471946131464, "learning_rate": 1.0486864392329723e-05, "loss": 0.508, "step": 6205 }, { "epoch": 2.68, "grad_norm": 2.2217634890861824, "learning_rate": 1.0471802564509865e-05, "loss": 0.5729, "step": 6210 }, { "epoch": 2.68, "grad_norm": 1.6203039931412289, "learning_rate": 1.045673966390019e-05, "loss": 0.5472, "step": 6215 }, { "epoch": 2.69, "grad_norm": 2.4729542822145607, "learning_rate": 1.0441675724750895e-05, "loss": 0.533, "step": 6220 }, { "epoch": 2.69, "grad_norm": 1.841780535568983, "learning_rate": 1.042661078131452e-05, "loss": 0.468, "step": 6225 }, { "epoch": 2.69, "grad_norm": 0.9034137425899711, "learning_rate": 1.0411544867845904e-05, "loss": 0.4318, "step": 6230 }, { "epoch": 2.69, "grad_norm": 2.2121584496901514, "learning_rate": 1.0396478018602083e-05, "loss": 0.4904, "step": 6235 }, { "epoch": 2.7, "grad_norm": 2.0715237369990356, "learning_rate": 1.038141026784222e-05, "loss": 0.4411, "step": 6240 }, { "epoch": 2.7, "grad_norm": 3.8613319036660907, "learning_rate": 1.0366341649827538e-05, "loss": 0.549, "step": 6245 }, { "epoch": 2.7, "grad_norm": 4.138638245292264, "learning_rate": 1.0351272198821218e-05, "loss": 0.5471, "step": 6250 }, { "epoch": 2.7, "grad_norm": 3.4322114787684606, "learning_rate": 1.0336201949088344e-05, "loss": 0.5714, "step": 6255 }, { "epoch": 2.7, "grad_norm": 3.7460770229320297, "learning_rate": 1.0321130934895818e-05, "loss": 0.5299, "step": 6260 }, { "epoch": 2.71, "grad_norm": 1.0187985411942515, "learning_rate": 1.030605919051227e-05, "loss": 0.4599, "step": 6265 }, { "epoch": 2.71, "grad_norm": 3.060934636461662, "learning_rate": 1.0290986750208001e-05, "loss": 0.4518, "step": 6270 }, { "epoch": 2.71, "grad_norm": 2.6057030776915115, "learning_rate": 1.0275913648254887e-05, "loss": 0.4628, "step": 6275 }, { "epoch": 2.71, "grad_norm": 1.6893046080930116, "learning_rate": 1.0260839918926314e-05, "loss": 0.4309, "step": 6280 }, { "epoch": 2.71, "grad_norm": 1.4400368288913405, "learning_rate": 1.024576559649709e-05, "loss": 0.5131, "step": 6285 }, { "epoch": 2.72, "grad_norm": 1.070692376269189, "learning_rate": 1.0230690715243375e-05, "loss": 0.4188, "step": 6290 }, { "epoch": 2.72, "grad_norm": 2.058634828733364, "learning_rate": 1.02156153094426e-05, "loss": 0.5291, "step": 6295 }, { "epoch": 2.72, "grad_norm": 3.4252175800130167, "learning_rate": 1.0200539413373381e-05, "loss": 0.4725, "step": 6300 }, { "epoch": 2.72, "grad_norm": 3.9280466206448046, "learning_rate": 1.018546306131546e-05, "loss": 0.4743, "step": 6305 }, { "epoch": 2.73, "grad_norm": 2.151132653512246, "learning_rate": 1.0170386287549612e-05, "loss": 0.4581, "step": 6310 }, { "epoch": 2.73, "grad_norm": 1.2839090665270678, "learning_rate": 1.0155309126357562e-05, "loss": 0.5197, "step": 6315 }, { "epoch": 2.73, "grad_norm": 1.075875412375835, "learning_rate": 1.0140231612021931e-05, "loss": 0.4433, "step": 6320 }, { "epoch": 2.73, "grad_norm": 2.632088454474315, "learning_rate": 1.0125153778826133e-05, "loss": 0.4793, "step": 6325 }, { "epoch": 2.73, "grad_norm": 2.4920741381531326, "learning_rate": 1.0110075661054306e-05, "loss": 0.5394, "step": 6330 }, { "epoch": 2.74, "grad_norm": 1.018665224091436, "learning_rate": 1.0094997292991244e-05, "loss": 0.3782, "step": 6335 }, { "epoch": 2.74, "grad_norm": 2.5508307612111056, "learning_rate": 1.0079918708922303e-05, "loss": 0.4532, "step": 6340 }, { "epoch": 2.74, "grad_norm": 1.655282795993782, "learning_rate": 1.0064839943133328e-05, "loss": 0.4194, "step": 6345 }, { "epoch": 2.74, "grad_norm": 2.361478377401841, "learning_rate": 1.0049761029910585e-05, "loss": 0.5437, "step": 6350 }, { "epoch": 2.75, "grad_norm": 3.649049573919179, "learning_rate": 1.003468200354067e-05, "loss": 0.543, "step": 6355 }, { "epoch": 2.75, "grad_norm": 1.2283646653492042, "learning_rate": 1.0019602898310438e-05, "loss": 0.502, "step": 6360 }, { "epoch": 2.75, "grad_norm": 3.1541246866307904, "learning_rate": 1.000452374850692e-05, "loss": 0.4247, "step": 6365 }, { "epoch": 2.75, "grad_norm": 4.11299828550103, "learning_rate": 9.989444588417254e-06, "loss": 0.5446, "step": 6370 }, { "epoch": 2.75, "grad_norm": 4.914138694275413, "learning_rate": 9.9743654523286e-06, "loss": 0.4956, "step": 6375 }, { "epoch": 2.76, "grad_norm": 2.9470768143393022, "learning_rate": 9.959286374528059e-06, "loss": 0.4672, "step": 6380 }, { "epoch": 2.76, "grad_norm": 2.1677074867145225, "learning_rate": 9.944207389302603e-06, "loss": 0.5284, "step": 6385 }, { "epoch": 2.76, "grad_norm": 1.9429131304932807, "learning_rate": 9.929128530938996e-06, "loss": 0.4724, "step": 6390 }, { "epoch": 2.76, "grad_norm": 1.918155595325713, "learning_rate": 9.9140498337237e-06, "loss": 0.4078, "step": 6395 }, { "epoch": 2.76, "grad_norm": 1.2710780443623437, "learning_rate": 9.898971331942836e-06, "loss": 0.4892, "step": 6400 }, { "epoch": 2.77, "grad_norm": 2.0197316051827365, "learning_rate": 9.883893059882054e-06, "loss": 0.4773, "step": 6405 }, { "epoch": 2.77, "grad_norm": 1.9979343758482182, "learning_rate": 9.8688150518265e-06, "loss": 0.4322, "step": 6410 }, { "epoch": 2.77, "grad_norm": 2.9015111491639023, "learning_rate": 9.853737342060711e-06, "loss": 0.4626, "step": 6415 }, { "epoch": 2.77, "grad_norm": 1.9284808560910882, "learning_rate": 9.838659964868545e-06, "loss": 0.4723, "step": 6420 }, { "epoch": 2.78, "grad_norm": 1.696924508811764, "learning_rate": 9.82358295453311e-06, "loss": 0.4322, "step": 6425 }, { "epoch": 2.78, "grad_norm": 1.5827504845958853, "learning_rate": 9.808506345336675e-06, "loss": 0.4958, "step": 6430 }, { "epoch": 2.78, "grad_norm": 2.933433261057401, "learning_rate": 9.793430171560602e-06, "loss": 0.4268, "step": 6435 }, { "epoch": 2.78, "grad_norm": 2.4259662609504917, "learning_rate": 9.778354467485255e-06, "loss": 0.4229, "step": 6440 }, { "epoch": 2.78, "grad_norm": 2.5082682873508784, "learning_rate": 9.763279267389929e-06, "loss": 0.4694, "step": 6445 }, { "epoch": 2.79, "grad_norm": 2.25023974453689, "learning_rate": 9.74820460555279e-06, "loss": 0.4411, "step": 6450 }, { "epoch": 2.79, "grad_norm": 1.4082748459739975, "learning_rate": 9.733130516250761e-06, "loss": 0.355, "step": 6455 }, { "epoch": 2.79, "grad_norm": 1.4067199247341755, "learning_rate": 9.718057033759474e-06, "loss": 0.4116, "step": 6460 }, { "epoch": 2.79, "grad_norm": 1.7519299418804521, "learning_rate": 9.702984192353176e-06, "loss": 0.4818, "step": 6465 }, { "epoch": 2.79, "grad_norm": 3.4175019738202104, "learning_rate": 9.687912026304653e-06, "loss": 0.5346, "step": 6470 }, { "epoch": 2.8, "grad_norm": 1.9864314826474614, "learning_rate": 9.67284056988517e-06, "loss": 0.4581, "step": 6475 }, { "epoch": 2.8, "grad_norm": 3.138077655049651, "learning_rate": 9.657769857364363e-06, "loss": 0.518, "step": 6480 }, { "epoch": 2.8, "grad_norm": 2.502870249731197, "learning_rate": 9.642699923010185e-06, "loss": 0.4508, "step": 6485 }, { "epoch": 2.8, "grad_norm": 1.7930765574987866, "learning_rate": 9.627630801088816e-06, "loss": 0.4101, "step": 6490 }, { "epoch": 2.81, "grad_norm": 2.544947230994809, "learning_rate": 9.612562525864584e-06, "loss": 0.4858, "step": 6495 }, { "epoch": 2.81, "grad_norm": 4.137243268762429, "learning_rate": 9.597495131599906e-06, "loss": 0.4585, "step": 6500 }, { "epoch": 2.81, "grad_norm": 3.9778120532643086, "learning_rate": 9.582428652555185e-06, "loss": 0.4436, "step": 6505 }, { "epoch": 2.81, "grad_norm": 2.2441182670565736, "learning_rate": 9.567363122988741e-06, "loss": 0.4107, "step": 6510 }, { "epoch": 2.81, "grad_norm": 1.141091245400905, "learning_rate": 9.552298577156747e-06, "loss": 0.4256, "step": 6515 }, { "epoch": 2.82, "grad_norm": 1.7129366841254983, "learning_rate": 9.537235049313121e-06, "loss": 0.3952, "step": 6520 }, { "epoch": 2.82, "grad_norm": 1.236988405519333, "learning_rate": 9.522172573709488e-06, "loss": 0.4265, "step": 6525 }, { "epoch": 2.82, "grad_norm": 1.576104076344442, "learning_rate": 9.507111184595062e-06, "loss": 0.4207, "step": 6530 }, { "epoch": 2.82, "grad_norm": 1.3526871132184435, "learning_rate": 9.492050916216593e-06, "loss": 0.5159, "step": 6535 }, { "epoch": 2.83, "grad_norm": 2.2800239749148594, "learning_rate": 9.47699180281829e-06, "loss": 0.4127, "step": 6540 }, { "epoch": 2.83, "grad_norm": 1.2561916542114218, "learning_rate": 9.461933878641718e-06, "loss": 0.3991, "step": 6545 }, { "epoch": 2.83, "grad_norm": 2.2628973811496476, "learning_rate": 9.446877177925764e-06, "loss": 0.4333, "step": 6550 }, { "epoch": 2.83, "grad_norm": 1.4247293460628585, "learning_rate": 9.431821734906506e-06, "loss": 0.4714, "step": 6555 }, { "epoch": 2.83, "grad_norm": 1.619837488784781, "learning_rate": 9.416767583817178e-06, "loss": 0.3538, "step": 6560 }, { "epoch": 2.84, "grad_norm": 1.598948161513362, "learning_rate": 9.401714758888076e-06, "loss": 0.4258, "step": 6565 }, { "epoch": 2.84, "grad_norm": 1.0926972438062792, "learning_rate": 9.386663294346472e-06, "loss": 0.4779, "step": 6570 }, { "epoch": 2.84, "grad_norm": 2.508482094322588, "learning_rate": 9.371613224416554e-06, "loss": 0.4806, "step": 6575 }, { "epoch": 2.84, "grad_norm": 1.7024240871896414, "learning_rate": 9.356564583319335e-06, "loss": 0.4665, "step": 6580 }, { "epoch": 2.84, "grad_norm": 2.0503223331113056, "learning_rate": 9.341517405272575e-06, "loss": 0.4305, "step": 6585 }, { "epoch": 2.85, "grad_norm": 1.443289846371451, "learning_rate": 9.326471724490717e-06, "loss": 0.4521, "step": 6590 }, { "epoch": 2.85, "grad_norm": 2.2239936478869726, "learning_rate": 9.311427575184788e-06, "loss": 0.4195, "step": 6595 }, { "epoch": 2.85, "grad_norm": 1.5756273116741237, "learning_rate": 9.296384991562346e-06, "loss": 0.4334, "step": 6600 }, { "epoch": 2.85, "grad_norm": 1.6584753210048995, "learning_rate": 9.281344007827376e-06, "loss": 0.4295, "step": 6605 }, { "epoch": 2.86, "grad_norm": 2.8103529367397115, "learning_rate": 9.26630465818023e-06, "loss": 0.4584, "step": 6610 }, { "epoch": 2.86, "grad_norm": 2.691772042232581, "learning_rate": 9.25126697681755e-06, "loss": 0.4621, "step": 6615 }, { "epoch": 2.86, "grad_norm": 2.7735396415701468, "learning_rate": 9.236230997932174e-06, "loss": 0.4358, "step": 6620 }, { "epoch": 2.86, "grad_norm": 2.8240980703269316, "learning_rate": 9.221196755713082e-06, "loss": 0.4101, "step": 6625 }, { "epoch": 2.86, "grad_norm": 5.0412248180564605, "learning_rate": 9.206164284345291e-06, "loss": 0.4649, "step": 6630 }, { "epoch": 2.87, "grad_norm": 1.1581478268127954, "learning_rate": 9.1911336180098e-06, "loss": 0.413, "step": 6635 }, { "epoch": 2.87, "grad_norm": 4.453405657732692, "learning_rate": 9.176104790883504e-06, "loss": 0.4488, "step": 6640 }, { "epoch": 2.87, "grad_norm": 3.012774695677809, "learning_rate": 9.16107783713911e-06, "loss": 0.4058, "step": 6645 }, { "epoch": 2.87, "grad_norm": 1.2290176178612868, "learning_rate": 9.146052790945072e-06, "loss": 0.3824, "step": 6650 }, { "epoch": 2.87, "grad_norm": 2.232066797254159, "learning_rate": 9.131029686465506e-06, "loss": 0.506, "step": 6655 }, { "epoch": 2.88, "grad_norm": 4.16063650508915, "learning_rate": 9.116008557860106e-06, "loss": 0.4308, "step": 6660 }, { "epoch": 2.88, "grad_norm": 1.8369268794794247, "learning_rate": 9.100989439284081e-06, "loss": 0.4339, "step": 6665 }, { "epoch": 2.88, "grad_norm": 1.4077468997120663, "learning_rate": 9.08597236488806e-06, "loss": 0.413, "step": 6670 }, { "epoch": 2.88, "grad_norm": 1.2057626155911438, "learning_rate": 9.070957368818043e-06, "loss": 0.4537, "step": 6675 }, { "epoch": 2.89, "grad_norm": 1.5566729642358135, "learning_rate": 9.055944485215283e-06, "loss": 0.3966, "step": 6680 }, { "epoch": 2.89, "grad_norm": 2.600064929447201, "learning_rate": 9.04093374821624e-06, "loss": 0.4803, "step": 6685 }, { "epoch": 2.89, "grad_norm": 1.6680538318410998, "learning_rate": 9.02592519195249e-06, "loss": 0.4409, "step": 6690 }, { "epoch": 2.89, "grad_norm": 2.1954047183681937, "learning_rate": 9.01091885055065e-06, "loss": 0.4342, "step": 6695 }, { "epoch": 2.89, "grad_norm": 4.49678925419594, "learning_rate": 8.99591475813231e-06, "loss": 0.4316, "step": 6700 }, { "epoch": 2.9, "grad_norm": 2.2027231960939346, "learning_rate": 8.980912948813932e-06, "loss": 0.3865, "step": 6705 }, { "epoch": 2.9, "grad_norm": 2.581265968046604, "learning_rate": 8.965913456706796e-06, "loss": 0.4989, "step": 6710 }, { "epoch": 2.9, "grad_norm": 1.6644182265975438, "learning_rate": 8.950916315916912e-06, "loss": 0.3984, "step": 6715 }, { "epoch": 2.9, "grad_norm": 1.4570520063725072, "learning_rate": 8.935921560544938e-06, "loss": 0.437, "step": 6720 }, { "epoch": 2.9, "grad_norm": 1.1355513140288211, "learning_rate": 8.920929224686117e-06, "loss": 0.3342, "step": 6725 }, { "epoch": 2.91, "grad_norm": 1.7876848271437962, "learning_rate": 8.905939342430185e-06, "loss": 0.4636, "step": 6730 }, { "epoch": 2.91, "grad_norm": 2.4404383696926386, "learning_rate": 8.890951947861296e-06, "loss": 0.44, "step": 6735 }, { "epoch": 2.91, "grad_norm": 1.0600647901504878, "learning_rate": 8.875967075057953e-06, "loss": 0.4511, "step": 6740 }, { "epoch": 2.91, "grad_norm": 1.9431591387196776, "learning_rate": 8.86098475809292e-06, "loss": 0.3206, "step": 6745 }, { "epoch": 2.92, "grad_norm": 1.727807582573479, "learning_rate": 8.84600503103316e-06, "loss": 0.414, "step": 6750 }, { "epoch": 2.92, "grad_norm": 1.923727161874462, "learning_rate": 8.831027927939736e-06, "loss": 0.415, "step": 6755 }, { "epoch": 2.92, "grad_norm": 3.373896731208882, "learning_rate": 8.816053482867745e-06, "loss": 0.527, "step": 6760 }, { "epoch": 2.92, "grad_norm": 1.5335921617244541, "learning_rate": 8.80108172986625e-06, "loss": 0.4036, "step": 6765 }, { "epoch": 2.92, "grad_norm": 1.9471891820925509, "learning_rate": 8.786112702978176e-06, "loss": 0.5486, "step": 6770 }, { "epoch": 2.93, "grad_norm": 2.2347916473025022, "learning_rate": 8.77114643624027e-06, "loss": 0.4457, "step": 6775 }, { "epoch": 2.93, "grad_norm": 2.3678639583636265, "learning_rate": 8.756182963682987e-06, "loss": 0.4018, "step": 6780 }, { "epoch": 2.93, "grad_norm": 2.6214495125461847, "learning_rate": 8.741222319330434e-06, "loss": 0.4425, "step": 6785 }, { "epoch": 2.93, "grad_norm": 2.5343906167412498, "learning_rate": 8.72626453720029e-06, "loss": 0.4468, "step": 6790 }, { "epoch": 2.94, "grad_norm": 1.9361513781363973, "learning_rate": 8.711309651303718e-06, "loss": 0.378, "step": 6795 }, { "epoch": 2.94, "grad_norm": 4.69711181302915, "learning_rate": 8.696357695645308e-06, "loss": 0.3949, "step": 6800 }, { "epoch": 2.94, "grad_norm": 5.346661659511524, "learning_rate": 8.681408704222974e-06, "loss": 0.4211, "step": 6805 }, { "epoch": 2.94, "grad_norm": 2.449125693644456, "learning_rate": 8.666462711027894e-06, "loss": 0.3589, "step": 6810 }, { "epoch": 2.94, "grad_norm": 1.6096907460963736, "learning_rate": 8.651519750044438e-06, "loss": 0.3983, "step": 6815 }, { "epoch": 2.95, "grad_norm": 1.3264924866914174, "learning_rate": 8.636579855250062e-06, "loss": 0.394, "step": 6820 }, { "epoch": 2.95, "grad_norm": 4.37270376441934, "learning_rate": 8.621643060615272e-06, "loss": 0.4499, "step": 6825 }, { "epoch": 2.95, "grad_norm": 1.6547788544441633, "learning_rate": 8.606709400103512e-06, "loss": 0.4202, "step": 6830 }, { "epoch": 2.95, "grad_norm": 2.0890274265714206, "learning_rate": 8.591778907671099e-06, "loss": 0.3356, "step": 6835 }, { "epoch": 2.95, "grad_norm": 4.268006056243279, "learning_rate": 8.576851617267151e-06, "loss": 0.3998, "step": 6840 }, { "epoch": 2.96, "grad_norm": 2.863642651618915, "learning_rate": 8.5619275628335e-06, "loss": 0.3956, "step": 6845 }, { "epoch": 2.96, "grad_norm": 1.1601259335650023, "learning_rate": 8.547006778304632e-06, "loss": 0.3632, "step": 6850 }, { "epoch": 2.96, "grad_norm": 2.0777569420111854, "learning_rate": 8.532089297607585e-06, "loss": 0.3995, "step": 6855 }, { "epoch": 2.96, "grad_norm": 2.2903854384703433, "learning_rate": 8.517175154661887e-06, "loss": 0.4314, "step": 6860 }, { "epoch": 2.97, "grad_norm": 1.734580558138607, "learning_rate": 8.50226438337948e-06, "loss": 0.3829, "step": 6865 }, { "epoch": 2.97, "grad_norm": 3.0950107309613637, "learning_rate": 8.487357017664637e-06, "loss": 0.4173, "step": 6870 }, { "epoch": 2.97, "grad_norm": 1.2769399862835231, "learning_rate": 8.472453091413895e-06, "loss": 0.3269, "step": 6875 }, { "epoch": 2.97, "grad_norm": 2.874297218160978, "learning_rate": 8.457552638515957e-06, "loss": 0.5393, "step": 6880 }, { "epoch": 2.97, "grad_norm": 1.0257328536932597, "learning_rate": 8.44265569285164e-06, "loss": 0.3151, "step": 6885 }, { "epoch": 2.98, "grad_norm": 2.3351313764160255, "learning_rate": 8.427762288293774e-06, "loss": 0.4007, "step": 6890 }, { "epoch": 2.98, "grad_norm": 1.5703654283551154, "learning_rate": 8.412872458707149e-06, "loss": 0.3595, "step": 6895 }, { "epoch": 2.98, "grad_norm": 1.8729325717727523, "learning_rate": 8.397986237948426e-06, "loss": 0.3091, "step": 6900 }, { "epoch": 2.98, "grad_norm": 1.6325566546085524, "learning_rate": 8.383103659866051e-06, "loss": 0.3869, "step": 6905 }, { "epoch": 2.98, "grad_norm": 1.1253158005213604, "learning_rate": 8.368224758300196e-06, "loss": 0.3839, "step": 6910 }, { "epoch": 2.99, "grad_norm": 1.7683227163639756, "learning_rate": 8.353349567082664e-06, "loss": 0.377, "step": 6915 }, { "epoch": 2.99, "grad_norm": 2.5880147569316163, "learning_rate": 8.338478120036827e-06, "loss": 0.4638, "step": 6920 }, { "epoch": 2.99, "grad_norm": 2.3260828538915503, "learning_rate": 8.32361045097755e-06, "loss": 0.5224, "step": 6925 }, { "epoch": 2.99, "grad_norm": 1.2921962675871295, "learning_rate": 8.308746593711097e-06, "loss": 0.3786, "step": 6930 }, { "epoch": 3.0, "grad_norm": 1.373713282307955, "learning_rate": 8.293886582035068e-06, "loss": 0.4048, "step": 6935 }, { "epoch": 3.0, "grad_norm": 1.1792303077370565, "learning_rate": 8.27903044973832e-06, "loss": 0.3728, "step": 6940 }, { "epoch": 3.0, "grad_norm": 2.478527290979682, "learning_rate": 8.264178230600886e-06, "loss": 0.3835, "step": 6945 }, { "epoch": 3.0, "eval_loss": 3.1219844818115234, "eval_runtime": 352.5163, "eval_samples_per_second": 21.295, "eval_steps_per_second": 0.335, "step": 6945 }, { "epoch": 3.0, "grad_norm": 2.4438409801730288, "learning_rate": 8.24932995839391e-06, "loss": 0.3604, "step": 6950 }, { "epoch": 3.0, "grad_norm": 0.9829599516746514, "learning_rate": 8.234485666879552e-06, "loss": 0.3694, "step": 6955 }, { "epoch": 3.01, "grad_norm": 1.6859068677210045, "learning_rate": 8.219645389810923e-06, "loss": 0.4267, "step": 6960 }, { "epoch": 3.01, "grad_norm": 3.0400939723252676, "learning_rate": 8.204809160932004e-06, "loss": 0.382, "step": 6965 }, { "epoch": 3.01, "grad_norm": 2.365210178859166, "learning_rate": 8.189977013977575e-06, "loss": 0.404, "step": 6970 }, { "epoch": 3.01, "grad_norm": 1.6313288974490237, "learning_rate": 8.175148982673141e-06, "loss": 0.3676, "step": 6975 }, { "epoch": 3.02, "grad_norm": 1.5199651747372227, "learning_rate": 8.160325100734835e-06, "loss": 0.4012, "step": 6980 }, { "epoch": 3.02, "grad_norm": 1.7919489483520143, "learning_rate": 8.14550540186936e-06, "loss": 0.3834, "step": 6985 }, { "epoch": 3.02, "grad_norm": 1.376372130365141, "learning_rate": 8.13068991977391e-06, "loss": 0.3562, "step": 6990 }, { "epoch": 3.02, "grad_norm": 1.0713239095717266, "learning_rate": 8.115878688136088e-06, "loss": 0.3052, "step": 6995 }, { "epoch": 3.02, "grad_norm": 1.3659799887935307, "learning_rate": 8.10107174063384e-06, "loss": 0.3978, "step": 7000 }, { "epoch": 3.03, "grad_norm": 1.4462952426077653, "learning_rate": 8.08626911093536e-06, "loss": 0.3724, "step": 7005 }, { "epoch": 3.03, "grad_norm": 1.4157895388816184, "learning_rate": 8.07147083269903e-06, "loss": 0.3455, "step": 7010 }, { "epoch": 3.03, "grad_norm": 0.8782933167988847, "learning_rate": 8.056676939573334e-06, "loss": 0.3594, "step": 7015 }, { "epoch": 3.03, "grad_norm": 1.6277189620907764, "learning_rate": 8.041887465196788e-06, "loss": 0.4029, "step": 7020 }, { "epoch": 3.03, "grad_norm": 2.6546278749619274, "learning_rate": 8.02710244319786e-06, "loss": 0.4223, "step": 7025 }, { "epoch": 3.04, "grad_norm": 1.211076145212302, "learning_rate": 8.0123219071949e-06, "loss": 0.3598, "step": 7030 }, { "epoch": 3.04, "grad_norm": 1.343844774689017, "learning_rate": 7.997545890796045e-06, "loss": 0.3523, "step": 7035 }, { "epoch": 3.04, "grad_norm": 1.6274284116880298, "learning_rate": 7.982774427599162e-06, "loss": 0.3976, "step": 7040 }, { "epoch": 3.04, "grad_norm": 2.8144432271137214, "learning_rate": 7.968007551191767e-06, "loss": 0.3435, "step": 7045 }, { "epoch": 3.05, "grad_norm": 3.2774547224149004, "learning_rate": 7.953245295150945e-06, "loss": 0.4223, "step": 7050 }, { "epoch": 3.05, "grad_norm": 1.7535061060351183, "learning_rate": 7.938487693043277e-06, "loss": 0.3808, "step": 7055 }, { "epoch": 3.05, "grad_norm": 2.185881923891815, "learning_rate": 7.923734778424755e-06, "loss": 0.3872, "step": 7060 }, { "epoch": 3.05, "grad_norm": 2.5655611819438864, "learning_rate": 7.908986584840719e-06, "loss": 0.3469, "step": 7065 }, { "epoch": 3.05, "grad_norm": 2.0458124980828414, "learning_rate": 7.894243145825769e-06, "loss": 0.3517, "step": 7070 }, { "epoch": 3.06, "grad_norm": 2.09865517921951, "learning_rate": 7.879504494903703e-06, "loss": 0.3747, "step": 7075 }, { "epoch": 3.06, "grad_norm": 1.836960027260886, "learning_rate": 7.864770665587424e-06, "loss": 0.3784, "step": 7080 }, { "epoch": 3.06, "grad_norm": 2.161059273404664, "learning_rate": 7.850041691378873e-06, "loss": 0.3529, "step": 7085 }, { "epoch": 3.06, "grad_norm": 2.36114844324576, "learning_rate": 7.835317605768949e-06, "loss": 0.3875, "step": 7090 }, { "epoch": 3.06, "grad_norm": 1.7898708810429749, "learning_rate": 7.820598442237444e-06, "loss": 0.3854, "step": 7095 }, { "epoch": 3.07, "grad_norm": 1.9649168757553455, "learning_rate": 7.805884234252947e-06, "loss": 0.3459, "step": 7100 }, { "epoch": 3.07, "grad_norm": 1.9301743683275345, "learning_rate": 7.791175015272791e-06, "loss": 0.3507, "step": 7105 }, { "epoch": 3.07, "grad_norm": 1.3558476831220407, "learning_rate": 7.776470818742954e-06, "loss": 0.3579, "step": 7110 }, { "epoch": 3.07, "grad_norm": 1.5222520880851103, "learning_rate": 7.761771678097997e-06, "loss": 0.3348, "step": 7115 }, { "epoch": 3.08, "grad_norm": 2.3225581037412923, "learning_rate": 7.747077626760989e-06, "loss": 0.365, "step": 7120 }, { "epoch": 3.08, "grad_norm": 1.4854703386339154, "learning_rate": 7.732388698143421e-06, "loss": 0.3497, "step": 7125 }, { "epoch": 3.08, "grad_norm": 2.4825826951759966, "learning_rate": 7.717704925645145e-06, "loss": 0.4039, "step": 7130 }, { "epoch": 3.08, "grad_norm": 1.1508037345497122, "learning_rate": 7.703026342654279e-06, "loss": 0.3357, "step": 7135 }, { "epoch": 3.08, "grad_norm": 1.401657005259486, "learning_rate": 7.68835298254714e-06, "loss": 0.346, "step": 7140 }, { "epoch": 3.09, "grad_norm": 0.616882217177935, "learning_rate": 7.673684878688183e-06, "loss": 0.3683, "step": 7145 }, { "epoch": 3.09, "grad_norm": 1.4440225390107253, "learning_rate": 7.659022064429897e-06, "loss": 0.4064, "step": 7150 }, { "epoch": 3.09, "grad_norm": 1.3782845348525206, "learning_rate": 7.644364573112752e-06, "loss": 0.3057, "step": 7155 }, { "epoch": 3.09, "grad_norm": 1.1185076699186876, "learning_rate": 7.629712438065112e-06, "loss": 0.3627, "step": 7160 }, { "epoch": 3.1, "grad_norm": 0.9216251938487858, "learning_rate": 7.615065692603154e-06, "loss": 0.2782, "step": 7165 }, { "epoch": 3.1, "grad_norm": 1.0953121936604426, "learning_rate": 7.6004243700308136e-06, "loss": 0.3553, "step": 7170 }, { "epoch": 3.1, "grad_norm": 1.1345399633196933, "learning_rate": 7.585788503639688e-06, "loss": 0.3723, "step": 7175 }, { "epoch": 3.1, "grad_norm": 1.3955329601427968, "learning_rate": 7.571158126708974e-06, "loss": 0.3782, "step": 7180 }, { "epoch": 3.1, "grad_norm": 1.9115571763477264, "learning_rate": 7.556533272505377e-06, "loss": 0.3813, "step": 7185 }, { "epoch": 3.11, "grad_norm": 1.7498110352468872, "learning_rate": 7.5419139742830494e-06, "loss": 0.3144, "step": 7190 }, { "epoch": 3.11, "grad_norm": 1.0163021017255334, "learning_rate": 7.5273002652835124e-06, "loss": 0.3512, "step": 7195 }, { "epoch": 3.11, "grad_norm": 1.536549393457774, "learning_rate": 7.512692178735578e-06, "loss": 0.3413, "step": 7200 }, { "epoch": 3.11, "grad_norm": 1.8775277627792926, "learning_rate": 7.4980897478552725e-06, "loss": 0.406, "step": 7205 }, { "epoch": 3.11, "grad_norm": 1.9644401322054297, "learning_rate": 7.483493005845762e-06, "loss": 0.4718, "step": 7210 }, { "epoch": 3.12, "grad_norm": 2.625406118579211, "learning_rate": 7.468901985897277e-06, "loss": 0.344, "step": 7215 }, { "epoch": 3.12, "grad_norm": 2.163595461889686, "learning_rate": 7.454316721187034e-06, "loss": 0.3996, "step": 7220 }, { "epoch": 3.12, "grad_norm": 2.228158971734925, "learning_rate": 7.43973724487917e-06, "loss": 0.3664, "step": 7225 }, { "epoch": 3.12, "grad_norm": 2.524343595169013, "learning_rate": 7.425163590124662e-06, "loss": 0.3957, "step": 7230 }, { "epoch": 3.13, "grad_norm": 0.675888461514967, "learning_rate": 7.4105957900612405e-06, "loss": 0.3292, "step": 7235 }, { "epoch": 3.13, "grad_norm": 1.6605683075642659, "learning_rate": 7.396033877813327e-06, "loss": 0.3192, "step": 7240 }, { "epoch": 3.13, "grad_norm": 1.569991195286367, "learning_rate": 7.381477886491953e-06, "loss": 0.4003, "step": 7245 }, { "epoch": 3.13, "grad_norm": 2.632148940238971, "learning_rate": 7.366927849194698e-06, "loss": 0.3839, "step": 7250 }, { "epoch": 3.13, "grad_norm": 1.352149468997016, "learning_rate": 7.352383799005593e-06, "loss": 0.3815, "step": 7255 }, { "epoch": 3.14, "grad_norm": 1.662409127769855, "learning_rate": 7.3378457689950565e-06, "loss": 0.3485, "step": 7260 }, { "epoch": 3.14, "grad_norm": 1.7956039040207492, "learning_rate": 7.323313792219819e-06, "loss": 0.367, "step": 7265 }, { "epoch": 3.14, "grad_norm": 1.8540078766409693, "learning_rate": 7.308787901722847e-06, "loss": 0.3722, "step": 7270 }, { "epoch": 3.14, "grad_norm": 0.7085122569624023, "learning_rate": 7.294268130533268e-06, "loss": 0.3218, "step": 7275 }, { "epoch": 3.14, "grad_norm": 2.497501765694081, "learning_rate": 7.279754511666304e-06, "loss": 0.4098, "step": 7280 }, { "epoch": 3.15, "grad_norm": 1.285506504028356, "learning_rate": 7.265247078123171e-06, "loss": 0.3316, "step": 7285 }, { "epoch": 3.15, "grad_norm": 2.0715015673160715, "learning_rate": 7.250745862891033e-06, "loss": 0.316, "step": 7290 }, { "epoch": 3.15, "grad_norm": 1.6351432229152634, "learning_rate": 7.236250898942906e-06, "loss": 0.342, "step": 7295 }, { "epoch": 3.15, "grad_norm": 1.8714186683779126, "learning_rate": 7.221762219237603e-06, "loss": 0.4127, "step": 7300 }, { "epoch": 3.16, "grad_norm": 1.2075723689154334, "learning_rate": 7.207279856719642e-06, "loss": 0.39, "step": 7305 }, { "epoch": 3.16, "grad_norm": 1.842860731387192, "learning_rate": 7.192803844319176e-06, "loss": 0.3206, "step": 7310 }, { "epoch": 3.16, "grad_norm": 0.9966979046850045, "learning_rate": 7.178334214951919e-06, "loss": 0.3021, "step": 7315 }, { "epoch": 3.16, "grad_norm": 1.645332076913864, "learning_rate": 7.163871001519065e-06, "loss": 0.3523, "step": 7320 }, { "epoch": 3.16, "grad_norm": 2.5864286290966634, "learning_rate": 7.149414236907238e-06, "loss": 0.4284, "step": 7325 }, { "epoch": 3.17, "grad_norm": 2.1920020064756747, "learning_rate": 7.134963953988381e-06, "loss": 0.3299, "step": 7330 }, { "epoch": 3.17, "grad_norm": 0.7475437303317399, "learning_rate": 7.120520185619706e-06, "loss": 0.3112, "step": 7335 }, { "epoch": 3.17, "grad_norm": 2.14902808663687, "learning_rate": 7.1060829646436104e-06, "loss": 0.3632, "step": 7340 }, { "epoch": 3.17, "grad_norm": 1.5457835835946556, "learning_rate": 7.091652323887601e-06, "loss": 0.3356, "step": 7345 }, { "epoch": 3.17, "grad_norm": 1.5894112066172816, "learning_rate": 7.077228296164231e-06, "loss": 0.357, "step": 7350 }, { "epoch": 3.18, "grad_norm": 1.2040205720783954, "learning_rate": 7.062810914271012e-06, "loss": 0.3663, "step": 7355 }, { "epoch": 3.18, "grad_norm": 1.33985630492986, "learning_rate": 7.048400210990344e-06, "loss": 0.3366, "step": 7360 }, { "epoch": 3.18, "grad_norm": 1.127291173301345, "learning_rate": 7.033996219089438e-06, "loss": 0.3272, "step": 7365 }, { "epoch": 3.18, "grad_norm": 1.7185161121647898, "learning_rate": 7.0195989713202465e-06, "loss": 0.3234, "step": 7370 }, { "epoch": 3.19, "grad_norm": 1.1498105845305382, "learning_rate": 7.005208500419393e-06, "loss": 0.2983, "step": 7375 }, { "epoch": 3.19, "grad_norm": 2.2371882954147235, "learning_rate": 6.990824839108086e-06, "loss": 0.3753, "step": 7380 }, { "epoch": 3.19, "grad_norm": 0.7884854218345562, "learning_rate": 6.976448020092051e-06, "loss": 0.3763, "step": 7385 }, { "epoch": 3.19, "grad_norm": 2.09921574448507, "learning_rate": 6.962078076061456e-06, "loss": 0.3753, "step": 7390 }, { "epoch": 3.19, "grad_norm": 2.482230635386038, "learning_rate": 6.947715039690831e-06, "loss": 0.4298, "step": 7395 }, { "epoch": 3.2, "grad_norm": 2.108335121499453, "learning_rate": 6.933358943639014e-06, "loss": 0.3505, "step": 7400 }, { "epoch": 3.2, "grad_norm": 1.5449594462702159, "learning_rate": 6.919009820549045e-06, "loss": 0.394, "step": 7405 }, { "epoch": 3.2, "grad_norm": 1.2438744387949128, "learning_rate": 6.904667703048121e-06, "loss": 0.3462, "step": 7410 }, { "epoch": 3.2, "grad_norm": 1.289234931861233, "learning_rate": 6.890332623747504e-06, "loss": 0.3629, "step": 7415 }, { "epoch": 3.21, "grad_norm": 0.8977406690803236, "learning_rate": 6.876004615242449e-06, "loss": 0.3823, "step": 7420 }, { "epoch": 3.21, "grad_norm": 2.1847093235275623, "learning_rate": 6.861683710112146e-06, "loss": 0.397, "step": 7425 }, { "epoch": 3.21, "grad_norm": 3.0155479918164647, "learning_rate": 6.847369940919618e-06, "loss": 0.3929, "step": 7430 }, { "epoch": 3.21, "grad_norm": 4.104350300963877, "learning_rate": 6.833063340211674e-06, "loss": 0.4007, "step": 7435 }, { "epoch": 3.21, "grad_norm": 1.8049485982992373, "learning_rate": 6.818763940518818e-06, "loss": 0.354, "step": 7440 }, { "epoch": 3.22, "grad_norm": 1.2427853854370055, "learning_rate": 6.804471774355175e-06, "loss": 0.3642, "step": 7445 }, { "epoch": 3.22, "grad_norm": 2.331588538403581, "learning_rate": 6.790186874218437e-06, "loss": 0.3201, "step": 7450 }, { "epoch": 3.22, "grad_norm": 1.7805164540285714, "learning_rate": 6.77590927258976e-06, "loss": 0.3839, "step": 7455 }, { "epoch": 3.22, "grad_norm": 1.341701157263398, "learning_rate": 6.761639001933713e-06, "loss": 0.3224, "step": 7460 }, { "epoch": 3.22, "grad_norm": 1.2404201279129343, "learning_rate": 6.747376094698191e-06, "loss": 0.39, "step": 7465 }, { "epoch": 3.23, "grad_norm": 2.5663217984821713, "learning_rate": 6.7331205833143456e-06, "loss": 0.3605, "step": 7470 }, { "epoch": 3.23, "grad_norm": 1.2525035749489488, "learning_rate": 6.718872500196521e-06, "loss": 0.3395, "step": 7475 }, { "epoch": 3.23, "grad_norm": 1.5167515096191115, "learning_rate": 6.704631877742156e-06, "loss": 0.3075, "step": 7480 }, { "epoch": 3.23, "grad_norm": 1.662607642634134, "learning_rate": 6.690398748331739e-06, "loss": 0.3361, "step": 7485 }, { "epoch": 3.24, "grad_norm": 1.0759956437678158, "learning_rate": 6.676173144328713e-06, "loss": 0.3006, "step": 7490 }, { "epoch": 3.24, "grad_norm": 1.045165095111159, "learning_rate": 6.661955098079407e-06, "loss": 0.3062, "step": 7495 }, { "epoch": 3.24, "grad_norm": 1.028477080146992, "learning_rate": 6.6477446419129765e-06, "loss": 0.2854, "step": 7500 }, { "epoch": 3.24, "grad_norm": 1.839431683459406, "learning_rate": 6.633541808141307e-06, "loss": 0.2772, "step": 7505 }, { "epoch": 3.24, "grad_norm": 1.2499750070114224, "learning_rate": 6.61934662905896e-06, "loss": 0.3068, "step": 7510 }, { "epoch": 3.25, "grad_norm": 1.4318397803172023, "learning_rate": 6.605159136943086e-06, "loss": 0.3998, "step": 7515 }, { "epoch": 3.25, "grad_norm": 1.6896153505949632, "learning_rate": 6.590979364053358e-06, "loss": 0.336, "step": 7520 }, { "epoch": 3.25, "grad_norm": 1.8291472755252598, "learning_rate": 6.576807342631901e-06, "loss": 0.3195, "step": 7525 }, { "epoch": 3.25, "grad_norm": 1.086754478155717, "learning_rate": 6.56264310490321e-06, "loss": 0.3124, "step": 7530 }, { "epoch": 3.25, "grad_norm": 0.8549620256435886, "learning_rate": 6.548486683074086e-06, "loss": 0.3528, "step": 7535 }, { "epoch": 3.26, "grad_norm": 0.858456604542488, "learning_rate": 6.534338109333551e-06, "loss": 0.3175, "step": 7540 }, { "epoch": 3.26, "grad_norm": 1.870253816789839, "learning_rate": 6.5201974158527855e-06, "loss": 0.3451, "step": 7545 }, { "epoch": 3.26, "grad_norm": 1.7701261568886084, "learning_rate": 6.506064634785059e-06, "loss": 0.3468, "step": 7550 }, { "epoch": 3.26, "grad_norm": 1.3093267757249363, "learning_rate": 6.4919397982656365e-06, "loss": 0.3102, "step": 7555 }, { "epoch": 3.27, "grad_norm": 1.4187325171777991, "learning_rate": 6.47782293841173e-06, "loss": 0.3231, "step": 7560 }, { "epoch": 3.27, "grad_norm": 1.3886715755141823, "learning_rate": 6.463714087322406e-06, "loss": 0.3683, "step": 7565 }, { "epoch": 3.27, "grad_norm": 2.306803637558559, "learning_rate": 6.4496132770785215e-06, "loss": 0.4046, "step": 7570 }, { "epoch": 3.27, "grad_norm": 1.4954651335990372, "learning_rate": 6.43552053974266e-06, "loss": 0.2898, "step": 7575 }, { "epoch": 3.27, "grad_norm": 1.1220262633443565, "learning_rate": 6.421435907359035e-06, "loss": 0.3033, "step": 7580 }, { "epoch": 3.28, "grad_norm": 1.8256112203819537, "learning_rate": 6.407359411953441e-06, "loss": 0.2982, "step": 7585 }, { "epoch": 3.28, "grad_norm": 1.7431511555290342, "learning_rate": 6.393291085533166e-06, "loss": 0.2926, "step": 7590 }, { "epoch": 3.28, "grad_norm": 0.9850643228063368, "learning_rate": 6.379230960086916e-06, "loss": 0.2945, "step": 7595 }, { "epoch": 3.28, "grad_norm": 1.7655045331686694, "learning_rate": 6.36517906758477e-06, "loss": 0.4019, "step": 7600 }, { "epoch": 3.29, "grad_norm": 1.3966168616487404, "learning_rate": 6.351135439978067e-06, "loss": 0.2689, "step": 7605 }, { "epoch": 3.29, "grad_norm": 0.9472185721991497, "learning_rate": 6.337100109199362e-06, "loss": 0.312, "step": 7610 }, { "epoch": 3.29, "grad_norm": 1.7266621380307388, "learning_rate": 6.3230731071623445e-06, "loss": 0.3638, "step": 7615 }, { "epoch": 3.29, "grad_norm": 2.091724927308154, "learning_rate": 6.309054465761758e-06, "loss": 0.2872, "step": 7620 }, { "epoch": 3.29, "grad_norm": 1.073271495200662, "learning_rate": 6.295044216873351e-06, "loss": 0.3007, "step": 7625 }, { "epoch": 3.3, "grad_norm": 1.117360265840462, "learning_rate": 6.281042392353775e-06, "loss": 0.3308, "step": 7630 }, { "epoch": 3.3, "grad_norm": 1.5440549578000846, "learning_rate": 6.267049024040535e-06, "loss": 0.2729, "step": 7635 }, { "epoch": 3.3, "grad_norm": 1.5481051987579446, "learning_rate": 6.253064143751901e-06, "loss": 0.3386, "step": 7640 }, { "epoch": 3.3, "grad_norm": 3.786049943590816, "learning_rate": 6.239087783286842e-06, "loss": 0.3006, "step": 7645 }, { "epoch": 3.3, "grad_norm": 1.7723356489577555, "learning_rate": 6.225119974424969e-06, "loss": 0.3207, "step": 7650 }, { "epoch": 3.31, "grad_norm": 1.319330771697122, "learning_rate": 6.211160748926431e-06, "loss": 0.3429, "step": 7655 }, { "epoch": 3.31, "grad_norm": 0.9383400641879289, "learning_rate": 6.19721013853187e-06, "loss": 0.285, "step": 7660 }, { "epoch": 3.31, "grad_norm": 1.5060664851409171, "learning_rate": 6.183268174962333e-06, "loss": 0.3419, "step": 7665 }, { "epoch": 3.31, "grad_norm": 1.2514169013060776, "learning_rate": 6.16933488991921e-06, "loss": 0.3034, "step": 7670 }, { "epoch": 3.32, "grad_norm": 1.3352849856980331, "learning_rate": 6.1554103150841585e-06, "loss": 0.3616, "step": 7675 }, { "epoch": 3.32, "grad_norm": 2.519664197804798, "learning_rate": 6.141494482119024e-06, "loss": 0.2999, "step": 7680 }, { "epoch": 3.32, "grad_norm": 1.645506931854804, "learning_rate": 6.127587422665785e-06, "loss": 0.2752, "step": 7685 }, { "epoch": 3.32, "grad_norm": 1.26311160539326, "learning_rate": 6.113689168346462e-06, "loss": 0.2893, "step": 7690 }, { "epoch": 3.32, "grad_norm": 1.1737096926315287, "learning_rate": 6.099799750763051e-06, "loss": 0.3313, "step": 7695 }, { "epoch": 3.33, "grad_norm": 1.3998936159924413, "learning_rate": 6.085919201497472e-06, "loss": 0.3007, "step": 7700 }, { "epoch": 3.33, "grad_norm": 1.4549429673890482, "learning_rate": 6.072047552111462e-06, "loss": 0.2869, "step": 7705 }, { "epoch": 3.33, "grad_norm": 1.746910600245538, "learning_rate": 6.058184834146532e-06, "loss": 0.3488, "step": 7710 }, { "epoch": 3.33, "grad_norm": 1.0407842846096143, "learning_rate": 6.044331079123882e-06, "loss": 0.3, "step": 7715 }, { "epoch": 3.33, "grad_norm": 1.6015467888098907, "learning_rate": 6.030486318544322e-06, "loss": 0.3575, "step": 7720 }, { "epoch": 3.34, "grad_norm": 1.4990017412694239, "learning_rate": 6.016650583888231e-06, "loss": 0.3011, "step": 7725 }, { "epoch": 3.34, "grad_norm": 1.0912555050972461, "learning_rate": 6.0028239066154495e-06, "loss": 0.2803, "step": 7730 }, { "epoch": 3.34, "grad_norm": 1.8084455742572358, "learning_rate": 5.989006318165224e-06, "loss": 0.301, "step": 7735 }, { "epoch": 3.34, "grad_norm": 0.5940513087308738, "learning_rate": 5.97519784995614e-06, "loss": 0.3093, "step": 7740 }, { "epoch": 3.35, "grad_norm": 1.8390181703969186, "learning_rate": 5.96139853338604e-06, "loss": 0.3046, "step": 7745 }, { "epoch": 3.35, "grad_norm": 1.2236133534368583, "learning_rate": 5.947608399831968e-06, "loss": 0.3721, "step": 7750 }, { "epoch": 3.35, "grad_norm": 1.3411315105622825, "learning_rate": 5.933827480650073e-06, "loss": 0.3068, "step": 7755 }, { "epoch": 3.35, "grad_norm": 1.683704960625591, "learning_rate": 5.92005580717556e-06, "loss": 0.4067, "step": 7760 }, { "epoch": 3.35, "grad_norm": 1.4060882167908326, "learning_rate": 5.906293410722613e-06, "loss": 0.3623, "step": 7765 }, { "epoch": 3.36, "grad_norm": 1.727295083424792, "learning_rate": 5.892540322584315e-06, "loss": 0.3564, "step": 7770 }, { "epoch": 3.36, "grad_norm": 0.8805570616777468, "learning_rate": 5.878796574032592e-06, "loss": 0.3287, "step": 7775 }, { "epoch": 3.36, "grad_norm": 0.7273911720571473, "learning_rate": 5.865062196318128e-06, "loss": 0.273, "step": 7780 }, { "epoch": 3.36, "grad_norm": 1.3971767306685285, "learning_rate": 5.851337220670299e-06, "loss": 0.3476, "step": 7785 }, { "epoch": 3.37, "grad_norm": 1.5662729786773861, "learning_rate": 5.837621678297104e-06, "loss": 0.3471, "step": 7790 }, { "epoch": 3.37, "grad_norm": 1.6410213733545815, "learning_rate": 5.823915600385095e-06, "loss": 0.3553, "step": 7795 }, { "epoch": 3.37, "grad_norm": 1.4199761399560424, "learning_rate": 5.810219018099299e-06, "loss": 0.378, "step": 7800 }, { "epoch": 3.37, "grad_norm": 1.6803309581824162, "learning_rate": 5.796531962583161e-06, "loss": 0.2952, "step": 7805 }, { "epoch": 3.37, "grad_norm": 0.8482833840019645, "learning_rate": 5.782854464958445e-06, "loss": 0.322, "step": 7810 }, { "epoch": 3.38, "grad_norm": 2.011908070466235, "learning_rate": 5.769186556325206e-06, "loss": 0.2833, "step": 7815 }, { "epoch": 3.38, "grad_norm": 0.992761062940804, "learning_rate": 5.755528267761669e-06, "loss": 0.3253, "step": 7820 }, { "epoch": 3.38, "grad_norm": 1.295611381712756, "learning_rate": 5.741879630324214e-06, "loss": 0.2747, "step": 7825 }, { "epoch": 3.38, "grad_norm": 1.1938601977260708, "learning_rate": 5.728240675047259e-06, "loss": 0.3383, "step": 7830 }, { "epoch": 3.38, "grad_norm": 0.8344509260934919, "learning_rate": 5.714611432943202e-06, "loss": 0.2881, "step": 7835 }, { "epoch": 3.39, "grad_norm": 1.3994896482762227, "learning_rate": 5.700991935002371e-06, "loss": 0.3118, "step": 7840 }, { "epoch": 3.39, "grad_norm": 0.7711648002211058, "learning_rate": 5.687382212192915e-06, "loss": 0.3013, "step": 7845 }, { "epoch": 3.39, "grad_norm": 1.0473806175451943, "learning_rate": 5.673782295460784e-06, "loss": 0.2497, "step": 7850 }, { "epoch": 3.39, "grad_norm": 1.4164383045109232, "learning_rate": 5.660192215729615e-06, "loss": 0.3133, "step": 7855 }, { "epoch": 3.4, "grad_norm": 0.9574061018200604, "learning_rate": 5.646612003900674e-06, "loss": 0.3171, "step": 7860 }, { "epoch": 3.4, "grad_norm": 1.708395064070684, "learning_rate": 5.633041690852798e-06, "loss": 0.3316, "step": 7865 }, { "epoch": 3.4, "grad_norm": 1.2330174450753875, "learning_rate": 5.619481307442306e-06, "loss": 0.3241, "step": 7870 }, { "epoch": 3.4, "grad_norm": 1.2419549464618695, "learning_rate": 5.605930884502954e-06, "loss": 0.3131, "step": 7875 }, { "epoch": 3.4, "grad_norm": 1.0772138558401552, "learning_rate": 5.592390452845844e-06, "loss": 0.3444, "step": 7880 }, { "epoch": 3.41, "grad_norm": 1.4128274027698726, "learning_rate": 5.578860043259346e-06, "loss": 0.2587, "step": 7885 }, { "epoch": 3.41, "grad_norm": 0.9309254408786461, "learning_rate": 5.5653396865090635e-06, "loss": 0.3206, "step": 7890 }, { "epoch": 3.41, "grad_norm": 1.233915081617126, "learning_rate": 5.5518294133377195e-06, "loss": 0.2833, "step": 7895 }, { "epoch": 3.41, "grad_norm": 1.0971867385885703, "learning_rate": 5.538329254465128e-06, "loss": 0.2947, "step": 7900 }, { "epoch": 3.41, "grad_norm": 2.4423525002869053, "learning_rate": 5.5248392405881e-06, "loss": 0.386, "step": 7905 }, { "epoch": 3.42, "grad_norm": 1.6676084708192005, "learning_rate": 5.51135940238037e-06, "loss": 0.2742, "step": 7910 }, { "epoch": 3.42, "grad_norm": 0.9193407813258758, "learning_rate": 5.497889770492546e-06, "loss": 0.2955, "step": 7915 }, { "epoch": 3.42, "grad_norm": 1.6840232895053835, "learning_rate": 5.484430375552015e-06, "loss": 0.2896, "step": 7920 }, { "epoch": 3.42, "grad_norm": 2.089338229394022, "learning_rate": 5.4709812481629035e-06, "loss": 0.3333, "step": 7925 }, { "epoch": 3.43, "grad_norm": 1.2417429731256493, "learning_rate": 5.457542418905986e-06, "loss": 0.3443, "step": 7930 }, { "epoch": 3.43, "grad_norm": 1.187138692055809, "learning_rate": 5.4441139183386116e-06, "loss": 0.3059, "step": 7935 }, { "epoch": 3.43, "grad_norm": 1.380993547712161, "learning_rate": 5.430695776994659e-06, "loss": 0.35, "step": 7940 }, { "epoch": 3.43, "grad_norm": 1.1629205213157978, "learning_rate": 5.4172880253844325e-06, "loss": 0.3202, "step": 7945 }, { "epoch": 3.43, "grad_norm": 1.4779553786860646, "learning_rate": 5.403890693994635e-06, "loss": 0.3535, "step": 7950 }, { "epoch": 3.44, "grad_norm": 1.0785096047218297, "learning_rate": 5.3905038132882644e-06, "loss": 0.2486, "step": 7955 }, { "epoch": 3.44, "grad_norm": 1.6003168513345345, "learning_rate": 5.377127413704548e-06, "loss": 0.3267, "step": 7960 }, { "epoch": 3.44, "grad_norm": 0.8661954065325067, "learning_rate": 5.3637615256589e-06, "loss": 0.2747, "step": 7965 }, { "epoch": 3.44, "grad_norm": 1.0632816286685356, "learning_rate": 5.350406179542804e-06, "loss": 0.2608, "step": 7970 }, { "epoch": 3.44, "grad_norm": 1.4309598280517628, "learning_rate": 5.337061405723813e-06, "loss": 0.2653, "step": 7975 }, { "epoch": 3.45, "grad_norm": 1.4823225940472557, "learning_rate": 5.323727234545406e-06, "loss": 0.3331, "step": 7980 }, { "epoch": 3.45, "grad_norm": 1.1371470643568795, "learning_rate": 5.310403696326969e-06, "loss": 0.3151, "step": 7985 }, { "epoch": 3.45, "grad_norm": 1.0474163097333449, "learning_rate": 5.297090821363713e-06, "loss": 0.2394, "step": 7990 }, { "epoch": 3.45, "grad_norm": 2.0260247807879663, "learning_rate": 5.2837886399265855e-06, "loss": 0.3131, "step": 7995 }, { "epoch": 3.46, "grad_norm": 1.257262975782416, "learning_rate": 5.270497182262251e-06, "loss": 0.2739, "step": 8000 }, { "epoch": 3.46, "grad_norm": 1.3679796310400674, "learning_rate": 5.257216478592956e-06, "loss": 0.2799, "step": 8005 }, { "epoch": 3.46, "grad_norm": 1.2880772135115273, "learning_rate": 5.243946559116513e-06, "loss": 0.2591, "step": 8010 }, { "epoch": 3.46, "grad_norm": 1.3722257951161014, "learning_rate": 5.230687454006213e-06, "loss": 0.2561, "step": 8015 }, { "epoch": 3.46, "grad_norm": 1.4788204117150598, "learning_rate": 5.2174391934107445e-06, "loss": 0.3161, "step": 8020 }, { "epoch": 3.47, "grad_norm": 1.1950353284608026, "learning_rate": 5.204201807454161e-06, "loss": 0.2837, "step": 8025 }, { "epoch": 3.47, "grad_norm": 1.47777539000232, "learning_rate": 5.190975326235764e-06, "loss": 0.3246, "step": 8030 }, { "epoch": 3.47, "grad_norm": 1.800356278890997, "learning_rate": 5.177759779830072e-06, "loss": 0.3721, "step": 8035 }, { "epoch": 3.47, "grad_norm": 0.879211002173503, "learning_rate": 5.164555198286745e-06, "loss": 0.2545, "step": 8040 }, { "epoch": 3.48, "grad_norm": 1.6605980255623658, "learning_rate": 5.151361611630491e-06, "loss": 0.3551, "step": 8045 }, { "epoch": 3.48, "grad_norm": 0.9258465918526644, "learning_rate": 5.138179049861047e-06, "loss": 0.2511, "step": 8050 }, { "epoch": 3.48, "grad_norm": 1.362803220020721, "learning_rate": 5.125007542953056e-06, "loss": 0.2996, "step": 8055 }, { "epoch": 3.48, "grad_norm": 1.4846626929901958, "learning_rate": 5.111847120856035e-06, "loss": 0.2861, "step": 8060 }, { "epoch": 3.48, "grad_norm": 1.3120726030735523, "learning_rate": 5.0986978134943e-06, "loss": 0.3216, "step": 8065 }, { "epoch": 3.49, "grad_norm": 2.3701644636458017, "learning_rate": 5.085559650766878e-06, "loss": 0.3636, "step": 8070 }, { "epoch": 3.49, "grad_norm": 2.2927283455933685, "learning_rate": 5.072432662547483e-06, "loss": 0.2699, "step": 8075 }, { "epoch": 3.49, "grad_norm": 1.4104584634968191, "learning_rate": 5.05931687868439e-06, "loss": 0.2846, "step": 8080 }, { "epoch": 3.49, "grad_norm": 1.6350220111620895, "learning_rate": 5.046212329000417e-06, "loss": 0.3417, "step": 8085 }, { "epoch": 3.49, "grad_norm": 0.9603072152402977, "learning_rate": 5.033119043292835e-06, "loss": 0.3143, "step": 8090 }, { "epoch": 3.5, "grad_norm": 1.5098697111760186, "learning_rate": 5.02003705133329e-06, "loss": 0.3101, "step": 8095 }, { "epoch": 3.5, "grad_norm": 1.241925543830471, "learning_rate": 5.006966382867771e-06, "loss": 0.2992, "step": 8100 }, { "epoch": 3.5, "grad_norm": 1.5474911953971806, "learning_rate": 4.9939070676164935e-06, "loss": 0.3282, "step": 8105 }, { "epoch": 3.5, "grad_norm": 4.243167744270766, "learning_rate": 4.980859135273878e-06, "loss": 0.3118, "step": 8110 }, { "epoch": 3.51, "grad_norm": 2.5000761058036507, "learning_rate": 4.967822615508457e-06, "loss": 0.2975, "step": 8115 }, { "epoch": 3.51, "grad_norm": 3.5097390106620856, "learning_rate": 4.9547975379628e-06, "loss": 0.2945, "step": 8120 }, { "epoch": 3.51, "grad_norm": 2.209763598381993, "learning_rate": 4.941783932253487e-06, "loss": 0.3477, "step": 8125 }, { "epoch": 3.51, "grad_norm": 1.4044042169605864, "learning_rate": 4.928781827970983e-06, "loss": 0.2261, "step": 8130 }, { "epoch": 3.51, "grad_norm": 1.3885448858555944, "learning_rate": 4.9157912546796205e-06, "loss": 0.3017, "step": 8135 }, { "epoch": 3.52, "grad_norm": 1.2411471192003973, "learning_rate": 4.9028122419175075e-06, "loss": 0.2904, "step": 8140 }, { "epoch": 3.52, "grad_norm": 1.2413220137474514, "learning_rate": 4.889844819196455e-06, "loss": 0.2782, "step": 8145 }, { "epoch": 3.52, "grad_norm": 1.250835967591782, "learning_rate": 4.876889016001946e-06, "loss": 0.3283, "step": 8150 }, { "epoch": 3.52, "grad_norm": 0.9716840825910151, "learning_rate": 4.863944861793014e-06, "loss": 0.3037, "step": 8155 }, { "epoch": 3.52, "grad_norm": 2.260877892021157, "learning_rate": 4.851012386002223e-06, "loss": 0.3047, "step": 8160 }, { "epoch": 3.53, "grad_norm": 1.681828011210631, "learning_rate": 4.83809161803558e-06, "loss": 0.2672, "step": 8165 }, { "epoch": 3.53, "grad_norm": 1.495162454290298, "learning_rate": 4.825182587272458e-06, "loss": 0.2552, "step": 8170 }, { "epoch": 3.53, "grad_norm": 1.292898157049884, "learning_rate": 4.812285323065565e-06, "loss": 0.2828, "step": 8175 }, { "epoch": 3.53, "grad_norm": 0.952818561352293, "learning_rate": 4.799399854740832e-06, "loss": 0.2739, "step": 8180 }, { "epoch": 3.54, "grad_norm": 0.723324136997605, "learning_rate": 4.786526211597378e-06, "loss": 0.2376, "step": 8185 }, { "epoch": 3.54, "grad_norm": 1.2593200948705463, "learning_rate": 4.7736644229074396e-06, "loss": 0.2956, "step": 8190 }, { "epoch": 3.54, "grad_norm": 1.3296718387734723, "learning_rate": 4.760814517916279e-06, "loss": 0.2842, "step": 8195 }, { "epoch": 3.54, "grad_norm": 0.8986198037137462, "learning_rate": 4.747976525842167e-06, "loss": 0.2481, "step": 8200 }, { "epoch": 3.54, "grad_norm": 1.7112417432680016, "learning_rate": 4.7351504758762565e-06, "loss": 0.301, "step": 8205 }, { "epoch": 3.55, "grad_norm": 1.3088144678719051, "learning_rate": 4.7223363971825655e-06, "loss": 0.268, "step": 8210 }, { "epoch": 3.55, "grad_norm": 1.0416234935318807, "learning_rate": 4.7095343188978885e-06, "loss": 0.2657, "step": 8215 }, { "epoch": 3.55, "grad_norm": 1.050594486786159, "learning_rate": 4.696744270131719e-06, "loss": 0.2606, "step": 8220 }, { "epoch": 3.55, "grad_norm": 1.0367524661999572, "learning_rate": 4.683966279966225e-06, "loss": 0.2551, "step": 8225 }, { "epoch": 3.56, "grad_norm": 1.1194180754236926, "learning_rate": 4.6712003774561275e-06, "loss": 0.2901, "step": 8230 }, { "epoch": 3.56, "grad_norm": 0.8141812221195328, "learning_rate": 4.65844659162868e-06, "loss": 0.2653, "step": 8235 }, { "epoch": 3.56, "grad_norm": 0.8263363451947964, "learning_rate": 4.645704951483583e-06, "loss": 0.2908, "step": 8240 }, { "epoch": 3.56, "grad_norm": 0.6953214011687743, "learning_rate": 4.632975485992903e-06, "loss": 0.2778, "step": 8245 }, { "epoch": 3.56, "grad_norm": 1.2865699217071096, "learning_rate": 4.620258224101057e-06, "loss": 0.2438, "step": 8250 }, { "epoch": 3.57, "grad_norm": 0.6117621448414158, "learning_rate": 4.607553194724677e-06, "loss": 0.2361, "step": 8255 }, { "epoch": 3.57, "grad_norm": 0.8914164995414522, "learning_rate": 4.594860426752607e-06, "loss": 0.2586, "step": 8260 }, { "epoch": 3.57, "grad_norm": 1.6881361423247938, "learning_rate": 4.5821799490458e-06, "loss": 0.287, "step": 8265 }, { "epoch": 3.57, "grad_norm": 1.2899428254311105, "learning_rate": 4.569511790437254e-06, "loss": 0.3003, "step": 8270 }, { "epoch": 3.57, "grad_norm": 1.6864476846021144, "learning_rate": 4.556855979731983e-06, "loss": 0.2636, "step": 8275 }, { "epoch": 3.58, "grad_norm": 0.9578718355240123, "learning_rate": 4.544212545706895e-06, "loss": 0.2438, "step": 8280 }, { "epoch": 3.58, "grad_norm": 1.2250154123094863, "learning_rate": 4.531581517110771e-06, "loss": 0.2761, "step": 8285 }, { "epoch": 3.58, "grad_norm": 0.5692185705890477, "learning_rate": 4.5189629226641876e-06, "loss": 0.2421, "step": 8290 }, { "epoch": 3.58, "grad_norm": 1.4743329422756897, "learning_rate": 4.506356791059427e-06, "loss": 0.2361, "step": 8295 }, { "epoch": 3.59, "grad_norm": 1.4258334907405563, "learning_rate": 4.493763150960468e-06, "loss": 0.2737, "step": 8300 }, { "epoch": 3.59, "grad_norm": 1.0513338499870875, "learning_rate": 4.481182031002853e-06, "loss": 0.2387, "step": 8305 }, { "epoch": 3.59, "grad_norm": 1.5143591042082543, "learning_rate": 4.468613459793675e-06, "loss": 0.2581, "step": 8310 }, { "epoch": 3.59, "grad_norm": 0.8425164174407974, "learning_rate": 4.456057465911489e-06, "loss": 0.2787, "step": 8315 }, { "epoch": 3.59, "grad_norm": 1.1182810076099785, "learning_rate": 4.443514077906242e-06, "loss": 0.2749, "step": 8320 }, { "epoch": 3.6, "grad_norm": 0.9606133791212749, "learning_rate": 4.430983324299242e-06, "loss": 0.2766, "step": 8325 }, { "epoch": 3.6, "grad_norm": 2.106283493636646, "learning_rate": 4.418465233583039e-06, "loss": 0.3489, "step": 8330 }, { "epoch": 3.6, "grad_norm": 0.8807348145669678, "learning_rate": 4.405959834221411e-06, "loss": 0.2673, "step": 8335 }, { "epoch": 3.6, "grad_norm": 1.2524322039073619, "learning_rate": 4.3934671546492715e-06, "loss": 0.247, "step": 8340 }, { "epoch": 3.6, "grad_norm": 1.0394890463062174, "learning_rate": 4.380987223272602e-06, "loss": 0.256, "step": 8345 }, { "epoch": 3.61, "grad_norm": 1.00137341927747, "learning_rate": 4.368520068468423e-06, "loss": 0.2457, "step": 8350 }, { "epoch": 3.61, "grad_norm": 1.7009249189017777, "learning_rate": 4.356065718584673e-06, "loss": 0.3016, "step": 8355 }, { "epoch": 3.61, "grad_norm": 1.5553016418878596, "learning_rate": 4.343624201940195e-06, "loss": 0.3043, "step": 8360 }, { "epoch": 3.61, "grad_norm": 1.197565393045384, "learning_rate": 4.331195546824646e-06, "loss": 0.2368, "step": 8365 }, { "epoch": 3.62, "grad_norm": 1.4071865316430026, "learning_rate": 4.318779781498429e-06, "loss": 0.3015, "step": 8370 }, { "epoch": 3.62, "grad_norm": 1.3096415330679698, "learning_rate": 4.306376934192659e-06, "loss": 0.2764, "step": 8375 }, { "epoch": 3.62, "grad_norm": 1.209092329560919, "learning_rate": 4.293987033109055e-06, "loss": 0.2275, "step": 8380 }, { "epoch": 3.62, "grad_norm": 1.7613370849719026, "learning_rate": 4.281610106419913e-06, "loss": 0.2764, "step": 8385 }, { "epoch": 3.62, "grad_norm": 1.295214792210852, "learning_rate": 4.269246182268027e-06, "loss": 0.2701, "step": 8390 }, { "epoch": 3.63, "grad_norm": 1.50791699725049, "learning_rate": 4.256895288766614e-06, "loss": 0.258, "step": 8395 }, { "epoch": 3.63, "grad_norm": 1.654373380876267, "learning_rate": 4.244557453999282e-06, "loss": 0.2752, "step": 8400 }, { "epoch": 3.63, "grad_norm": 1.9897713100742098, "learning_rate": 4.232232706019923e-06, "loss": 0.325, "step": 8405 }, { "epoch": 3.63, "grad_norm": 1.02148414818951, "learning_rate": 4.219921072852689e-06, "loss": 0.2122, "step": 8410 }, { "epoch": 3.63, "grad_norm": 1.1892877259874424, "learning_rate": 4.207622582491907e-06, "loss": 0.2297, "step": 8415 }, { "epoch": 3.64, "grad_norm": 1.237889295378514, "learning_rate": 4.195337262902008e-06, "loss": 0.3163, "step": 8420 }, { "epoch": 3.64, "grad_norm": 1.0589613077003643, "learning_rate": 4.183065142017503e-06, "loss": 0.2808, "step": 8425 }, { "epoch": 3.64, "grad_norm": 0.7728753942869604, "learning_rate": 4.17080624774286e-06, "loss": 0.2444, "step": 8430 }, { "epoch": 3.64, "grad_norm": 1.6520086613499878, "learning_rate": 4.158560607952491e-06, "loss": 0.2676, "step": 8435 }, { "epoch": 3.65, "grad_norm": 1.3800566942565582, "learning_rate": 4.146328250490669e-06, "loss": 0.2525, "step": 8440 }, { "epoch": 3.65, "grad_norm": 1.0603690959383418, "learning_rate": 4.13410920317145e-06, "loss": 0.2182, "step": 8445 }, { "epoch": 3.65, "grad_norm": 1.1364069940034944, "learning_rate": 4.121903493778653e-06, "loss": 0.2353, "step": 8450 }, { "epoch": 3.65, "grad_norm": 1.2550604556115696, "learning_rate": 4.1097111500657406e-06, "loss": 0.2702, "step": 8455 }, { "epoch": 3.65, "grad_norm": 1.0855724510468503, "learning_rate": 4.097532199755801e-06, "loss": 0.3182, "step": 8460 }, { "epoch": 3.66, "grad_norm": 0.8061230337692806, "learning_rate": 4.085366670541469e-06, "loss": 0.274, "step": 8465 }, { "epoch": 3.66, "grad_norm": 0.8247683248127725, "learning_rate": 4.073214590084846e-06, "loss": 0.2667, "step": 8470 }, { "epoch": 3.66, "grad_norm": 1.1839230068133735, "learning_rate": 4.061075986017477e-06, "loss": 0.2364, "step": 8475 }, { "epoch": 3.66, "grad_norm": 1.620173384449278, "learning_rate": 4.048950885940244e-06, "loss": 0.212, "step": 8480 }, { "epoch": 3.67, "grad_norm": 1.193378324765039, "learning_rate": 4.036839317423335e-06, "loss": 0.1993, "step": 8485 }, { "epoch": 3.67, "grad_norm": 0.8733667456843335, "learning_rate": 4.024741308006168e-06, "loss": 0.2398, "step": 8490 }, { "epoch": 3.67, "grad_norm": 1.102519578294471, "learning_rate": 4.012656885197321e-06, "loss": 0.2587, "step": 8495 }, { "epoch": 3.67, "grad_norm": 2.102655622053936, "learning_rate": 4.0005860764744996e-06, "loss": 0.3024, "step": 8500 }, { "epoch": 3.67, "grad_norm": 1.2982264667539742, "learning_rate": 3.988528909284428e-06, "loss": 0.2769, "step": 8505 }, { "epoch": 3.68, "grad_norm": 0.9207503337068603, "learning_rate": 3.97648541104283e-06, "loss": 0.2297, "step": 8510 }, { "epoch": 3.68, "grad_norm": 1.2932708481162427, "learning_rate": 3.964455609134345e-06, "loss": 0.2235, "step": 8515 }, { "epoch": 3.68, "grad_norm": 0.7143624184996928, "learning_rate": 3.952439530912457e-06, "loss": 0.2468, "step": 8520 }, { "epoch": 3.68, "grad_norm": 1.1683102857430236, "learning_rate": 3.94043720369947e-06, "loss": 0.2602, "step": 8525 }, { "epoch": 3.68, "grad_norm": 0.8805470000188161, "learning_rate": 3.928448654786397e-06, "loss": 0.2471, "step": 8530 }, { "epoch": 3.69, "grad_norm": 1.5405448678459484, "learning_rate": 3.916473911432934e-06, "loss": 0.2497, "step": 8535 }, { "epoch": 3.69, "grad_norm": 3.2238368617167614, "learning_rate": 3.904513000867386e-06, "loss": 0.2465, "step": 8540 }, { "epoch": 3.69, "grad_norm": 1.4638528980101133, "learning_rate": 3.892565950286592e-06, "loss": 0.2678, "step": 8545 }, { "epoch": 3.69, "grad_norm": 1.5184496363579658, "learning_rate": 3.880632786855899e-06, "loss": 0.2024, "step": 8550 }, { "epoch": 3.7, "grad_norm": 1.8252758600534205, "learning_rate": 3.868713537709055e-06, "loss": 0.3266, "step": 8555 }, { "epoch": 3.7, "grad_norm": 0.64264179568571, "learning_rate": 3.856808229948179e-06, "loss": 0.1998, "step": 8560 }, { "epoch": 3.7, "grad_norm": 1.1336128812569761, "learning_rate": 3.844916890643696e-06, "loss": 0.2246, "step": 8565 }, { "epoch": 3.7, "grad_norm": 0.941977941979213, "learning_rate": 3.833039546834252e-06, "loss": 0.2437, "step": 8570 }, { "epoch": 3.7, "grad_norm": 0.9723728987072218, "learning_rate": 3.821176225526694e-06, "loss": 0.2441, "step": 8575 }, { "epoch": 3.71, "grad_norm": 1.886020160147755, "learning_rate": 3.80932695369596e-06, "loss": 0.2605, "step": 8580 }, { "epoch": 3.71, "grad_norm": 0.8056642057647267, "learning_rate": 3.797491758285059e-06, "loss": 0.2323, "step": 8585 }, { "epoch": 3.71, "grad_norm": 1.2590539891249422, "learning_rate": 3.7856706662049893e-06, "loss": 0.2743, "step": 8590 }, { "epoch": 3.71, "grad_norm": 1.4292301576478457, "learning_rate": 3.7738637043346683e-06, "loss": 0.2592, "step": 8595 }, { "epoch": 3.71, "grad_norm": 0.9180460073700387, "learning_rate": 3.7620708995209087e-06, "loss": 0.2075, "step": 8600 }, { "epoch": 3.72, "grad_norm": 0.6673357074467728, "learning_rate": 3.7502922785783093e-06, "loss": 0.1997, "step": 8605 }, { "epoch": 3.72, "grad_norm": 0.762136138706124, "learning_rate": 3.738527868289229e-06, "loss": 0.2719, "step": 8610 }, { "epoch": 3.72, "grad_norm": 1.1211722000299258, "learning_rate": 3.726777695403716e-06, "loss": 0.2276, "step": 8615 }, { "epoch": 3.72, "grad_norm": 1.5634256260632298, "learning_rate": 3.7150417866394304e-06, "loss": 0.2348, "step": 8620 }, { "epoch": 3.73, "grad_norm": 1.6256231637279743, "learning_rate": 3.7033201686816257e-06, "loss": 0.2406, "step": 8625 }, { "epoch": 3.73, "grad_norm": 1.2187105096731732, "learning_rate": 3.691612868183031e-06, "loss": 0.2668, "step": 8630 }, { "epoch": 3.73, "grad_norm": 1.2149726719763858, "learning_rate": 3.6799199117638383e-06, "loss": 0.2592, "step": 8635 }, { "epoch": 3.73, "grad_norm": 0.764253856485244, "learning_rate": 3.6682413260116235e-06, "loss": 0.2218, "step": 8640 }, { "epoch": 3.73, "grad_norm": 0.9006111227777452, "learning_rate": 3.656577137481271e-06, "loss": 0.2331, "step": 8645 }, { "epoch": 3.74, "grad_norm": 0.9146889853788844, "learning_rate": 3.644927372694954e-06, "loss": 0.2657, "step": 8650 }, { "epoch": 3.74, "grad_norm": 1.1118503466233536, "learning_rate": 3.633292058142024e-06, "loss": 0.2393, "step": 8655 }, { "epoch": 3.74, "grad_norm": 1.0922268807899702, "learning_rate": 3.621671220278993e-06, "loss": 0.2595, "step": 8660 }, { "epoch": 3.74, "grad_norm": 1.425843520802282, "learning_rate": 3.6100648855294417e-06, "loss": 0.2259, "step": 8665 }, { "epoch": 3.75, "grad_norm": 1.0010134323284015, "learning_rate": 3.598473080283982e-06, "loss": 0.2282, "step": 8670 }, { "epoch": 3.75, "grad_norm": 1.6319837617789361, "learning_rate": 3.5868958309001942e-06, "loss": 0.2152, "step": 8675 }, { "epoch": 3.75, "grad_norm": 1.1531835012647713, "learning_rate": 3.575333163702547e-06, "loss": 0.1998, "step": 8680 }, { "epoch": 3.75, "grad_norm": 0.7140806404064783, "learning_rate": 3.5637851049823645e-06, "loss": 0.1986, "step": 8685 }, { "epoch": 3.75, "grad_norm": 1.396396649138274, "learning_rate": 3.5522516809977416e-06, "loss": 0.2749, "step": 8690 }, { "epoch": 3.76, "grad_norm": 1.790703192319098, "learning_rate": 3.5407329179735027e-06, "loss": 0.2027, "step": 8695 }, { "epoch": 3.76, "grad_norm": 1.1619765530746924, "learning_rate": 3.529228842101149e-06, "loss": 0.2178, "step": 8700 }, { "epoch": 3.76, "grad_norm": 1.1226391583686264, "learning_rate": 3.517739479538762e-06, "loss": 0.2347, "step": 8705 }, { "epoch": 3.76, "grad_norm": 1.5086143605668283, "learning_rate": 3.5062648564109857e-06, "loss": 0.2359, "step": 8710 }, { "epoch": 3.76, "grad_norm": 1.6945930578072637, "learning_rate": 3.4948049988089373e-06, "loss": 0.2149, "step": 8715 }, { "epoch": 3.77, "grad_norm": 1.0255293861017403, "learning_rate": 3.4833599327901656e-06, "loss": 0.2112, "step": 8720 }, { "epoch": 3.77, "grad_norm": 1.7693180895647955, "learning_rate": 3.4719296843785977e-06, "loss": 0.2457, "step": 8725 }, { "epoch": 3.77, "grad_norm": 1.4966986753679365, "learning_rate": 3.4605142795644454e-06, "loss": 0.2843, "step": 8730 }, { "epoch": 3.77, "grad_norm": 0.670891091445466, "learning_rate": 3.4491137443041865e-06, "loss": 0.2146, "step": 8735 }, { "epoch": 3.78, "grad_norm": 1.8052307720833396, "learning_rate": 3.437728104520478e-06, "loss": 0.2104, "step": 8740 }, { "epoch": 3.78, "grad_norm": 1.1813095312113593, "learning_rate": 3.4263573861021104e-06, "loss": 0.2118, "step": 8745 }, { "epoch": 3.78, "grad_norm": 1.4173325757700486, "learning_rate": 3.415001614903958e-06, "loss": 0.2096, "step": 8750 }, { "epoch": 3.78, "grad_norm": 2.4218252609777213, "learning_rate": 3.4036608167468856e-06, "loss": 0.2576, "step": 8755 }, { "epoch": 3.78, "grad_norm": 1.461680496985334, "learning_rate": 3.3923350174177304e-06, "loss": 0.2608, "step": 8760 }, { "epoch": 3.79, "grad_norm": 1.2997679357447258, "learning_rate": 3.381024242669213e-06, "loss": 0.2357, "step": 8765 }, { "epoch": 3.79, "grad_norm": 0.8294127235093294, "learning_rate": 3.3697285182198957e-06, "loss": 0.2536, "step": 8770 }, { "epoch": 3.79, "grad_norm": 1.0896228556623861, "learning_rate": 3.35844786975413e-06, "loss": 0.2489, "step": 8775 }, { "epoch": 3.79, "grad_norm": 2.3166544955166914, "learning_rate": 3.347182322921967e-06, "loss": 0.1742, "step": 8780 }, { "epoch": 3.79, "grad_norm": 1.2716000940595575, "learning_rate": 3.3359319033391366e-06, "loss": 0.2001, "step": 8785 }, { "epoch": 3.8, "grad_norm": 1.3878114041564946, "learning_rate": 3.3246966365869593e-06, "loss": 0.1999, "step": 8790 }, { "epoch": 3.8, "grad_norm": 1.4073458518658912, "learning_rate": 3.3134765482123056e-06, "loss": 0.2048, "step": 8795 }, { "epoch": 3.8, "grad_norm": 2.257280307648163, "learning_rate": 3.3022716637275466e-06, "loss": 0.2383, "step": 8800 }, { "epoch": 3.8, "grad_norm": 1.4865011423989034, "learning_rate": 3.291082008610459e-06, "loss": 0.2236, "step": 8805 }, { "epoch": 3.81, "grad_norm": 1.6028396692657991, "learning_rate": 3.2799076083042104e-06, "loss": 0.2184, "step": 8810 }, { "epoch": 3.81, "grad_norm": 0.7425058030584089, "learning_rate": 3.268748488217267e-06, "loss": 0.1825, "step": 8815 }, { "epoch": 3.81, "grad_norm": 1.8488035722222276, "learning_rate": 3.2576046737233567e-06, "loss": 0.2211, "step": 8820 }, { "epoch": 3.81, "grad_norm": 1.3816883588335565, "learning_rate": 3.246476190161418e-06, "loss": 0.2256, "step": 8825 }, { "epoch": 3.81, "grad_norm": 2.071282818859703, "learning_rate": 3.235363062835507e-06, "loss": 0.2327, "step": 8830 }, { "epoch": 3.82, "grad_norm": 1.4979771549526812, "learning_rate": 3.224265317014781e-06, "loss": 0.213, "step": 8835 }, { "epoch": 3.82, "grad_norm": 1.056089605035267, "learning_rate": 3.213182977933408e-06, "loss": 0.2149, "step": 8840 }, { "epoch": 3.82, "grad_norm": 1.352835390586533, "learning_rate": 3.2021160707905353e-06, "loss": 0.1822, "step": 8845 }, { "epoch": 3.82, "grad_norm": 1.010725156923398, "learning_rate": 3.1910646207502173e-06, "loss": 0.1812, "step": 8850 }, { "epoch": 3.83, "grad_norm": 1.9017812068754216, "learning_rate": 3.1800286529413605e-06, "loss": 0.2013, "step": 8855 }, { "epoch": 3.83, "grad_norm": 1.6218418481134442, "learning_rate": 3.169008192457672e-06, "loss": 0.1713, "step": 8860 }, { "epoch": 3.83, "grad_norm": 1.8330711024738942, "learning_rate": 3.15800326435759e-06, "loss": 0.2366, "step": 8865 }, { "epoch": 3.83, "grad_norm": 1.325482478087709, "learning_rate": 3.1470138936642415e-06, "loss": 0.1753, "step": 8870 }, { "epoch": 3.83, "grad_norm": 1.2317388715406854, "learning_rate": 3.1360401053653776e-06, "loss": 0.1867, "step": 8875 }, { "epoch": 3.84, "grad_norm": 1.383078361027162, "learning_rate": 3.125081924413318e-06, "loss": 0.1834, "step": 8880 }, { "epoch": 3.84, "grad_norm": 1.6364075116336452, "learning_rate": 3.1141393757248973e-06, "loss": 0.1965, "step": 8885 }, { "epoch": 3.84, "grad_norm": 1.5408796381251828, "learning_rate": 3.103212484181396e-06, "loss": 0.1789, "step": 8890 }, { "epoch": 3.84, "grad_norm": 0.7313558930891003, "learning_rate": 3.092301274628502e-06, "loss": 0.221, "step": 8895 }, { "epoch": 3.84, "grad_norm": 1.7236990433362605, "learning_rate": 3.0814057718762413e-06, "loss": 0.1794, "step": 8900 }, { "epoch": 3.85, "grad_norm": 1.309025789799166, "learning_rate": 3.070526000698928e-06, "loss": 0.1904, "step": 8905 }, { "epoch": 3.85, "grad_norm": 1.3057919563887057, "learning_rate": 3.0596619858351073e-06, "loss": 0.1305, "step": 8910 }, { "epoch": 3.85, "grad_norm": 1.2826997903059916, "learning_rate": 3.0488137519874896e-06, "loss": 0.1854, "step": 8915 }, { "epoch": 3.85, "grad_norm": 2.0736399556689205, "learning_rate": 3.0379813238229074e-06, "loss": 0.1735, "step": 8920 }, { "epoch": 3.86, "grad_norm": 0.9527611277386501, "learning_rate": 3.027164725972256e-06, "loss": 0.1645, "step": 8925 }, { "epoch": 3.86, "grad_norm": 1.1335358478265696, "learning_rate": 3.016363983030434e-06, "loss": 0.1621, "step": 8930 }, { "epoch": 3.86, "grad_norm": 2.521984786858169, "learning_rate": 3.0055791195562887e-06, "loss": 0.2026, "step": 8935 }, { "epoch": 3.86, "grad_norm": 1.836220144705608, "learning_rate": 2.9948101600725565e-06, "loss": 0.1724, "step": 8940 }, { "epoch": 3.86, "grad_norm": 1.136504620875479, "learning_rate": 2.9840571290658184e-06, "loss": 0.1774, "step": 8945 }, { "epoch": 3.87, "grad_norm": 1.4816584731960452, "learning_rate": 2.9733200509864314e-06, "loss": 0.178, "step": 8950 }, { "epoch": 3.87, "grad_norm": 1.1642803297715238, "learning_rate": 2.962598950248481e-06, "loss": 0.154, "step": 8955 }, { "epoch": 3.87, "grad_norm": 2.7690606796997552, "learning_rate": 2.9518938512297267e-06, "loss": 0.1782, "step": 8960 }, { "epoch": 3.87, "grad_norm": 1.6223686343268287, "learning_rate": 2.9412047782715334e-06, "loss": 0.1524, "step": 8965 }, { "epoch": 3.87, "grad_norm": 1.3358911181846544, "learning_rate": 2.930531755678835e-06, "loss": 0.1703, "step": 8970 }, { "epoch": 3.88, "grad_norm": 0.7845302281471236, "learning_rate": 2.9198748077200644e-06, "loss": 0.1376, "step": 8975 }, { "epoch": 3.88, "grad_norm": 3.196234583882958, "learning_rate": 2.909233958627108e-06, "loss": 0.1851, "step": 8980 }, { "epoch": 3.88, "grad_norm": 1.5247189113614013, "learning_rate": 2.8986092325952474e-06, "loss": 0.1915, "step": 8985 }, { "epoch": 3.88, "grad_norm": 1.7392497600102992, "learning_rate": 2.888000653783094e-06, "loss": 0.1903, "step": 8990 }, { "epoch": 3.89, "grad_norm": 1.2139519723864711, "learning_rate": 2.877408246312552e-06, "loss": 0.1696, "step": 8995 }, { "epoch": 3.89, "grad_norm": 2.3036921230601677, "learning_rate": 2.866832034268754e-06, "loss": 0.1861, "step": 9000 }, { "epoch": 3.89, "grad_norm": 1.998159592775058, "learning_rate": 2.8562720417000056e-06, "loss": 0.1493, "step": 9005 }, { "epoch": 3.89, "grad_norm": 2.0385570109143663, "learning_rate": 2.8457282926177376e-06, "loss": 0.1725, "step": 9010 }, { "epoch": 3.89, "grad_norm": 1.038409279355451, "learning_rate": 2.8352008109964336e-06, "loss": 0.143, "step": 9015 }, { "epoch": 3.9, "grad_norm": 2.8111977365091607, "learning_rate": 2.8246896207735997e-06, "loss": 0.1763, "step": 9020 }, { "epoch": 3.9, "grad_norm": 1.115676301969164, "learning_rate": 2.8141947458496943e-06, "loss": 0.1369, "step": 9025 }, { "epoch": 3.9, "grad_norm": 2.2547605054805384, "learning_rate": 2.803716210088079e-06, "loss": 0.1525, "step": 9030 }, { "epoch": 3.9, "grad_norm": 2.655789824220769, "learning_rate": 2.7932540373149654e-06, "loss": 0.1598, "step": 9035 }, { "epoch": 3.9, "grad_norm": 1.366174287885307, "learning_rate": 2.78280825131935e-06, "loss": 0.1455, "step": 9040 }, { "epoch": 3.91, "grad_norm": 1.5512552019581032, "learning_rate": 2.772378875852978e-06, "loss": 0.1598, "step": 9045 }, { "epoch": 3.91, "grad_norm": 1.7769531325546062, "learning_rate": 2.7619659346302775e-06, "loss": 0.1343, "step": 9050 }, { "epoch": 3.91, "grad_norm": 1.7409746059130171, "learning_rate": 2.751569451328305e-06, "loss": 0.1497, "step": 9055 }, { "epoch": 3.91, "grad_norm": 3.2371623664471656, "learning_rate": 2.741189449586704e-06, "loss": 0.1452, "step": 9060 }, { "epoch": 3.92, "grad_norm": 1.756639522059405, "learning_rate": 2.7308259530076286e-06, "loss": 0.127, "step": 9065 }, { "epoch": 3.92, "grad_norm": 1.4592380310799502, "learning_rate": 2.720478985155712e-06, "loss": 0.1181, "step": 9070 }, { "epoch": 3.92, "grad_norm": 0.5540812705657977, "learning_rate": 2.7101485695580033e-06, "loss": 0.1276, "step": 9075 }, { "epoch": 3.92, "grad_norm": 0.730628334741152, "learning_rate": 2.699834729703914e-06, "loss": 0.1395, "step": 9080 }, { "epoch": 3.92, "grad_norm": 1.2814134539945647, "learning_rate": 2.6895374890451686e-06, "loss": 0.1406, "step": 9085 }, { "epoch": 3.93, "grad_norm": 1.6017173096861919, "learning_rate": 2.679256870995739e-06, "loss": 0.1497, "step": 9090 }, { "epoch": 3.93, "grad_norm": 4.377136176957885, "learning_rate": 2.668992898931809e-06, "loss": 0.1313, "step": 9095 }, { "epoch": 3.93, "grad_norm": 1.1466406266728912, "learning_rate": 2.658745596191712e-06, "loss": 0.1178, "step": 9100 }, { "epoch": 3.93, "grad_norm": 1.4412542325562125, "learning_rate": 2.648514986075874e-06, "loss": 0.1213, "step": 9105 }, { "epoch": 3.94, "grad_norm": 1.0094886930705556, "learning_rate": 2.638301091846772e-06, "loss": 0.1282, "step": 9110 }, { "epoch": 3.94, "grad_norm": 1.9523149478373765, "learning_rate": 2.6281039367288653e-06, "loss": 0.1332, "step": 9115 }, { "epoch": 3.94, "grad_norm": 1.6163037295208362, "learning_rate": 2.6179235439085584e-06, "loss": 0.1318, "step": 9120 }, { "epoch": 3.94, "grad_norm": 1.6749344686293903, "learning_rate": 2.607759936534138e-06, "loss": 0.1138, "step": 9125 }, { "epoch": 3.94, "grad_norm": 3.742751088082727, "learning_rate": 2.5976131377157264e-06, "loss": 0.1408, "step": 9130 }, { "epoch": 3.95, "grad_norm": 1.2402729828056847, "learning_rate": 2.587483170525229e-06, "loss": 0.1155, "step": 9135 }, { "epoch": 3.95, "grad_norm": 1.5290565815887704, "learning_rate": 2.577370057996268e-06, "loss": 0.1458, "step": 9140 }, { "epoch": 3.95, "grad_norm": 1.728302719729036, "learning_rate": 2.5672738231241547e-06, "loss": 0.1222, "step": 9145 }, { "epoch": 3.95, "grad_norm": 1.6795308054327185, "learning_rate": 2.557194488865814e-06, "loss": 0.1015, "step": 9150 }, { "epoch": 3.95, "grad_norm": 1.1889232094471702, "learning_rate": 2.5471320781397493e-06, "loss": 0.1276, "step": 9155 }, { "epoch": 3.96, "grad_norm": 1.677934768849866, "learning_rate": 2.537086613825981e-06, "loss": 0.1133, "step": 9160 }, { "epoch": 3.96, "grad_norm": 0.8772658619002762, "learning_rate": 2.5270581187659903e-06, "loss": 0.1193, "step": 9165 }, { "epoch": 3.96, "grad_norm": 1.4849426603993638, "learning_rate": 2.5170466157626803e-06, "loss": 0.1348, "step": 9170 }, { "epoch": 3.96, "grad_norm": 1.2635413147574273, "learning_rate": 2.5070521275803173e-06, "loss": 0.1225, "step": 9175 }, { "epoch": 3.97, "grad_norm": 1.921588573285143, "learning_rate": 2.4970746769444763e-06, "loss": 0.1412, "step": 9180 }, { "epoch": 3.97, "grad_norm": 2.084574262761302, "learning_rate": 2.487114286541997e-06, "loss": 0.1192, "step": 9185 }, { "epoch": 3.97, "grad_norm": 4.839684727447521, "learning_rate": 2.4771709790209163e-06, "loss": 0.0956, "step": 9190 }, { "epoch": 3.97, "grad_norm": 2.0640268839481233, "learning_rate": 2.4672447769904395e-06, "loss": 0.1107, "step": 9195 }, { "epoch": 3.97, "grad_norm": 0.8056949030488845, "learning_rate": 2.4573357030208724e-06, "loss": 0.1021, "step": 9200 }, { "epoch": 3.98, "grad_norm": 1.4232707574434897, "learning_rate": 2.447443779643576e-06, "loss": 0.0994, "step": 9205 }, { "epoch": 3.98, "grad_norm": 1.2627211187763787, "learning_rate": 2.4375690293509147e-06, "loss": 0.1016, "step": 9210 }, { "epoch": 3.98, "grad_norm": 2.7302827184383833, "learning_rate": 2.4277114745961994e-06, "loss": 0.1271, "step": 9215 }, { "epoch": 3.98, "grad_norm": 1.3872210610935318, "learning_rate": 2.4178711377936483e-06, "loss": 0.1084, "step": 9220 }, { "epoch": 3.98, "grad_norm": 2.743388377176648, "learning_rate": 2.408048041318325e-06, "loss": 0.1177, "step": 9225 }, { "epoch": 3.99, "grad_norm": 1.617957502406765, "learning_rate": 2.3982422075060942e-06, "loss": 0.1141, "step": 9230 }, { "epoch": 3.99, "grad_norm": 0.9313918034039486, "learning_rate": 2.38845365865357e-06, "loss": 0.0999, "step": 9235 }, { "epoch": 3.99, "grad_norm": 0.961739039698364, "learning_rate": 2.3786824170180577e-06, "loss": 0.1015, "step": 9240 }, { "epoch": 3.99, "grad_norm": 0.9267930669073516, "learning_rate": 2.3689285048175125e-06, "loss": 0.1093, "step": 9245 }, { "epoch": 4.0, "grad_norm": 1.2497965354801768, "learning_rate": 2.35919194423049e-06, "loss": 0.1134, "step": 9250 }, { "epoch": 4.0, "grad_norm": 0.7334404670095594, "learning_rate": 2.349472757396084e-06, "loss": 0.0949, "step": 9255 }, { "epoch": 4.0, "grad_norm": 1.5321023821904252, "learning_rate": 2.3397709664138933e-06, "loss": 0.0959, "step": 9260 }, { "epoch": 4.0, "eval_loss": 1.1934491395950317, "eval_runtime": 352.8997, "eval_samples_per_second": 21.272, "eval_steps_per_second": 0.334, "step": 9260 }, { "epoch": 4.0, "grad_norm": 1.0653990301674712, "learning_rate": 2.33008659334395e-06, "loss": 0.0982, "step": 9265 }, { "epoch": 4.0, "grad_norm": 1.3878497410391784, "learning_rate": 2.3204196602066875e-06, "loss": 0.1019, "step": 9270 }, { "epoch": 4.01, "grad_norm": 1.3837070784620797, "learning_rate": 2.310770188982886e-06, "loss": 0.1095, "step": 9275 }, { "epoch": 4.01, "grad_norm": 1.3288988915285944, "learning_rate": 2.3011382016136176e-06, "loss": 0.0963, "step": 9280 }, { "epoch": 4.01, "grad_norm": 0.5189759633457958, "learning_rate": 2.2915237200002017e-06, "loss": 0.079, "step": 9285 }, { "epoch": 4.01, "grad_norm": 0.4571507747282879, "learning_rate": 2.2819267660041488e-06, "loss": 0.1004, "step": 9290 }, { "epoch": 4.02, "grad_norm": 1.0405793385567381, "learning_rate": 2.272347361447117e-06, "loss": 0.0998, "step": 9295 }, { "epoch": 4.02, "grad_norm": 1.047472346373859, "learning_rate": 2.2627855281108637e-06, "loss": 0.0876, "step": 9300 }, { "epoch": 4.02, "grad_norm": 0.8366712173763914, "learning_rate": 2.2532412877371866e-06, "loss": 0.0993, "step": 9305 }, { "epoch": 4.02, "grad_norm": 1.2776670445610616, "learning_rate": 2.2437146620278892e-06, "loss": 0.0909, "step": 9310 }, { "epoch": 4.02, "grad_norm": 0.8102516110822815, "learning_rate": 2.23420567264471e-06, "loss": 0.0914, "step": 9315 }, { "epoch": 4.03, "grad_norm": 1.025504447258419, "learning_rate": 2.2247143412092954e-06, "loss": 0.095, "step": 9320 }, { "epoch": 4.03, "grad_norm": 1.0695140092879165, "learning_rate": 2.2152406893031375e-06, "loss": 0.0822, "step": 9325 }, { "epoch": 4.03, "grad_norm": 0.9489084492072272, "learning_rate": 2.2057847384675313e-06, "loss": 0.0793, "step": 9330 }, { "epoch": 4.03, "grad_norm": 0.411369707442681, "learning_rate": 2.196346510203522e-06, "loss": 0.0785, "step": 9335 }, { "epoch": 4.03, "grad_norm": 0.9137657417624943, "learning_rate": 2.1869260259718496e-06, "loss": 0.094, "step": 9340 }, { "epoch": 4.04, "grad_norm": 1.0826618792252996, "learning_rate": 2.1775233071929192e-06, "loss": 0.0841, "step": 9345 }, { "epoch": 4.04, "grad_norm": 1.0704564255497284, "learning_rate": 2.1681383752467322e-06, "loss": 0.1025, "step": 9350 }, { "epoch": 4.04, "grad_norm": 1.0041128774767623, "learning_rate": 2.158771251472852e-06, "loss": 0.0908, "step": 9355 }, { "epoch": 4.04, "grad_norm": 0.8948045733048677, "learning_rate": 2.1494219571703477e-06, "loss": 0.0813, "step": 9360 }, { "epoch": 4.05, "grad_norm": 1.582811230887437, "learning_rate": 2.1400905135977413e-06, "loss": 0.096, "step": 9365 }, { "epoch": 4.05, "grad_norm": 1.4642103391429275, "learning_rate": 2.130776941972974e-06, "loss": 0.0998, "step": 9370 }, { "epoch": 4.05, "grad_norm": 0.7389464692871076, "learning_rate": 2.121481263473345e-06, "loss": 0.1021, "step": 9375 }, { "epoch": 4.05, "grad_norm": 0.9025632399425323, "learning_rate": 2.112203499235472e-06, "loss": 0.091, "step": 9380 }, { "epoch": 4.05, "grad_norm": 0.9300066681199844, "learning_rate": 2.1029436703552376e-06, "loss": 0.0855, "step": 9385 }, { "epoch": 4.06, "grad_norm": 0.8485203002577559, "learning_rate": 2.0937017978877363e-06, "loss": 0.076, "step": 9390 }, { "epoch": 4.06, "grad_norm": 0.905058540351386, "learning_rate": 2.0844779028472407e-06, "loss": 0.0905, "step": 9395 }, { "epoch": 4.06, "grad_norm": 1.1361180090897958, "learning_rate": 2.075272006207144e-06, "loss": 0.0845, "step": 9400 }, { "epoch": 4.06, "grad_norm": 0.9367653552399894, "learning_rate": 2.0660841288999158e-06, "loss": 0.0797, "step": 9405 }, { "epoch": 4.06, "grad_norm": 0.918363753389822, "learning_rate": 2.0569142918170514e-06, "loss": 0.0822, "step": 9410 }, { "epoch": 4.07, "grad_norm": 1.4120591611331728, "learning_rate": 2.0477625158090242e-06, "loss": 0.0776, "step": 9415 }, { "epoch": 4.07, "grad_norm": 0.8011915390179812, "learning_rate": 2.038628821685242e-06, "loss": 0.0819, "step": 9420 }, { "epoch": 4.07, "grad_norm": 1.2772706148258295, "learning_rate": 2.029513230214001e-06, "loss": 0.0807, "step": 9425 }, { "epoch": 4.07, "grad_norm": 0.9856245196630447, "learning_rate": 2.0204157621224297e-06, "loss": 0.0789, "step": 9430 }, { "epoch": 4.08, "grad_norm": 1.0849195766702093, "learning_rate": 2.011336438096454e-06, "loss": 0.0847, "step": 9435 }, { "epoch": 4.08, "grad_norm": 0.9174908838938806, "learning_rate": 2.0022752787807354e-06, "loss": 0.0801, "step": 9440 }, { "epoch": 4.08, "grad_norm": 0.8817523981249608, "learning_rate": 1.9932323047786373e-06, "loss": 0.0814, "step": 9445 }, { "epoch": 4.08, "grad_norm": 0.7006834202212598, "learning_rate": 1.984207536652174e-06, "loss": 0.0724, "step": 9450 }, { "epoch": 4.08, "grad_norm": 0.9119370518619543, "learning_rate": 1.975200994921961e-06, "loss": 0.0753, "step": 9455 }, { "epoch": 4.09, "grad_norm": 0.6143322482587131, "learning_rate": 1.9662127000671738e-06, "loss": 0.0905, "step": 9460 }, { "epoch": 4.09, "grad_norm": 1.1511015120231334, "learning_rate": 1.9572426725254902e-06, "loss": 0.1014, "step": 9465 }, { "epoch": 4.09, "grad_norm": 0.6188993889727594, "learning_rate": 1.9482909326930586e-06, "loss": 0.0747, "step": 9470 }, { "epoch": 4.09, "grad_norm": 1.3750071264768102, "learning_rate": 1.9393575009244437e-06, "loss": 0.0899, "step": 9475 }, { "epoch": 4.1, "grad_norm": 1.2496684123687336, "learning_rate": 1.930442397532578e-06, "loss": 0.0851, "step": 9480 }, { "epoch": 4.1, "grad_norm": 1.3788685782832, "learning_rate": 1.9215456427887257e-06, "loss": 0.0787, "step": 9485 }, { "epoch": 4.1, "grad_norm": 0.792384806701021, "learning_rate": 1.912667256922418e-06, "loss": 0.0767, "step": 9490 }, { "epoch": 4.1, "grad_norm": 0.6332410126137582, "learning_rate": 1.9038072601214285e-06, "loss": 0.0763, "step": 9495 }, { "epoch": 4.1, "grad_norm": 0.7226336577944539, "learning_rate": 1.8949656725317144e-06, "loss": 0.0907, "step": 9500 }, { "epoch": 4.11, "grad_norm": 0.8517719067216786, "learning_rate": 1.8861425142573742e-06, "loss": 0.0793, "step": 9505 }, { "epoch": 4.11, "grad_norm": 0.7481972063779955, "learning_rate": 1.877337805360604e-06, "loss": 0.0822, "step": 9510 }, { "epoch": 4.11, "grad_norm": 1.2484833698888498, "learning_rate": 1.8685515658616404e-06, "loss": 0.0834, "step": 9515 }, { "epoch": 4.11, "grad_norm": 0.5698463781012858, "learning_rate": 1.8597838157387339e-06, "loss": 0.0817, "step": 9520 }, { "epoch": 4.11, "grad_norm": 0.9409277359440581, "learning_rate": 1.851034574928091e-06, "loss": 0.0747, "step": 9525 }, { "epoch": 4.12, "grad_norm": 0.590114308975083, "learning_rate": 1.8423038633238322e-06, "loss": 0.0787, "step": 9530 }, { "epoch": 4.12, "grad_norm": 1.0433688110927457, "learning_rate": 1.8335917007779403e-06, "loss": 0.0789, "step": 9535 }, { "epoch": 4.12, "grad_norm": 1.5691971033584857, "learning_rate": 1.824898107100227e-06, "loss": 0.0957, "step": 9540 }, { "epoch": 4.12, "grad_norm": 0.6829735197372712, "learning_rate": 1.816223102058282e-06, "loss": 0.0688, "step": 9545 }, { "epoch": 4.13, "grad_norm": 1.1699567655757332, "learning_rate": 1.8075667053774248e-06, "loss": 0.0761, "step": 9550 }, { "epoch": 4.13, "grad_norm": 0.8808444999201729, "learning_rate": 1.7989289367406682e-06, "loss": 0.0857, "step": 9555 }, { "epoch": 4.13, "grad_norm": 0.7162116245902845, "learning_rate": 1.790309815788659e-06, "loss": 0.0695, "step": 9560 }, { "epoch": 4.13, "grad_norm": 0.754845419443789, "learning_rate": 1.78170936211965e-06, "loss": 0.0741, "step": 9565 }, { "epoch": 4.13, "grad_norm": 0.5516335707135073, "learning_rate": 1.7731275952894488e-06, "loss": 0.0727, "step": 9570 }, { "epoch": 4.14, "grad_norm": 0.523203245900932, "learning_rate": 1.7645645348113683e-06, "loss": 0.0729, "step": 9575 }, { "epoch": 4.14, "grad_norm": 0.7940801541327239, "learning_rate": 1.756020200156191e-06, "loss": 0.0738, "step": 9580 }, { "epoch": 4.14, "grad_norm": 0.4578960285809017, "learning_rate": 1.7474946107521152e-06, "loss": 0.0746, "step": 9585 }, { "epoch": 4.14, "grad_norm": 0.8322294793091983, "learning_rate": 1.7389877859847171e-06, "loss": 0.0787, "step": 9590 }, { "epoch": 4.14, "grad_norm": 0.5550793793378821, "learning_rate": 1.7304997451969153e-06, "loss": 0.0727, "step": 9595 }, { "epoch": 4.15, "grad_norm": 0.7568868583227343, "learning_rate": 1.722030507688902e-06, "loss": 0.0783, "step": 9600 }, { "epoch": 4.15, "grad_norm": 0.7148668063423238, "learning_rate": 1.713580092718129e-06, "loss": 0.0751, "step": 9605 }, { "epoch": 4.15, "grad_norm": 0.6017299148369583, "learning_rate": 1.7051485194992335e-06, "loss": 0.0675, "step": 9610 }, { "epoch": 4.15, "grad_norm": 0.7850003228031432, "learning_rate": 1.6967358072040208e-06, "loss": 0.0701, "step": 9615 }, { "epoch": 4.16, "grad_norm": 0.6014246832157257, "learning_rate": 1.6883419749614137e-06, "loss": 0.0716, "step": 9620 }, { "epoch": 4.16, "grad_norm": 0.7364660205675014, "learning_rate": 1.6799670418573932e-06, "loss": 0.0715, "step": 9625 }, { "epoch": 4.16, "grad_norm": 0.7778637332406131, "learning_rate": 1.6716110269349782e-06, "loss": 0.0728, "step": 9630 }, { "epoch": 4.16, "grad_norm": 0.5496082761498826, "learning_rate": 1.6632739491941608e-06, "loss": 0.0851, "step": 9635 }, { "epoch": 4.16, "grad_norm": 1.039006198166973, "learning_rate": 1.654955827591881e-06, "loss": 0.0663, "step": 9640 }, { "epoch": 4.17, "grad_norm": 1.1424991869818257, "learning_rate": 1.6466566810419793e-06, "loss": 0.0778, "step": 9645 }, { "epoch": 4.17, "grad_norm": 0.6614813628662064, "learning_rate": 1.6383765284151398e-06, "loss": 0.0807, "step": 9650 }, { "epoch": 4.17, "grad_norm": 1.0752557823535098, "learning_rate": 1.6301153885388688e-06, "loss": 0.0752, "step": 9655 }, { "epoch": 4.17, "grad_norm": 1.158097069047512, "learning_rate": 1.6218732801974302e-06, "loss": 0.0829, "step": 9660 }, { "epoch": 4.17, "grad_norm": 0.81209934506329, "learning_rate": 1.6136502221318195e-06, "loss": 0.077, "step": 9665 }, { "epoch": 4.18, "grad_norm": 0.6199113304485403, "learning_rate": 1.6054462330397259e-06, "loss": 0.0724, "step": 9670 }, { "epoch": 4.18, "grad_norm": 0.6901257372883581, "learning_rate": 1.5972613315754583e-06, "loss": 0.0708, "step": 9675 }, { "epoch": 4.18, "grad_norm": 0.5410878140359482, "learning_rate": 1.5890955363499416e-06, "loss": 0.0643, "step": 9680 }, { "epoch": 4.18, "grad_norm": 0.5222408854828781, "learning_rate": 1.580948865930645e-06, "loss": 0.0689, "step": 9685 }, { "epoch": 4.19, "grad_norm": 1.17027325579557, "learning_rate": 1.572821338841558e-06, "loss": 0.0836, "step": 9690 }, { "epoch": 4.19, "grad_norm": 1.206300905777825, "learning_rate": 1.5647129735631416e-06, "loss": 0.071, "step": 9695 }, { "epoch": 4.19, "grad_norm": 0.6861332362617869, "learning_rate": 1.5566237885322843e-06, "loss": 0.0816, "step": 9700 }, { "epoch": 4.19, "grad_norm": 0.9367655165575982, "learning_rate": 1.5485538021422653e-06, "loss": 0.0861, "step": 9705 }, { "epoch": 4.19, "grad_norm": 0.6259566627418062, "learning_rate": 1.540503032742703e-06, "loss": 0.0645, "step": 9710 }, { "epoch": 4.2, "grad_norm": 0.8621509722069886, "learning_rate": 1.5324714986395284e-06, "loss": 0.0761, "step": 9715 }, { "epoch": 4.2, "grad_norm": 0.9362839555402197, "learning_rate": 1.5244592180949292e-06, "loss": 0.0759, "step": 9720 }, { "epoch": 4.2, "grad_norm": 0.8468366297743869, "learning_rate": 1.5164662093273174e-06, "loss": 0.0691, "step": 9725 }, { "epoch": 4.2, "grad_norm": 1.5679881546372574, "learning_rate": 1.5084924905112853e-06, "loss": 0.0762, "step": 9730 }, { "epoch": 4.21, "grad_norm": 0.8035954051894941, "learning_rate": 1.500538079777557e-06, "loss": 0.0709, "step": 9735 }, { "epoch": 4.21, "grad_norm": 0.7669725757041786, "learning_rate": 1.4926029952129607e-06, "loss": 0.0815, "step": 9740 }, { "epoch": 4.21, "grad_norm": 0.7855723793535851, "learning_rate": 1.4846872548603787e-06, "loss": 0.086, "step": 9745 }, { "epoch": 4.21, "grad_norm": 0.2646335220272136, "learning_rate": 1.4767908767187067e-06, "loss": 0.0582, "step": 9750 }, { "epoch": 4.21, "grad_norm": 0.8482657133461957, "learning_rate": 1.468913878742818e-06, "loss": 0.0664, "step": 9755 }, { "epoch": 4.22, "grad_norm": 0.7945340610515387, "learning_rate": 1.4610562788435123e-06, "loss": 0.067, "step": 9760 }, { "epoch": 4.22, "grad_norm": 0.6109194241490044, "learning_rate": 1.4532180948874874e-06, "loss": 0.0693, "step": 9765 }, { "epoch": 4.22, "grad_norm": 0.9556555429097761, "learning_rate": 1.4453993446972925e-06, "loss": 0.0762, "step": 9770 }, { "epoch": 4.22, "grad_norm": 0.5329908247970897, "learning_rate": 1.437600046051285e-06, "loss": 0.0714, "step": 9775 }, { "epoch": 4.22, "grad_norm": 0.8007351984555957, "learning_rate": 1.4298202166836006e-06, "loss": 0.0677, "step": 9780 }, { "epoch": 4.23, "grad_norm": 0.9599502718080456, "learning_rate": 1.422059874284092e-06, "loss": 0.0798, "step": 9785 }, { "epoch": 4.23, "grad_norm": 0.8461958157094835, "learning_rate": 1.414319036498315e-06, "loss": 0.0728, "step": 9790 }, { "epoch": 4.23, "grad_norm": 0.8684313938243562, "learning_rate": 1.4065977209274706e-06, "loss": 0.0694, "step": 9795 }, { "epoch": 4.23, "grad_norm": 0.45658884011612155, "learning_rate": 1.398895945128369e-06, "loss": 0.0736, "step": 9800 }, { "epoch": 4.24, "grad_norm": 0.4987698551679359, "learning_rate": 1.3912137266133962e-06, "loss": 0.0709, "step": 9805 }, { "epoch": 4.24, "grad_norm": 0.7190456258252003, "learning_rate": 1.3835510828504573e-06, "loss": 0.0712, "step": 9810 }, { "epoch": 4.24, "grad_norm": 0.7903660619505879, "learning_rate": 1.3759080312629602e-06, "loss": 0.0735, "step": 9815 }, { "epoch": 4.24, "grad_norm": 0.4239086367212096, "learning_rate": 1.3682845892297568e-06, "loss": 0.0645, "step": 9820 }, { "epoch": 4.24, "grad_norm": 0.7296580218816523, "learning_rate": 1.3606807740851136e-06, "loss": 0.0686, "step": 9825 }, { "epoch": 4.25, "grad_norm": 1.0482753668393678, "learning_rate": 1.3530966031186698e-06, "loss": 0.0774, "step": 9830 }, { "epoch": 4.25, "grad_norm": 3.23798164205354, "learning_rate": 1.3455320935753924e-06, "loss": 0.0751, "step": 9835 }, { "epoch": 4.25, "grad_norm": 0.7752045430553508, "learning_rate": 1.337987262655548e-06, "loss": 0.0716, "step": 9840 }, { "epoch": 4.25, "grad_norm": 1.0027667237986686, "learning_rate": 1.3304621275146546e-06, "loss": 0.0724, "step": 9845 }, { "epoch": 4.25, "grad_norm": 0.5926695037422461, "learning_rate": 1.3229567052634461e-06, "loss": 0.0601, "step": 9850 }, { "epoch": 4.26, "grad_norm": 0.6538303894841858, "learning_rate": 1.3154710129678372e-06, "loss": 0.0612, "step": 9855 }, { "epoch": 4.26, "grad_norm": 0.4649486420264303, "learning_rate": 1.308005067648872e-06, "loss": 0.0684, "step": 9860 }, { "epoch": 4.26, "grad_norm": 0.8426888212283112, "learning_rate": 1.300558886282699e-06, "loss": 0.0833, "step": 9865 }, { "epoch": 4.26, "grad_norm": 0.6257879916376212, "learning_rate": 1.2931324858005268e-06, "loss": 0.0634, "step": 9870 }, { "epoch": 4.27, "grad_norm": 0.7619274786441721, "learning_rate": 1.2857258830885865e-06, "loss": 0.0739, "step": 9875 }, { "epoch": 4.27, "grad_norm": 0.714597826232532, "learning_rate": 1.2783390949880947e-06, "loss": 0.0616, "step": 9880 }, { "epoch": 4.27, "grad_norm": 0.7202242150486695, "learning_rate": 1.2709721382952055e-06, "loss": 0.0626, "step": 9885 }, { "epoch": 4.27, "grad_norm": 0.41782547564555067, "learning_rate": 1.2636250297609876e-06, "loss": 0.0671, "step": 9890 }, { "epoch": 4.27, "grad_norm": 0.46816271815456095, "learning_rate": 1.2562977860913762e-06, "loss": 0.0689, "step": 9895 }, { "epoch": 4.28, "grad_norm": 1.0028729164346737, "learning_rate": 1.2489904239471384e-06, "loss": 0.0683, "step": 9900 }, { "epoch": 4.28, "grad_norm": 0.8422597030333709, "learning_rate": 1.2417029599438356e-06, "loss": 0.0718, "step": 9905 }, { "epoch": 4.28, "grad_norm": 1.1068122147961652, "learning_rate": 1.2344354106517786e-06, "loss": 0.0738, "step": 9910 }, { "epoch": 4.28, "grad_norm": 0.5512640293205736, "learning_rate": 1.2271877925960019e-06, "loss": 0.0708, "step": 9915 }, { "epoch": 4.29, "grad_norm": 0.6105033804061096, "learning_rate": 1.2199601222562196e-06, "loss": 0.0619, "step": 9920 }, { "epoch": 4.29, "grad_norm": 0.4856188866921057, "learning_rate": 1.2127524160667869e-06, "loss": 0.0685, "step": 9925 }, { "epoch": 4.29, "grad_norm": 0.9256462061093675, "learning_rate": 1.2055646904166652e-06, "loss": 0.0604, "step": 9930 }, { "epoch": 4.29, "grad_norm": 0.7952559019318669, "learning_rate": 1.1983969616493818e-06, "loss": 0.0677, "step": 9935 }, { "epoch": 4.29, "grad_norm": 0.5933272207096911, "learning_rate": 1.1912492460629964e-06, "loss": 0.0586, "step": 9940 }, { "epoch": 4.3, "grad_norm": 0.7335180054141032, "learning_rate": 1.1841215599100643e-06, "loss": 0.0896, "step": 9945 }, { "epoch": 4.3, "grad_norm": 0.5807708318876802, "learning_rate": 1.1770139193975938e-06, "loss": 0.0669, "step": 9950 }, { "epoch": 4.3, "grad_norm": 3.2465728542332126, "learning_rate": 1.1699263406870198e-06, "loss": 0.0747, "step": 9955 }, { "epoch": 4.3, "grad_norm": 0.9376095352267071, "learning_rate": 1.1628588398941498e-06, "loss": 0.0709, "step": 9960 }, { "epoch": 4.3, "grad_norm": 0.8747907603190435, "learning_rate": 1.155811433089148e-06, "loss": 0.0723, "step": 9965 }, { "epoch": 4.31, "grad_norm": 0.720029499713049, "learning_rate": 1.1487841362964835e-06, "loss": 0.0683, "step": 9970 }, { "epoch": 4.31, "grad_norm": 0.7103885214539689, "learning_rate": 1.1417769654949006e-06, "loss": 0.0711, "step": 9975 }, { "epoch": 4.31, "grad_norm": 0.5180032364147855, "learning_rate": 1.1347899366173841e-06, "loss": 0.0582, "step": 9980 }, { "epoch": 4.31, "grad_norm": 0.8132409849677699, "learning_rate": 1.1278230655511103e-06, "loss": 0.0675, "step": 9985 }, { "epoch": 4.32, "grad_norm": 0.24139115913417963, "learning_rate": 1.1208763681374292e-06, "loss": 0.0637, "step": 9990 }, { "epoch": 4.32, "grad_norm": 1.0353499843369223, "learning_rate": 1.1139498601718169e-06, "loss": 0.0717, "step": 9995 }, { "epoch": 4.32, "grad_norm": 1.2256119940521908, "learning_rate": 1.107043557403843e-06, "loss": 0.0805, "step": 10000 }, { "epoch": 4.32, "grad_norm": 0.5239300408769509, "learning_rate": 1.1001574755371359e-06, "loss": 0.0775, "step": 10005 }, { "epoch": 4.32, "grad_norm": 0.7855124071489255, "learning_rate": 1.0932916302293406e-06, "loss": 0.065, "step": 10010 }, { "epoch": 4.33, "grad_norm": 0.704823659963824, "learning_rate": 1.0864460370920904e-06, "loss": 0.0712, "step": 10015 }, { "epoch": 4.33, "grad_norm": 0.5511957029973897, "learning_rate": 1.079620711690973e-06, "loss": 0.0667, "step": 10020 }, { "epoch": 4.33, "grad_norm": 0.4287620497702917, "learning_rate": 1.072815669545485e-06, "loss": 0.0638, "step": 10025 }, { "epoch": 4.33, "grad_norm": 0.8820143962824251, "learning_rate": 1.066030926129008e-06, "loss": 0.0702, "step": 10030 }, { "epoch": 4.33, "grad_norm": 0.3713151760478781, "learning_rate": 1.0592664968687627e-06, "loss": 0.0737, "step": 10035 }, { "epoch": 4.34, "grad_norm": 0.9403215381824361, "learning_rate": 1.0525223971457821e-06, "loss": 0.0643, "step": 10040 }, { "epoch": 4.34, "grad_norm": 0.7591698484596258, "learning_rate": 1.0457986422948752e-06, "loss": 0.0671, "step": 10045 }, { "epoch": 4.34, "grad_norm": 0.6323614044936294, "learning_rate": 1.039095247604589e-06, "loss": 0.0681, "step": 10050 }, { "epoch": 4.34, "grad_norm": 0.8848119028119328, "learning_rate": 1.0324122283171767e-06, "loss": 0.0765, "step": 10055 }, { "epoch": 4.35, "grad_norm": 0.8298278075654341, "learning_rate": 1.025749599628556e-06, "loss": 0.0641, "step": 10060 }, { "epoch": 4.35, "grad_norm": 0.5541866046706723, "learning_rate": 1.0191073766882875e-06, "loss": 0.0566, "step": 10065 }, { "epoch": 4.35, "grad_norm": 0.6386188598418999, "learning_rate": 1.012485574599531e-06, "loss": 0.0654, "step": 10070 }, { "epoch": 4.35, "grad_norm": 0.4624781345723369, "learning_rate": 1.005884208419009e-06, "loss": 0.069, "step": 10075 }, { "epoch": 4.35, "grad_norm": 0.8121132787817534, "learning_rate": 9.993032931569858e-07, "loss": 0.0652, "step": 10080 }, { "epoch": 4.36, "grad_norm": 0.9607080496359577, "learning_rate": 9.927428437772113e-07, "loss": 0.0673, "step": 10085 }, { "epoch": 4.36, "grad_norm": 0.627392336343592, "learning_rate": 9.862028751969099e-07, "loss": 0.073, "step": 10090 }, { "epoch": 4.36, "grad_norm": 0.7200512938130613, "learning_rate": 9.79683402286734e-07, "loss": 0.0661, "step": 10095 }, { "epoch": 4.36, "grad_norm": 0.5809004286224504, "learning_rate": 9.731844398707313e-07, "loss": 0.0645, "step": 10100 }, { "epoch": 4.37, "grad_norm": 0.4701499769785015, "learning_rate": 9.667060027263152e-07, "loss": 0.0624, "step": 10105 }, { "epoch": 4.37, "grad_norm": 1.0412408044015375, "learning_rate": 9.602481055842228e-07, "loss": 0.0706, "step": 10110 }, { "epoch": 4.37, "grad_norm": 0.7109713416597109, "learning_rate": 9.538107631284943e-07, "loss": 0.0654, "step": 10115 }, { "epoch": 4.37, "grad_norm": 0.4148240501136039, "learning_rate": 9.47393989996428e-07, "loss": 0.0673, "step": 10120 }, { "epoch": 4.37, "grad_norm": 0.49130418610183135, "learning_rate": 9.409978007785514e-07, "loss": 0.0661, "step": 10125 }, { "epoch": 4.38, "grad_norm": 0.4060476502578717, "learning_rate": 9.346222100185931e-07, "loss": 0.0564, "step": 10130 }, { "epoch": 4.38, "grad_norm": 0.7474394992207792, "learning_rate": 9.282672322134367e-07, "loss": 0.0599, "step": 10135 }, { "epoch": 4.38, "grad_norm": 0.42235346298794035, "learning_rate": 9.219328818131013e-07, "loss": 0.0652, "step": 10140 }, { "epoch": 4.38, "grad_norm": 0.6853745391337568, "learning_rate": 9.15619173220702e-07, "loss": 0.0674, "step": 10145 }, { "epoch": 4.38, "grad_norm": 0.4667409504567176, "learning_rate": 9.093261207924175e-07, "loss": 0.0695, "step": 10150 }, { "epoch": 4.39, "grad_norm": 0.5579367649666931, "learning_rate": 9.030537388374627e-07, "loss": 0.0659, "step": 10155 }, { "epoch": 4.39, "grad_norm": 0.6908953456786018, "learning_rate": 8.968020416180434e-07, "loss": 0.0659, "step": 10160 }, { "epoch": 4.39, "grad_norm": 0.9183434314862593, "learning_rate": 8.905710433493375e-07, "loss": 0.0778, "step": 10165 }, { "epoch": 4.39, "grad_norm": 0.4109391359251177, "learning_rate": 8.843607581994585e-07, "loss": 0.0722, "step": 10170 }, { "epoch": 4.4, "grad_norm": 1.4316872445801794, "learning_rate": 8.781712002894205e-07, "loss": 0.0696, "step": 10175 }, { "epoch": 4.4, "grad_norm": 0.5923901753548628, "learning_rate": 8.720023836931079e-07, "loss": 0.067, "step": 10180 }, { "epoch": 4.4, "grad_norm": 0.5945036394746386, "learning_rate": 8.658543224372407e-07, "loss": 0.0605, "step": 10185 }, { "epoch": 4.4, "grad_norm": 0.7776265459380092, "learning_rate": 8.597270305013483e-07, "loss": 0.0739, "step": 10190 }, { "epoch": 4.4, "grad_norm": 0.5753102906059375, "learning_rate": 8.536205218177351e-07, "loss": 0.0649, "step": 10195 }, { "epoch": 4.41, "grad_norm": 0.7139209337155082, "learning_rate": 8.475348102714464e-07, "loss": 0.0743, "step": 10200 }, { "epoch": 4.41, "grad_norm": 0.3196259522914314, "learning_rate": 8.414699097002422e-07, "loss": 0.0727, "step": 10205 }, { "epoch": 4.41, "grad_norm": 0.41683002866071117, "learning_rate": 8.354258338945542e-07, "loss": 0.0588, "step": 10210 }, { "epoch": 4.41, "grad_norm": 0.850604799529318, "learning_rate": 8.294025965974717e-07, "loss": 0.0652, "step": 10215 }, { "epoch": 4.41, "grad_norm": 0.5300497819554433, "learning_rate": 8.234002115046957e-07, "loss": 0.0647, "step": 10220 }, { "epoch": 4.42, "grad_norm": 0.5035451796576669, "learning_rate": 8.174186922645155e-07, "loss": 0.0696, "step": 10225 }, { "epoch": 4.42, "grad_norm": 0.380831280980267, "learning_rate": 8.11458052477776e-07, "loss": 0.0647, "step": 10230 }, { "epoch": 4.42, "grad_norm": 0.7292244058285478, "learning_rate": 8.055183056978421e-07, "loss": 0.0664, "step": 10235 }, { "epoch": 4.42, "grad_norm": 0.8310509794310577, "learning_rate": 7.99599465430575e-07, "loss": 0.0673, "step": 10240 }, { "epoch": 4.43, "grad_norm": 1.0816929263590291, "learning_rate": 7.937015451342999e-07, "loss": 0.0766, "step": 10245 }, { "epoch": 4.43, "grad_norm": 0.683497006037796, "learning_rate": 7.878245582197718e-07, "loss": 0.0615, "step": 10250 }, { "epoch": 4.43, "grad_norm": 0.6580740484054255, "learning_rate": 7.819685180501502e-07, "loss": 0.0583, "step": 10255 }, { "epoch": 4.43, "grad_norm": 0.6176177788270545, "learning_rate": 7.761334379409569e-07, "loss": 0.0602, "step": 10260 }, { "epoch": 4.43, "grad_norm": 0.5662803463718628, "learning_rate": 7.703193311600654e-07, "loss": 0.0712, "step": 10265 }, { "epoch": 4.44, "grad_norm": 0.6550414709715258, "learning_rate": 7.64526210927653e-07, "loss": 0.0621, "step": 10270 }, { "epoch": 4.44, "grad_norm": 0.342545878900588, "learning_rate": 7.587540904161794e-07, "loss": 0.0657, "step": 10275 }, { "epoch": 4.44, "grad_norm": 0.5255994175005089, "learning_rate": 7.530029827503593e-07, "loss": 0.0672, "step": 10280 }, { "epoch": 4.44, "grad_norm": 0.4457783191130157, "learning_rate": 7.472729010071178e-07, "loss": 0.061, "step": 10285 }, { "epoch": 4.44, "grad_norm": 0.41693577159351586, "learning_rate": 7.415638582155804e-07, "loss": 0.0616, "step": 10290 }, { "epoch": 4.45, "grad_norm": 0.5062588412348282, "learning_rate": 7.35875867357031e-07, "loss": 0.07, "step": 10295 }, { "epoch": 4.45, "grad_norm": 0.7012064353284458, "learning_rate": 7.302089413648838e-07, "loss": 0.0662, "step": 10300 }, { "epoch": 4.45, "grad_norm": 0.49602914879558746, "learning_rate": 7.245630931246606e-07, "loss": 0.0709, "step": 10305 }, { "epoch": 4.45, "grad_norm": 0.877101152761563, "learning_rate": 7.189383354739476e-07, "loss": 0.0746, "step": 10310 }, { "epoch": 4.46, "grad_norm": 0.6109331747310377, "learning_rate": 7.133346812023823e-07, "loss": 0.0619, "step": 10315 }, { "epoch": 4.46, "grad_norm": 0.6710843159924726, "learning_rate": 7.077521430516143e-07, "loss": 0.0635, "step": 10320 }, { "epoch": 4.46, "grad_norm": 1.4984514944289056, "learning_rate": 7.0219073371528e-07, "loss": 0.0677, "step": 10325 }, { "epoch": 4.46, "grad_norm": 0.65040299798008, "learning_rate": 6.96650465838975e-07, "loss": 0.0699, "step": 10330 }, { "epoch": 4.46, "grad_norm": 0.6274345261918733, "learning_rate": 6.911313520202167e-07, "loss": 0.0735, "step": 10335 }, { "epoch": 4.47, "grad_norm": 1.0263488571321568, "learning_rate": 6.85633404808429e-07, "loss": 0.0717, "step": 10340 }, { "epoch": 4.47, "grad_norm": 0.5816865823316484, "learning_rate": 6.801566367049018e-07, "loss": 0.0673, "step": 10345 }, { "epoch": 4.47, "grad_norm": 0.48045338154681266, "learning_rate": 6.747010601627724e-07, "loss": 0.0658, "step": 10350 }, { "epoch": 4.47, "grad_norm": 0.6100745958999338, "learning_rate": 6.692666875869902e-07, "loss": 0.0762, "step": 10355 }, { "epoch": 4.48, "grad_norm": 0.6287186987071508, "learning_rate": 6.638535313342886e-07, "loss": 0.0631, "step": 10360 }, { "epoch": 4.48, "grad_norm": 0.5641903490443426, "learning_rate": 6.584616037131631e-07, "loss": 0.0686, "step": 10365 }, { "epoch": 4.48, "grad_norm": 0.8508675496237825, "learning_rate": 6.530909169838362e-07, "loss": 0.0599, "step": 10370 }, { "epoch": 4.48, "grad_norm": 0.8371094642421063, "learning_rate": 6.477414833582363e-07, "loss": 0.0668, "step": 10375 }, { "epoch": 4.48, "grad_norm": 0.6556091987202345, "learning_rate": 6.424133149999611e-07, "loss": 0.0662, "step": 10380 }, { "epoch": 4.49, "grad_norm": 0.5788970639761051, "learning_rate": 6.371064240242586e-07, "loss": 0.0737, "step": 10385 }, { "epoch": 4.49, "grad_norm": 0.9576674573709517, "learning_rate": 6.31820822497996e-07, "loss": 0.0725, "step": 10390 }, { "epoch": 4.49, "grad_norm": 1.2339994955128544, "learning_rate": 6.265565224396319e-07, "loss": 0.0633, "step": 10395 }, { "epoch": 4.49, "grad_norm": 0.7727346443142231, "learning_rate": 6.21313535819189e-07, "loss": 0.0599, "step": 10400 }, { "epoch": 4.49, "grad_norm": 0.5883563416250307, "learning_rate": 6.160918745582268e-07, "loss": 0.0587, "step": 10405 }, { "epoch": 4.5, "grad_norm": 0.6543057962328555, "learning_rate": 6.108915505298141e-07, "loss": 0.065, "step": 10410 }, { "epoch": 4.5, "grad_norm": 0.45099270153584864, "learning_rate": 6.057125755585069e-07, "loss": 0.0617, "step": 10415 }, { "epoch": 4.5, "grad_norm": 0.6813172291762168, "learning_rate": 6.005549614203144e-07, "loss": 0.0711, "step": 10420 }, { "epoch": 4.5, "grad_norm": 0.42037421073670783, "learning_rate": 5.954187198426764e-07, "loss": 0.0664, "step": 10425 }, { "epoch": 4.51, "grad_norm": 0.7336243689992468, "learning_rate": 5.903038625044332e-07, "loss": 0.0708, "step": 10430 }, { "epoch": 4.51, "grad_norm": 0.8346702406824394, "learning_rate": 5.852104010358039e-07, "loss": 0.0615, "step": 10435 }, { "epoch": 4.51, "grad_norm": 0.8369876118704074, "learning_rate": 5.801383470183564e-07, "loss": 0.0607, "step": 10440 }, { "epoch": 4.51, "grad_norm": 0.7884787540617048, "learning_rate": 5.75087711984983e-07, "loss": 0.0612, "step": 10445 }, { "epoch": 4.51, "grad_norm": 0.9952443752024916, "learning_rate": 5.700585074198739e-07, "loss": 0.0622, "step": 10450 }, { "epoch": 4.52, "grad_norm": 0.7486367420029569, "learning_rate": 5.650507447584863e-07, "loss": 0.0682, "step": 10455 }, { "epoch": 4.52, "grad_norm": 0.5399364925124402, "learning_rate": 5.600644353875273e-07, "loss": 0.0704, "step": 10460 }, { "epoch": 4.52, "grad_norm": 0.7213995163485463, "learning_rate": 5.550995906449208e-07, "loss": 0.064, "step": 10465 }, { "epoch": 4.52, "grad_norm": 0.5658492332641553, "learning_rate": 5.501562218197853e-07, "loss": 0.0629, "step": 10470 }, { "epoch": 4.52, "grad_norm": 0.3714861481274943, "learning_rate": 5.452343401524085e-07, "loss": 0.0627, "step": 10475 }, { "epoch": 4.53, "grad_norm": 1.0177022722601001, "learning_rate": 5.403339568342158e-07, "loss": 0.0729, "step": 10480 }, { "epoch": 4.53, "grad_norm": 0.6677804981240922, "learning_rate": 5.354550830077521e-07, "loss": 0.0543, "step": 10485 }, { "epoch": 4.53, "grad_norm": 0.3341148617290627, "learning_rate": 5.305977297666553e-07, "loss": 0.0636, "step": 10490 }, { "epoch": 4.53, "grad_norm": 0.5413680488310173, "learning_rate": 5.257619081556276e-07, "loss": 0.0613, "step": 10495 }, { "epoch": 4.54, "grad_norm": 0.6914227311788763, "learning_rate": 5.209476291704152e-07, "loss": 0.0593, "step": 10500 }, { "epoch": 4.54, "grad_norm": 0.6314441804068395, "learning_rate": 5.16154903757774e-07, "loss": 0.0682, "step": 10505 }, { "epoch": 4.54, "grad_norm": 0.5451852735147008, "learning_rate": 5.113837428154567e-07, "loss": 0.0585, "step": 10510 }, { "epoch": 4.54, "grad_norm": 0.759423987615597, "learning_rate": 5.066341571921818e-07, "loss": 0.0654, "step": 10515 }, { "epoch": 4.54, "grad_norm": 0.3161641862772605, "learning_rate": 5.019061576876094e-07, "loss": 0.0651, "step": 10520 }, { "epoch": 4.55, "grad_norm": 0.7204327028723595, "learning_rate": 4.971997550523156e-07, "loss": 0.0604, "step": 10525 }, { "epoch": 4.55, "grad_norm": 0.6399016153777878, "learning_rate": 4.925149599877699e-07, "loss": 0.0653, "step": 10530 }, { "epoch": 4.55, "grad_norm": 0.5888152115518005, "learning_rate": 4.878517831463092e-07, "loss": 0.0692, "step": 10535 }, { "epoch": 4.55, "grad_norm": 1.1000748717903832, "learning_rate": 4.832102351311175e-07, "loss": 0.0775, "step": 10540 }, { "epoch": 4.56, "grad_norm": 0.833611260996263, "learning_rate": 4.785903264961967e-07, "loss": 0.0654, "step": 10545 }, { "epoch": 4.56, "grad_norm": 0.41303077294718554, "learning_rate": 4.739920677463461e-07, "loss": 0.0688, "step": 10550 }, { "epoch": 4.56, "grad_norm": 0.53144152081788, "learning_rate": 4.6941546933713553e-07, "loss": 0.0535, "step": 10555 }, { "epoch": 4.56, "grad_norm": 0.3593735462606911, "learning_rate": 4.6486054167488414e-07, "loss": 0.0688, "step": 10560 }, { "epoch": 4.56, "grad_norm": 0.5132813504324446, "learning_rate": 4.603272951166371e-07, "loss": 0.0598, "step": 10565 }, { "epoch": 4.57, "grad_norm": 0.37892051369945495, "learning_rate": 4.558157399701379e-07, "loss": 0.0631, "step": 10570 }, { "epoch": 4.57, "grad_norm": 0.7363528376552387, "learning_rate": 4.5132588649381413e-07, "loss": 0.0606, "step": 10575 }, { "epoch": 4.57, "grad_norm": 0.628861659000855, "learning_rate": 4.4685774489673926e-07, "loss": 0.0652, "step": 10580 }, { "epoch": 4.57, "grad_norm": 0.4642134473139565, "learning_rate": 4.4241132533862306e-07, "loss": 0.0599, "step": 10585 }, { "epoch": 4.57, "grad_norm": 0.4240911790644064, "learning_rate": 4.3798663792978367e-07, "loss": 0.0653, "step": 10590 }, { "epoch": 4.58, "grad_norm": 0.5919110283678517, "learning_rate": 4.335836927311232e-07, "loss": 0.0676, "step": 10595 }, { "epoch": 4.58, "grad_norm": 0.6943140171165241, "learning_rate": 4.292024997541078e-07, "loss": 0.0603, "step": 10600 }, { "epoch": 4.58, "grad_norm": 0.6557567667966683, "learning_rate": 4.248430689607397e-07, "loss": 0.0611, "step": 10605 }, { "epoch": 4.58, "grad_norm": 0.9732091473705298, "learning_rate": 4.205054102635409e-07, "loss": 0.067, "step": 10610 }, { "epoch": 4.59, "grad_norm": 0.6632492498420535, "learning_rate": 4.1618953352552616e-07, "loss": 0.0581, "step": 10615 }, { "epoch": 4.59, "grad_norm": 0.5304776663139763, "learning_rate": 4.118954485601845e-07, "loss": 0.059, "step": 10620 }, { "epoch": 4.59, "grad_norm": 0.5358707936464782, "learning_rate": 4.0762316513145436e-07, "loss": 0.0514, "step": 10625 }, { "epoch": 4.59, "grad_norm": 0.5362570016989209, "learning_rate": 4.033726929536974e-07, "loss": 0.0593, "step": 10630 }, { "epoch": 4.59, "grad_norm": 0.5513726440823109, "learning_rate": 3.991440416916836e-07, "loss": 0.0649, "step": 10635 }, { "epoch": 4.6, "grad_norm": 0.6461092576242113, "learning_rate": 3.9493722096056844e-07, "loss": 0.0601, "step": 10640 }, { "epoch": 4.6, "grad_norm": 0.38614596195664247, "learning_rate": 3.907522403258657e-07, "loss": 0.0544, "step": 10645 }, { "epoch": 4.6, "grad_norm": 0.9317175798093095, "learning_rate": 3.865891093034313e-07, "loss": 0.0807, "step": 10650 }, { "epoch": 4.6, "grad_norm": 0.4708803679522015, "learning_rate": 3.824478373594342e-07, "loss": 0.0625, "step": 10655 }, { "epoch": 4.6, "grad_norm": 0.5846023860363823, "learning_rate": 3.7832843391034766e-07, "loss": 0.0617, "step": 10660 }, { "epoch": 4.61, "grad_norm": 0.8074441691925042, "learning_rate": 3.742309083229134e-07, "loss": 0.0756, "step": 10665 }, { "epoch": 4.61, "grad_norm": 0.7419611525652908, "learning_rate": 3.7015526991413195e-07, "loss": 0.0681, "step": 10670 }, { "epoch": 4.61, "grad_norm": 0.49521738757182165, "learning_rate": 3.661015279512337e-07, "loss": 0.059, "step": 10675 }, { "epoch": 4.61, "grad_norm": 0.6363196159249647, "learning_rate": 3.620696916516597e-07, "loss": 0.0675, "step": 10680 }, { "epoch": 4.62, "grad_norm": 0.648756517846174, "learning_rate": 3.580597701830446e-07, "loss": 0.0564, "step": 10685 }, { "epoch": 4.62, "grad_norm": 0.22145343101134532, "learning_rate": 3.540717726631915e-07, "loss": 0.0657, "step": 10690 }, { "epoch": 4.62, "grad_norm": 0.745216261651182, "learning_rate": 3.5010570816005227e-07, "loss": 0.0617, "step": 10695 }, { "epoch": 4.62, "grad_norm": 0.5619150318693038, "learning_rate": 3.4616158569170887e-07, "loss": 0.0639, "step": 10700 }, { "epoch": 4.62, "grad_norm": 0.9913195001224414, "learning_rate": 3.4223941422634856e-07, "loss": 0.0707, "step": 10705 }, { "epoch": 4.63, "grad_norm": 0.9018805551119227, "learning_rate": 3.383392026822463e-07, "loss": 0.0768, "step": 10710 }, { "epoch": 4.63, "grad_norm": 0.5561922448862705, "learning_rate": 3.344609599277482e-07, "loss": 0.0529, "step": 10715 }, { "epoch": 4.63, "grad_norm": 0.6365875808087165, "learning_rate": 3.306046947812447e-07, "loss": 0.0689, "step": 10720 }, { "epoch": 4.63, "grad_norm": 0.7609070256401496, "learning_rate": 3.267704160111529e-07, "loss": 0.0623, "step": 10725 }, { "epoch": 4.63, "grad_norm": 0.9504410302089583, "learning_rate": 3.2295813233589544e-07, "loss": 0.0588, "step": 10730 }, { "epoch": 4.64, "grad_norm": 0.7288968981026365, "learning_rate": 3.191678524238884e-07, "loss": 0.0659, "step": 10735 }, { "epoch": 4.64, "grad_norm": 0.7518592308885353, "learning_rate": 3.1539958489350875e-07, "loss": 0.0751, "step": 10740 }, { "epoch": 4.64, "grad_norm": 0.24201120306356882, "learning_rate": 3.1165333831308487e-07, "loss": 0.0592, "step": 10745 }, { "epoch": 4.64, "grad_norm": 0.5676494509868415, "learning_rate": 3.0792912120087394e-07, "loss": 0.0596, "step": 10750 }, { "epoch": 4.65, "grad_norm": 0.47676863842198597, "learning_rate": 3.04226942025041e-07, "loss": 0.0588, "step": 10755 }, { "epoch": 4.65, "grad_norm": 0.48666045227889837, "learning_rate": 3.005468092036401e-07, "loss": 0.0661, "step": 10760 }, { "epoch": 4.65, "grad_norm": 0.5972850094581258, "learning_rate": 2.968887311045998e-07, "loss": 0.0547, "step": 10765 }, { "epoch": 4.65, "grad_norm": 0.9857922947921498, "learning_rate": 2.932527160456955e-07, "loss": 0.0756, "step": 10770 }, { "epoch": 4.65, "grad_norm": 0.6562993291105274, "learning_rate": 2.896387722945404e-07, "loss": 0.0561, "step": 10775 }, { "epoch": 4.66, "grad_norm": 0.8811705396317993, "learning_rate": 2.8604690806855684e-07, "loss": 0.0677, "step": 10780 }, { "epoch": 4.66, "grad_norm": 0.7505002155595076, "learning_rate": 2.8247713153496504e-07, "loss": 0.062, "step": 10785 }, { "epoch": 4.66, "grad_norm": 0.5670638816746271, "learning_rate": 2.789294508107632e-07, "loss": 0.0603, "step": 10790 }, { "epoch": 4.66, "grad_norm": 0.937199394901662, "learning_rate": 2.754038739627052e-07, "loss": 0.0651, "step": 10795 }, { "epoch": 4.67, "grad_norm": 0.49139293261777545, "learning_rate": 2.7190040900728855e-07, "loss": 0.0632, "step": 10800 }, { "epoch": 4.67, "grad_norm": 0.48692569853606976, "learning_rate": 2.684190639107265e-07, "loss": 0.0651, "step": 10805 }, { "epoch": 4.67, "grad_norm": 0.49372578994517236, "learning_rate": 2.6495984658894024e-07, "loss": 0.0546, "step": 10810 }, { "epoch": 4.67, "grad_norm": 0.6689499796929367, "learning_rate": 2.6152276490753804e-07, "loss": 0.0565, "step": 10815 }, { "epoch": 4.67, "grad_norm": 0.5439861457356306, "learning_rate": 2.5810782668179156e-07, "loss": 0.0683, "step": 10820 }, { "epoch": 4.68, "grad_norm": 0.7221669120831035, "learning_rate": 2.547150396766263e-07, "loss": 0.0616, "step": 10825 }, { "epoch": 4.68, "grad_norm": 0.5912466008937799, "learning_rate": 2.513444116065966e-07, "loss": 0.066, "step": 10830 }, { "epoch": 4.68, "grad_norm": 0.41578702847781396, "learning_rate": 2.4799595013587305e-07, "loss": 0.052, "step": 10835 }, { "epoch": 4.68, "grad_norm": 0.4394587515378354, "learning_rate": 2.446696628782252e-07, "loss": 0.0572, "step": 10840 }, { "epoch": 4.68, "grad_norm": 0.5604383766879553, "learning_rate": 2.4136555739700193e-07, "loss": 0.0663, "step": 10845 }, { "epoch": 4.69, "grad_norm": 0.9935450578535743, "learning_rate": 2.3808364120511241e-07, "loss": 0.064, "step": 10850 }, { "epoch": 4.69, "grad_norm": 0.44874457386774114, "learning_rate": 2.348239217650139e-07, "loss": 0.0602, "step": 10855 }, { "epoch": 4.69, "grad_norm": 0.23246095201744477, "learning_rate": 2.3158640648869302e-07, "loss": 0.06, "step": 10860 }, { "epoch": 4.69, "grad_norm": 0.6255865619079833, "learning_rate": 2.2837110273764563e-07, "loss": 0.061, "step": 10865 }, { "epoch": 4.7, "grad_norm": 0.6191570836290404, "learning_rate": 2.2517801782286574e-07, "loss": 0.0584, "step": 10870 }, { "epoch": 4.7, "grad_norm": 0.46531391044936, "learning_rate": 2.220071590048256e-07, "loss": 0.0518, "step": 10875 }, { "epoch": 4.7, "grad_norm": 0.6023954576902755, "learning_rate": 2.1885853349345565e-07, "loss": 0.0611, "step": 10880 }, { "epoch": 4.7, "grad_norm": 0.6359453628805575, "learning_rate": 2.1573214844813673e-07, "loss": 0.065, "step": 10885 }, { "epoch": 4.7, "grad_norm": 0.6893301879412579, "learning_rate": 2.1262801097767682e-07, "loss": 0.0551, "step": 10890 }, { "epoch": 4.71, "grad_norm": 0.4997543153017668, "learning_rate": 2.095461281402966e-07, "loss": 0.0653, "step": 10895 }, { "epoch": 4.71, "grad_norm": 0.7737567709025983, "learning_rate": 2.064865069436184e-07, "loss": 0.0745, "step": 10900 }, { "epoch": 4.71, "grad_norm": 0.8763437454871517, "learning_rate": 2.0344915434463819e-07, "loss": 0.0631, "step": 10905 }, { "epoch": 4.71, "grad_norm": 0.8875366806583803, "learning_rate": 2.0043407724972265e-07, "loss": 0.0596, "step": 10910 }, { "epoch": 4.71, "grad_norm": 0.7156184277959455, "learning_rate": 1.9744128251458773e-07, "loss": 0.0627, "step": 10915 }, { "epoch": 4.72, "grad_norm": 0.5289051350934006, "learning_rate": 1.9447077694428107e-07, "loss": 0.0512, "step": 10920 }, { "epoch": 4.72, "grad_norm": 0.522043354481508, "learning_rate": 1.915225672931742e-07, "loss": 0.0575, "step": 10925 }, { "epoch": 4.72, "grad_norm": 0.5634897346612658, "learning_rate": 1.8859666026493363e-07, "loss": 0.0638, "step": 10930 }, { "epoch": 4.72, "grad_norm": 0.5283634108884022, "learning_rate": 1.8569306251252196e-07, "loss": 0.0602, "step": 10935 }, { "epoch": 4.73, "grad_norm": 0.6740764935995843, "learning_rate": 1.8281178063817017e-07, "loss": 0.0595, "step": 10940 }, { "epoch": 4.73, "grad_norm": 0.6823376637996936, "learning_rate": 1.7995282119336765e-07, "loss": 0.0722, "step": 10945 }, { "epoch": 4.73, "grad_norm": 0.6746547854494919, "learning_rate": 1.7711619067884988e-07, "loss": 0.0688, "step": 10950 }, { "epoch": 4.73, "grad_norm": 0.5892333526896758, "learning_rate": 1.7430189554457634e-07, "loss": 0.0638, "step": 10955 }, { "epoch": 4.73, "grad_norm": 0.5044180039297371, "learning_rate": 1.715099421897215e-07, "loss": 0.0606, "step": 10960 }, { "epoch": 4.74, "grad_norm": 0.49999321097375804, "learning_rate": 1.6874033696266056e-07, "loss": 0.0635, "step": 10965 }, { "epoch": 4.74, "grad_norm": 0.6194582830877933, "learning_rate": 1.6599308616095266e-07, "loss": 0.0579, "step": 10970 }, { "epoch": 4.74, "grad_norm": 0.7205882487405464, "learning_rate": 1.6326819603132537e-07, "loss": 0.0665, "step": 10975 }, { "epoch": 4.74, "grad_norm": 0.7802188871645003, "learning_rate": 1.6056567276966362e-07, "loss": 0.0778, "step": 10980 }, { "epoch": 4.75, "grad_norm": 0.6375719460013147, "learning_rate": 1.5788552252099409e-07, "loss": 0.0675, "step": 10985 }, { "epoch": 4.75, "grad_norm": 0.7194773791511878, "learning_rate": 1.552277513794731e-07, "loss": 0.0645, "step": 10990 }, { "epoch": 4.75, "grad_norm": 0.8278357416000771, "learning_rate": 1.5259236538836765e-07, "loss": 0.0628, "step": 10995 }, { "epoch": 4.75, "grad_norm": 0.45801396543779305, "learning_rate": 1.4997937054004874e-07, "loss": 0.0517, "step": 11000 }, { "epoch": 4.75, "grad_norm": 0.6376220924448733, "learning_rate": 1.4738877277597153e-07, "loss": 0.0593, "step": 11005 }, { "epoch": 4.76, "grad_norm": 0.608559541624421, "learning_rate": 1.4482057798666516e-07, "loss": 0.0554, "step": 11010 }, { "epoch": 4.76, "grad_norm": 0.5119755335379622, "learning_rate": 1.4227479201172067e-07, "loss": 0.0672, "step": 11015 }, { "epoch": 4.76, "grad_norm": 0.4490665941016147, "learning_rate": 1.397514206397721e-07, "loss": 0.0667, "step": 11020 }, { "epoch": 4.76, "grad_norm": 0.6251201055280632, "learning_rate": 1.3725046960848975e-07, "loss": 0.0625, "step": 11025 }, { "epoch": 4.76, "grad_norm": 0.6564636586318714, "learning_rate": 1.3477194460456256e-07, "loss": 0.0684, "step": 11030 }, { "epoch": 4.77, "grad_norm": 0.4745320704645207, "learning_rate": 1.3231585126368905e-07, "loss": 0.0609, "step": 11035 }, { "epoch": 4.77, "grad_norm": 0.5746810367692561, "learning_rate": 1.298821951705598e-07, "loss": 0.0595, "step": 11040 }, { "epoch": 4.77, "grad_norm": 0.5623447071438609, "learning_rate": 1.274709818588482e-07, "loss": 0.0682, "step": 11045 }, { "epoch": 4.77, "grad_norm": 0.6650477950040313, "learning_rate": 1.2508221681119869e-07, "loss": 0.0659, "step": 11050 }, { "epoch": 4.78, "grad_norm": 0.7413436883098828, "learning_rate": 1.2271590545921086e-07, "loss": 0.0726, "step": 11055 }, { "epoch": 4.78, "grad_norm": 0.8125027194289178, "learning_rate": 1.2037205318342959e-07, "loss": 0.0627, "step": 11060 }, { "epoch": 4.78, "grad_norm": 0.5557374623847701, "learning_rate": 1.1805066531332954e-07, "loss": 0.0609, "step": 11065 }, { "epoch": 4.78, "grad_norm": 0.38154583494463035, "learning_rate": 1.1575174712731063e-07, "loss": 0.0628, "step": 11070 }, { "epoch": 4.78, "grad_norm": 0.6962083608684523, "learning_rate": 1.1347530385267702e-07, "loss": 0.0603, "step": 11075 }, { "epoch": 4.79, "grad_norm": 0.5587167971298492, "learning_rate": 1.1122134066562929e-07, "loss": 0.0648, "step": 11080 }, { "epoch": 4.79, "grad_norm": 0.3344941093274932, "learning_rate": 1.0898986269125444e-07, "loss": 0.0553, "step": 11085 }, { "epoch": 4.79, "grad_norm": 0.5161742767786995, "learning_rate": 1.0678087500351041e-07, "loss": 0.0593, "step": 11090 }, { "epoch": 4.79, "grad_norm": 1.0950079374259003, "learning_rate": 1.0459438262521937e-07, "loss": 0.0719, "step": 11095 }, { "epoch": 4.79, "grad_norm": 0.6120823620469348, "learning_rate": 1.0243039052804993e-07, "loss": 0.068, "step": 11100 }, { "epoch": 4.8, "grad_norm": 0.8295518627255871, "learning_rate": 1.0028890363251165e-07, "loss": 0.0681, "step": 11105 }, { "epoch": 4.8, "grad_norm": 0.6779051073720008, "learning_rate": 9.816992680794057e-08, "loss": 0.0603, "step": 11110 }, { "epoch": 4.8, "grad_norm": 0.5656436041883544, "learning_rate": 9.607346487248926e-08, "loss": 0.0622, "step": 11115 }, { "epoch": 4.8, "grad_norm": 0.5831695643818923, "learning_rate": 9.399952259311451e-08, "loss": 0.0544, "step": 11120 }, { "epoch": 4.81, "grad_norm": 0.7699048409977948, "learning_rate": 9.194810468557191e-08, "loss": 0.0616, "step": 11125 }, { "epoch": 4.81, "grad_norm": 0.5542117866160443, "learning_rate": 8.991921581439466e-08, "loss": 0.0656, "step": 11130 }, { "epoch": 4.81, "grad_norm": 0.523700705648689, "learning_rate": 8.791286059289472e-08, "loss": 0.0536, "step": 11135 }, { "epoch": 4.81, "grad_norm": 0.42022329976259326, "learning_rate": 8.59290435831428e-08, "loss": 0.0685, "step": 11140 }, { "epoch": 4.81, "grad_norm": 0.6101833141730744, "learning_rate": 8.396776929596395e-08, "loss": 0.0637, "step": 11145 }, { "epoch": 4.82, "grad_norm": 0.5631499331780958, "learning_rate": 8.202904219092645e-08, "loss": 0.0633, "step": 11150 }, { "epoch": 4.82, "grad_norm": 0.5029707683863562, "learning_rate": 8.011286667632734e-08, "loss": 0.0554, "step": 11155 }, { "epoch": 4.82, "grad_norm": 0.2800645526623471, "learning_rate": 7.821924710918804e-08, "loss": 0.0641, "step": 11160 }, { "epoch": 4.82, "grad_norm": 0.6921504299490085, "learning_rate": 7.6348187795241e-08, "loss": 0.0663, "step": 11165 }, { "epoch": 4.83, "grad_norm": 0.5401882659743539, "learning_rate": 7.44996929889219e-08, "loss": 0.0613, "step": 11170 }, { "epoch": 4.83, "grad_norm": 0.7010012725517893, "learning_rate": 7.267376689335859e-08, "loss": 0.0614, "step": 11175 }, { "epoch": 4.83, "grad_norm": 1.170334350237117, "learning_rate": 7.087041366036107e-08, "loss": 0.0691, "step": 11180 }, { "epoch": 4.83, "grad_norm": 0.4312270382192399, "learning_rate": 6.908963739041375e-08, "loss": 0.0555, "step": 11185 }, { "epoch": 4.83, "grad_norm": 0.8744098840264921, "learning_rate": 6.73314421326643e-08, "loss": 0.0699, "step": 11190 }, { "epoch": 4.84, "grad_norm": 0.654485383566514, "learning_rate": 6.559583188491813e-08, "loss": 0.0654, "step": 11195 }, { "epoch": 4.84, "grad_norm": 0.9216834484238109, "learning_rate": 6.3882810593624e-08, "loss": 0.0615, "step": 11200 }, { "epoch": 4.84, "grad_norm": 0.38282636888860044, "learning_rate": 6.219238215386724e-08, "loss": 0.0614, "step": 11205 }, { "epoch": 4.84, "grad_norm": 0.8347631702339198, "learning_rate": 6.052455040936323e-08, "loss": 0.0654, "step": 11210 }, { "epoch": 4.84, "grad_norm": 0.5972877912821046, "learning_rate": 5.887931915244616e-08, "loss": 0.0576, "step": 11215 }, { "epoch": 4.85, "grad_norm": 0.3402597401613388, "learning_rate": 5.7256692124060266e-08, "loss": 0.0666, "step": 11220 }, { "epoch": 4.85, "grad_norm": 0.5181058757005162, "learning_rate": 5.565667301375421e-08, "loss": 0.0574, "step": 11225 }, { "epoch": 4.85, "grad_norm": 0.7291902925580718, "learning_rate": 5.407926545966669e-08, "loss": 0.0671, "step": 11230 }, { "epoch": 4.85, "grad_norm": 0.6238851287111219, "learning_rate": 5.252447304852637e-08, "loss": 0.0612, "step": 11235 }, { "epoch": 4.86, "grad_norm": 0.47992124625711946, "learning_rate": 5.0992299315634205e-08, "loss": 0.0624, "step": 11240 }, { "epoch": 4.86, "grad_norm": 0.903082591577723, "learning_rate": 4.948274774486672e-08, "loss": 0.0593, "step": 11245 }, { "epoch": 4.86, "grad_norm": 0.5865369565493873, "learning_rate": 4.7995821768656023e-08, "loss": 0.0582, "step": 11250 }, { "epoch": 4.86, "grad_norm": 0.4105720861995551, "learning_rate": 4.653152476799205e-08, "loss": 0.0679, "step": 11255 }, { "epoch": 4.86, "grad_norm": 0.5013484723911407, "learning_rate": 4.508986007241034e-08, "loss": 0.052, "step": 11260 }, { "epoch": 4.87, "grad_norm": 0.46155962018502744, "learning_rate": 4.3670830959980926e-08, "loss": 0.0592, "step": 11265 }, { "epoch": 4.87, "grad_norm": 0.3917564639266349, "learning_rate": 4.227444065730946e-08, "loss": 0.0717, "step": 11270 }, { "epoch": 4.87, "grad_norm": 0.7102335770665541, "learning_rate": 4.090069233952387e-08, "loss": 0.0592, "step": 11275 }, { "epoch": 4.87, "grad_norm": 0.9293214472850908, "learning_rate": 3.954958913026774e-08, "loss": 0.0692, "step": 11280 }, { "epoch": 4.87, "grad_norm": 0.35019330798760945, "learning_rate": 3.822113410169359e-08, "loss": 0.0517, "step": 11285 }, { "epoch": 4.88, "grad_norm": 0.6879669929612432, "learning_rate": 3.691533027445626e-08, "loss": 0.0632, "step": 11290 }, { "epoch": 4.88, "grad_norm": 0.6725965830420647, "learning_rate": 3.5632180617708454e-08, "loss": 0.0558, "step": 11295 }, { "epoch": 4.88, "grad_norm": 0.48268769982003784, "learning_rate": 3.4371688049089634e-08, "loss": 0.0612, "step": 11300 }, { "epoch": 4.88, "grad_norm": 0.5835298871058775, "learning_rate": 3.313385543472048e-08, "loss": 0.0539, "step": 11305 }, { "epoch": 4.89, "grad_norm": 0.5209353031141057, "learning_rate": 3.1918685589199524e-08, "loss": 0.0527, "step": 11310 }, { "epoch": 4.89, "grad_norm": 0.8854807664918086, "learning_rate": 3.0726181275592123e-08, "loss": 0.0646, "step": 11315 }, { "epoch": 4.89, "grad_norm": 0.6794898404276607, "learning_rate": 2.9556345205428162e-08, "loss": 0.066, "step": 11320 }, { "epoch": 4.89, "grad_norm": 0.6889118722336371, "learning_rate": 2.8409180038693197e-08, "loss": 0.0656, "step": 11325 }, { "epoch": 4.89, "grad_norm": 0.8580280186583628, "learning_rate": 2.7284688383824032e-08, "loss": 0.0717, "step": 11330 }, { "epoch": 4.9, "grad_norm": 0.44207148094124704, "learning_rate": 2.6182872797702042e-08, "loss": 0.0583, "step": 11335 }, { "epoch": 4.9, "grad_norm": 0.6360109435337133, "learning_rate": 2.5103735785647622e-08, "loss": 0.0706, "step": 11340 }, { "epoch": 4.9, "grad_norm": 0.7462959795590766, "learning_rate": 2.4047279801414634e-08, "loss": 0.0622, "step": 11345 }, { "epoch": 4.9, "grad_norm": 0.6410660993010558, "learning_rate": 2.3013507247183764e-08, "loss": 0.0549, "step": 11350 }, { "epoch": 4.9, "grad_norm": 0.5495295224336869, "learning_rate": 2.200242047355805e-08, "loss": 0.0641, "step": 11355 }, { "epoch": 4.91, "grad_norm": 0.7576999245837946, "learning_rate": 2.101402177955847e-08, "loss": 0.0597, "step": 11360 }, { "epoch": 4.91, "grad_norm": 0.3694481911504727, "learning_rate": 2.0048313412616148e-08, "loss": 0.0594, "step": 11365 }, { "epoch": 4.91, "grad_norm": 0.8524024130383686, "learning_rate": 1.910529756856905e-08, "loss": 0.0687, "step": 11370 }, { "epoch": 4.91, "grad_norm": 0.6884949477612426, "learning_rate": 1.8184976391657506e-08, "loss": 0.0626, "step": 11375 }, { "epoch": 4.92, "grad_norm": 0.4649978643711475, "learning_rate": 1.7287351974517587e-08, "loss": 0.061, "step": 11380 }, { "epoch": 4.92, "grad_norm": 0.5122065337906697, "learning_rate": 1.6412426358177747e-08, "loss": 0.0691, "step": 11385 }, { "epoch": 4.92, "grad_norm": 0.7026091205226528, "learning_rate": 1.5560201532052178e-08, "loss": 0.0581, "step": 11390 }, { "epoch": 4.92, "grad_norm": 0.554757648960058, "learning_rate": 1.4730679433940797e-08, "loss": 0.0613, "step": 11395 }, { "epoch": 4.92, "grad_norm": 0.6596515020568042, "learning_rate": 1.3923861950019268e-08, "loss": 0.0624, "step": 11400 }, { "epoch": 4.93, "grad_norm": 0.7074006847005723, "learning_rate": 1.3139750914836769e-08, "loss": 0.0737, "step": 11405 }, { "epoch": 4.93, "grad_norm": 0.3975859392319982, "learning_rate": 1.2378348111314886e-08, "loss": 0.0594, "step": 11410 }, { "epoch": 4.93, "grad_norm": 0.46001198290010065, "learning_rate": 1.1639655270738737e-08, "loss": 0.0635, "step": 11415 }, { "epoch": 4.93, "grad_norm": 1.0500438224677553, "learning_rate": 1.0923674072758073e-08, "loss": 0.0647, "step": 11420 }, { "epoch": 4.94, "grad_norm": 1.0586761531771032, "learning_rate": 1.0230406145376182e-08, "loss": 0.0646, "step": 11425 }, { "epoch": 4.94, "grad_norm": 0.49977826374240264, "learning_rate": 9.559853064954328e-09, "loss": 0.0654, "step": 11430 }, { "epoch": 4.94, "grad_norm": 0.3655032988191377, "learning_rate": 8.912016356203979e-09, "loss": 0.0531, "step": 11435 }, { "epoch": 4.94, "grad_norm": 0.5741494921527955, "learning_rate": 8.286897492182366e-09, "loss": 0.0593, "step": 11440 }, { "epoch": 4.94, "grad_norm": 0.31099743151940534, "learning_rate": 7.684497894292486e-09, "loss": 0.0557, "step": 11445 }, { "epoch": 4.95, "grad_norm": 0.6795734502665878, "learning_rate": 7.104818932276436e-09, "loss": 0.0611, "step": 11450 }, { "epoch": 4.95, "grad_norm": 0.5347254342737182, "learning_rate": 6.5478619242143045e-09, "loss": 0.0529, "step": 11455 }, { "epoch": 4.95, "grad_norm": 0.8047733415516746, "learning_rate": 6.013628136521954e-09, "loss": 0.0665, "step": 11460 }, { "epoch": 4.95, "grad_norm": 0.6337459372389843, "learning_rate": 5.502118783945465e-09, "loss": 0.0605, "step": 11465 }, { "epoch": 4.95, "grad_norm": 0.9441016264698104, "learning_rate": 5.013335029560029e-09, "loss": 0.0728, "step": 11470 }, { "epoch": 4.96, "grad_norm": 1.0466550453370893, "learning_rate": 4.547277984767728e-09, "loss": 0.0632, "step": 11475 }, { "epoch": 4.96, "grad_norm": 0.6740902732659063, "learning_rate": 4.103948709295313e-09, "loss": 0.068, "step": 11480 }, { "epoch": 4.96, "grad_norm": 0.7021120177061997, "learning_rate": 3.683348211187543e-09, "loss": 0.0636, "step": 11485 }, { "epoch": 4.96, "grad_norm": 0.8641288363790272, "learning_rate": 3.2854774468116246e-09, "loss": 0.0668, "step": 11490 }, { "epoch": 4.97, "grad_norm": 0.6076936871599368, "learning_rate": 2.9103373208505536e-09, "loss": 0.0645, "step": 11495 }, { "epoch": 4.97, "grad_norm": 0.7107727964656887, "learning_rate": 2.5579286863020025e-09, "loss": 0.0598, "step": 11500 }, { "epoch": 4.97, "grad_norm": 0.694761980405919, "learning_rate": 2.2282523444761006e-09, "loss": 0.0633, "step": 11505 }, { "epoch": 4.97, "grad_norm": 0.7006893637526823, "learning_rate": 1.921309044995434e-09, "loss": 0.0677, "step": 11510 }, { "epoch": 4.97, "grad_norm": 0.40600992482089776, "learning_rate": 1.637099485789495e-09, "loss": 0.0542, "step": 11515 }, { "epoch": 4.98, "grad_norm": 0.582747199794571, "learning_rate": 1.375624313099122e-09, "loss": 0.0648, "step": 11520 }, { "epoch": 4.98, "grad_norm": 1.068294118794195, "learning_rate": 1.1368841214676184e-09, "loss": 0.0733, "step": 11525 }, { "epoch": 4.98, "grad_norm": 0.9704193069259763, "learning_rate": 9.208794537451938e-10, "loss": 0.0677, "step": 11530 }, { "epoch": 4.98, "grad_norm": 0.3587463908186778, "learning_rate": 7.276108010867422e-10, "loss": 0.056, "step": 11535 }, { "epoch": 4.98, "grad_norm": 0.5892073955445215, "learning_rate": 5.570786029474029e-10, "loss": 0.06, "step": 11540 }, { "epoch": 4.99, "grad_norm": 1.2197020031797046, "learning_rate": 4.0928324708588986e-10, "loss": 0.0729, "step": 11545 }, { "epoch": 4.99, "grad_norm": 0.6267489883478871, "learning_rate": 2.8422506956116147e-10, "loss": 0.0588, "step": 11550 }, { "epoch": 4.99, "grad_norm": 0.5631817849121783, "learning_rate": 1.819043547313104e-10, "loss": 0.0566, "step": 11555 }, { "epoch": 4.99, "grad_norm": 0.5248536348350673, "learning_rate": 1.0232133525467369e-10, "loss": 0.0602, "step": 11560 }, { "epoch": 5.0, "grad_norm": 0.44706874544550185, "learning_rate": 4.5476192086502115e-11, "loss": 0.0615, "step": 11565 }, { "epoch": 5.0, "grad_norm": 0.7352828065983817, "learning_rate": 1.136905448451131e-11, "loss": 0.0618, "step": 11570 }, { "epoch": 5.0, "grad_norm": 0.6400220635512455, "learning_rate": 0.0, "loss": 0.0725, "step": 11575 }, { "epoch": 5.0, "eval_loss": 0.9067042469978333, "eval_runtime": 352.1066, "eval_samples_per_second": 21.32, "eval_steps_per_second": 0.335, "step": 11575 }, { "epoch": 5.0, "step": 11575, "total_flos": 2423569514496000.0, "train_loss": 0.37871731976260375, "train_runtime": 72163.6976, "train_samples_per_second": 5.131, "train_steps_per_second": 0.16 } ], "logging_steps": 5, "max_steps": 11575, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "total_flos": 2423569514496000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }