{ "best_metric": 0.5286828336634978, "best_model_checkpoint": "./openalex-topic-title-abstract2/checkpoint-263760", "epoch": 10.0, "eval_steps": 500, "global_step": 263760, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0037913254473764028, "grad_norm": 5.145029067993164, "learning_rate": 7.582650894752806e-08, "loss": 8.4365, "step": 100 }, { "epoch": 0.0075826508947528055, "grad_norm": 3.6480586528778076, "learning_rate": 1.516530178950561e-07, "loss": 8.4326, "step": 200 }, { "epoch": 0.011373976342129208, "grad_norm": 6.856083393096924, "learning_rate": 2.2747952684258417e-07, "loss": 8.432, "step": 300 }, { "epoch": 0.015165301789505611, "grad_norm": 2.851271629333496, "learning_rate": 3.033060357901122e-07, "loss": 8.4272, "step": 400 }, { "epoch": 0.018956627236882014, "grad_norm": 3.9989030361175537, "learning_rate": 3.7913254473764033e-07, "loss": 8.4258, "step": 500 }, { "epoch": 0.022747952684258416, "grad_norm": 2.1243252754211426, "learning_rate": 4.5495905368516834e-07, "loss": 8.424, "step": 600 }, { "epoch": 0.02653927813163482, "grad_norm": 3.4075329303741455, "learning_rate": 5.307855626326964e-07, "loss": 8.4229, "step": 700 }, { "epoch": 0.030330603579011222, "grad_norm": 4.63064432144165, "learning_rate": 6.066120715802244e-07, "loss": 8.423, "step": 800 }, { "epoch": 0.034121929026387623, "grad_norm": 1.3439950942993164, "learning_rate": 6.824385805277526e-07, "loss": 8.4207, "step": 900 }, { "epoch": 0.03791325447376403, "grad_norm": 2.664422035217285, "learning_rate": 7.582650894752807e-07, "loss": 8.42, "step": 1000 }, { "epoch": 0.04170457992114043, "grad_norm": 1.0533157587051392, "learning_rate": 8.340915984228088e-07, "loss": 8.4207, "step": 1100 }, { "epoch": 0.04549590536851683, "grad_norm": 2.203589916229248, "learning_rate": 9.099181073703367e-07, "loss": 8.4204, "step": 1200 }, { "epoch": 0.049287230815893236, "grad_norm": 1.0500197410583496, "learning_rate": 9.857446163178648e-07, "loss": 8.4205, "step": 1300 }, { "epoch": 0.05307855626326964, "grad_norm": 1.0050631761550903, "learning_rate": 1.0615711252653929e-06, "loss": 8.4192, "step": 1400 }, { "epoch": 0.05686988171064604, "grad_norm": 2.2500393390655518, "learning_rate": 1.1373976342129208e-06, "loss": 8.4191, "step": 1500 }, { "epoch": 0.060661207158022444, "grad_norm": 1.1111443042755127, "learning_rate": 1.213224143160449e-06, "loss": 8.4177, "step": 1600 }, { "epoch": 0.06445253260539885, "grad_norm": 2.2496349811553955, "learning_rate": 1.289050652107977e-06, "loss": 8.4184, "step": 1700 }, { "epoch": 0.06824385805277525, "grad_norm": 1.4542392492294312, "learning_rate": 1.3648771610555051e-06, "loss": 8.4164, "step": 1800 }, { "epoch": 0.07203518350015166, "grad_norm": 1.3898897171020508, "learning_rate": 1.4407036700030332e-06, "loss": 8.4152, "step": 1900 }, { "epoch": 0.07582650894752806, "grad_norm": 1.2690976858139038, "learning_rate": 1.5165301789505613e-06, "loss": 8.4134, "step": 2000 }, { "epoch": 0.07961783439490445, "grad_norm": 1.5716992616653442, "learning_rate": 1.5923566878980892e-06, "loss": 8.4099, "step": 2100 }, { "epoch": 0.08340915984228087, "grad_norm": 2.4420125484466553, "learning_rate": 1.6681831968456176e-06, "loss": 8.4041, "step": 2200 }, { "epoch": 0.08720048528965726, "grad_norm": 1.5877169370651245, "learning_rate": 1.7440097057931455e-06, "loss": 8.3982, "step": 2300 }, { "epoch": 0.09099181073703366, "grad_norm": 1.7921488285064697, "learning_rate": 1.8198362147406733e-06, "loss": 8.39, "step": 2400 }, { "epoch": 0.09478313618441007, "grad_norm": 1.7756202220916748, "learning_rate": 1.8956627236882017e-06, "loss": 8.3859, "step": 2500 }, { "epoch": 0.09857446163178647, "grad_norm": 1.9342089891433716, "learning_rate": 1.9714892326357296e-06, "loss": 8.3716, "step": 2600 }, { "epoch": 0.10236578707916287, "grad_norm": 2.074688196182251, "learning_rate": 2.0473157415832577e-06, "loss": 8.3589, "step": 2700 }, { "epoch": 0.10615711252653928, "grad_norm": 2.2163164615631104, "learning_rate": 2.1231422505307858e-06, "loss": 8.3454, "step": 2800 }, { "epoch": 0.10994843797391568, "grad_norm": 2.4278650283813477, "learning_rate": 2.198968759478314e-06, "loss": 8.333, "step": 2900 }, { "epoch": 0.11373976342129208, "grad_norm": 2.2896523475646973, "learning_rate": 2.2747952684258416e-06, "loss": 8.3135, "step": 3000 }, { "epoch": 0.11753108886866849, "grad_norm": 2.357070207595825, "learning_rate": 2.35062177737337e-06, "loss": 8.305, "step": 3100 }, { "epoch": 0.12132241431604489, "grad_norm": 2.4195120334625244, "learning_rate": 2.426448286320898e-06, "loss": 8.2893, "step": 3200 }, { "epoch": 0.1251137397634213, "grad_norm": 2.3945577144622803, "learning_rate": 2.5022747952684263e-06, "loss": 8.2682, "step": 3300 }, { "epoch": 0.1289050652107977, "grad_norm": 2.359384059906006, "learning_rate": 2.578101304215954e-06, "loss": 8.2546, "step": 3400 }, { "epoch": 0.1326963906581741, "grad_norm": 2.4975228309631348, "learning_rate": 2.653927813163482e-06, "loss": 8.2414, "step": 3500 }, { "epoch": 0.1364877161055505, "grad_norm": 2.5740394592285156, "learning_rate": 2.7297543221110102e-06, "loss": 8.2252, "step": 3600 }, { "epoch": 0.1402790415529269, "grad_norm": 2.7141807079315186, "learning_rate": 2.805580831058538e-06, "loss": 8.2017, "step": 3700 }, { "epoch": 0.14407036700030332, "grad_norm": 2.333970785140991, "learning_rate": 2.8814073400060665e-06, "loss": 8.1925, "step": 3800 }, { "epoch": 0.14786169244767972, "grad_norm": 2.5017683506011963, "learning_rate": 2.9572338489535946e-06, "loss": 8.178, "step": 3900 }, { "epoch": 0.1516530178950561, "grad_norm": 2.524169445037842, "learning_rate": 3.0330603579011227e-06, "loss": 8.1624, "step": 4000 }, { "epoch": 0.1554443433424325, "grad_norm": 2.759120225906372, "learning_rate": 3.1088868668486504e-06, "loss": 8.145, "step": 4100 }, { "epoch": 0.1592356687898089, "grad_norm": 3.3268203735351562, "learning_rate": 3.1847133757961785e-06, "loss": 8.1206, "step": 4200 }, { "epoch": 0.1630269942371853, "grad_norm": 2.9013912677764893, "learning_rate": 3.2605398847437066e-06, "loss": 8.1132, "step": 4300 }, { "epoch": 0.16681831968456173, "grad_norm": 2.69913911819458, "learning_rate": 3.336366393691235e-06, "loss": 8.0929, "step": 4400 }, { "epoch": 0.17060964513193813, "grad_norm": 2.7606115341186523, "learning_rate": 3.412192902638763e-06, "loss": 8.0811, "step": 4500 }, { "epoch": 0.17440097057931453, "grad_norm": 2.781407594680786, "learning_rate": 3.488019411586291e-06, "loss": 8.0588, "step": 4600 }, { "epoch": 0.17819229602669093, "grad_norm": 2.585346221923828, "learning_rate": 3.563845920533819e-06, "loss": 8.046, "step": 4700 }, { "epoch": 0.18198362147406733, "grad_norm": 2.6356940269470215, "learning_rate": 3.6396724294813467e-06, "loss": 8.0271, "step": 4800 }, { "epoch": 0.18577494692144372, "grad_norm": 2.7742843627929688, "learning_rate": 3.715498938428875e-06, "loss": 8.0121, "step": 4900 }, { "epoch": 0.18956627236882015, "grad_norm": 2.3873519897460938, "learning_rate": 3.7913254473764033e-06, "loss": 7.9986, "step": 5000 }, { "epoch": 0.19335759781619655, "grad_norm": 2.880704641342163, "learning_rate": 3.8671519563239314e-06, "loss": 7.9745, "step": 5100 }, { "epoch": 0.19714892326357294, "grad_norm": 2.430640697479248, "learning_rate": 3.942978465271459e-06, "loss": 7.9633, "step": 5200 }, { "epoch": 0.20094024871094934, "grad_norm": 2.5916383266448975, "learning_rate": 4.018804974218987e-06, "loss": 7.9402, "step": 5300 }, { "epoch": 0.20473157415832574, "grad_norm": 2.488168239593506, "learning_rate": 4.094631483166515e-06, "loss": 7.9355, "step": 5400 }, { "epoch": 0.20852289960570217, "grad_norm": 2.540648937225342, "learning_rate": 4.170457992114044e-06, "loss": 7.9148, "step": 5500 }, { "epoch": 0.21231422505307856, "grad_norm": 3.1133856773376465, "learning_rate": 4.2462845010615716e-06, "loss": 7.8995, "step": 5600 }, { "epoch": 0.21610555050045496, "grad_norm": 2.529254198074341, "learning_rate": 4.322111010009099e-06, "loss": 7.8791, "step": 5700 }, { "epoch": 0.21989687594783136, "grad_norm": 2.545454502105713, "learning_rate": 4.397937518956628e-06, "loss": 7.8675, "step": 5800 }, { "epoch": 0.22368820139520776, "grad_norm": 2.5128116607666016, "learning_rate": 4.4737640279041555e-06, "loss": 7.8536, "step": 5900 }, { "epoch": 0.22747952684258416, "grad_norm": 3.195108652114868, "learning_rate": 4.549590536851683e-06, "loss": 7.8413, "step": 6000 }, { "epoch": 0.23127085228996058, "grad_norm": 3.2973287105560303, "learning_rate": 4.625417045799212e-06, "loss": 7.8243, "step": 6100 }, { "epoch": 0.23506217773733698, "grad_norm": 2.895899534225464, "learning_rate": 4.70124355474674e-06, "loss": 7.808, "step": 6200 }, { "epoch": 0.23885350318471338, "grad_norm": 3.311063289642334, "learning_rate": 4.777070063694268e-06, "loss": 7.7913, "step": 6300 }, { "epoch": 0.24264482863208978, "grad_norm": 2.936001777648926, "learning_rate": 4.852896572641796e-06, "loss": 7.7806, "step": 6400 }, { "epoch": 0.24643615407946617, "grad_norm": 3.0042006969451904, "learning_rate": 4.928723081589324e-06, "loss": 7.7595, "step": 6500 }, { "epoch": 0.2502274795268426, "grad_norm": 3.3353512287139893, "learning_rate": 5.004549590536853e-06, "loss": 7.7522, "step": 6600 }, { "epoch": 0.25401880497421897, "grad_norm": 2.926194429397583, "learning_rate": 5.0803760994843795e-06, "loss": 7.7278, "step": 6700 }, { "epoch": 0.2578101304215954, "grad_norm": 2.7767958641052246, "learning_rate": 5.156202608431908e-06, "loss": 7.7192, "step": 6800 }, { "epoch": 0.26160145586897177, "grad_norm": 2.7102842330932617, "learning_rate": 5.232029117379436e-06, "loss": 7.7045, "step": 6900 }, { "epoch": 0.2653927813163482, "grad_norm": 2.9081335067749023, "learning_rate": 5.307855626326964e-06, "loss": 7.6876, "step": 7000 }, { "epoch": 0.2691841067637246, "grad_norm": 2.9533612728118896, "learning_rate": 5.383682135274493e-06, "loss": 7.6761, "step": 7100 }, { "epoch": 0.272975432211101, "grad_norm": 3.2787046432495117, "learning_rate": 5.4595086442220205e-06, "loss": 7.6608, "step": 7200 }, { "epoch": 0.2767667576584774, "grad_norm": 2.6297895908355713, "learning_rate": 5.535335153169549e-06, "loss": 7.6352, "step": 7300 }, { "epoch": 0.2805580831058538, "grad_norm": 2.7180070877075195, "learning_rate": 5.611161662117076e-06, "loss": 7.6292, "step": 7400 }, { "epoch": 0.2843494085532302, "grad_norm": 3.664940595626831, "learning_rate": 5.686988171064604e-06, "loss": 7.6137, "step": 7500 }, { "epoch": 0.28814073400060664, "grad_norm": 3.0332205295562744, "learning_rate": 5.762814680012133e-06, "loss": 7.6003, "step": 7600 }, { "epoch": 0.291932059447983, "grad_norm": 2.984022855758667, "learning_rate": 5.838641188959661e-06, "loss": 7.5871, "step": 7700 }, { "epoch": 0.29572338489535943, "grad_norm": 2.917029857635498, "learning_rate": 5.914467697907189e-06, "loss": 7.5604, "step": 7800 }, { "epoch": 0.2995147103427358, "grad_norm": 3.4502170085906982, "learning_rate": 5.990294206854717e-06, "loss": 7.5466, "step": 7900 }, { "epoch": 0.3033060357901122, "grad_norm": 2.762951374053955, "learning_rate": 6.066120715802245e-06, "loss": 7.541, "step": 8000 }, { "epoch": 0.30709736123748865, "grad_norm": 3.5170693397521973, "learning_rate": 6.141947224749774e-06, "loss": 7.5262, "step": 8100 }, { "epoch": 0.310888686684865, "grad_norm": 3.095799446105957, "learning_rate": 6.217773733697301e-06, "loss": 7.5042, "step": 8200 }, { "epoch": 0.31468001213224145, "grad_norm": 3.3766589164733887, "learning_rate": 6.293600242644829e-06, "loss": 7.4963, "step": 8300 }, { "epoch": 0.3184713375796178, "grad_norm": 2.920715570449829, "learning_rate": 6.369426751592357e-06, "loss": 7.4784, "step": 8400 }, { "epoch": 0.32226266302699424, "grad_norm": 3.63759708404541, "learning_rate": 6.4452532605398855e-06, "loss": 7.4539, "step": 8500 }, { "epoch": 0.3260539884743706, "grad_norm": 3.240902900695801, "learning_rate": 6.521079769487413e-06, "loss": 7.4394, "step": 8600 }, { "epoch": 0.32984531392174704, "grad_norm": 3.262450695037842, "learning_rate": 6.596906278434942e-06, "loss": 7.4283, "step": 8700 }, { "epoch": 0.33363663936912347, "grad_norm": 3.0677380561828613, "learning_rate": 6.67273278738247e-06, "loss": 7.4146, "step": 8800 }, { "epoch": 0.33742796481649984, "grad_norm": 3.5806808471679688, "learning_rate": 6.748559296329997e-06, "loss": 7.3976, "step": 8900 }, { "epoch": 0.34121929026387626, "grad_norm": 3.4095427989959717, "learning_rate": 6.824385805277526e-06, "loss": 7.3714, "step": 9000 }, { "epoch": 0.34501061571125263, "grad_norm": 2.8725080490112305, "learning_rate": 6.900212314225053e-06, "loss": 7.3654, "step": 9100 }, { "epoch": 0.34880194115862906, "grad_norm": 3.818187952041626, "learning_rate": 6.976038823172582e-06, "loss": 7.3515, "step": 9200 }, { "epoch": 0.3525932666060055, "grad_norm": 3.411681890487671, "learning_rate": 7.05186533212011e-06, "loss": 7.3491, "step": 9300 }, { "epoch": 0.35638459205338185, "grad_norm": 4.557471752166748, "learning_rate": 7.127691841067638e-06, "loss": 7.3259, "step": 9400 }, { "epoch": 0.3601759175007583, "grad_norm": 3.421715259552002, "learning_rate": 7.2035183500151666e-06, "loss": 7.2953, "step": 9500 }, { "epoch": 0.36396724294813465, "grad_norm": 3.1979868412017822, "learning_rate": 7.279344858962693e-06, "loss": 7.2907, "step": 9600 }, { "epoch": 0.3677585683955111, "grad_norm": 3.6711843013763428, "learning_rate": 7.355171367910222e-06, "loss": 7.2675, "step": 9700 }, { "epoch": 0.37154989384288745, "grad_norm": 3.323714256286621, "learning_rate": 7.43099787685775e-06, "loss": 7.2543, "step": 9800 }, { "epoch": 0.37534121929026387, "grad_norm": 3.56269907951355, "learning_rate": 7.506824385805278e-06, "loss": 7.2523, "step": 9900 }, { "epoch": 0.3791325447376403, "grad_norm": 3.0917670726776123, "learning_rate": 7.582650894752807e-06, "loss": 7.224, "step": 10000 }, { "epoch": 0.38292387018501667, "grad_norm": 3.1806726455688477, "learning_rate": 7.658477403700334e-06, "loss": 7.2059, "step": 10100 }, { "epoch": 0.3867151956323931, "grad_norm": 2.9372220039367676, "learning_rate": 7.734303912647863e-06, "loss": 7.201, "step": 10200 }, { "epoch": 0.39050652107976946, "grad_norm": 3.3361432552337646, "learning_rate": 7.81013042159539e-06, "loss": 7.1684, "step": 10300 }, { "epoch": 0.3942978465271459, "grad_norm": 4.768875598907471, "learning_rate": 7.885956930542918e-06, "loss": 7.1668, "step": 10400 }, { "epoch": 0.3980891719745223, "grad_norm": 3.3766818046569824, "learning_rate": 7.961783439490447e-06, "loss": 7.1495, "step": 10500 }, { "epoch": 0.4018804974218987, "grad_norm": 3.0961954593658447, "learning_rate": 8.037609948437974e-06, "loss": 7.1268, "step": 10600 }, { "epoch": 0.4056718228692751, "grad_norm": 3.540738344192505, "learning_rate": 8.113436457385502e-06, "loss": 7.1143, "step": 10700 }, { "epoch": 0.4094631483166515, "grad_norm": 3.530588388442993, "learning_rate": 8.18926296633303e-06, "loss": 7.0969, "step": 10800 }, { "epoch": 0.4132544737640279, "grad_norm": 3.46834659576416, "learning_rate": 8.26508947528056e-06, "loss": 7.084, "step": 10900 }, { "epoch": 0.41704579921140433, "grad_norm": 3.5852622985839844, "learning_rate": 8.340915984228088e-06, "loss": 7.0662, "step": 11000 }, { "epoch": 0.4208371246587807, "grad_norm": 3.078829050064087, "learning_rate": 8.416742493175615e-06, "loss": 7.0476, "step": 11100 }, { "epoch": 0.42462845010615713, "grad_norm": 3.2757062911987305, "learning_rate": 8.492569002123143e-06, "loss": 7.0459, "step": 11200 }, { "epoch": 0.4284197755535335, "grad_norm": 4.492921352386475, "learning_rate": 8.56839551107067e-06, "loss": 7.0224, "step": 11300 }, { "epoch": 0.4322111010009099, "grad_norm": 3.7321064472198486, "learning_rate": 8.644222020018199e-06, "loss": 7.0088, "step": 11400 }, { "epoch": 0.4360024264482863, "grad_norm": 3.206341028213501, "learning_rate": 8.720048528965727e-06, "loss": 6.9927, "step": 11500 }, { "epoch": 0.4397937518956627, "grad_norm": 3.7218759059906006, "learning_rate": 8.795875037913256e-06, "loss": 6.9812, "step": 11600 }, { "epoch": 0.44358507734303915, "grad_norm": 3.359732151031494, "learning_rate": 8.871701546860784e-06, "loss": 6.9572, "step": 11700 }, { "epoch": 0.4473764027904155, "grad_norm": 3.8753859996795654, "learning_rate": 8.947528055808311e-06, "loss": 6.9389, "step": 11800 }, { "epoch": 0.45116772823779194, "grad_norm": 3.282609224319458, "learning_rate": 9.02335456475584e-06, "loss": 6.927, "step": 11900 }, { "epoch": 0.4549590536851683, "grad_norm": 4.510603427886963, "learning_rate": 9.099181073703366e-06, "loss": 6.9076, "step": 12000 }, { "epoch": 0.45875037913254474, "grad_norm": 4.137149333953857, "learning_rate": 9.175007582650895e-06, "loss": 6.8945, "step": 12100 }, { "epoch": 0.46254170457992116, "grad_norm": 2.9457147121429443, "learning_rate": 9.250834091598423e-06, "loss": 6.891, "step": 12200 }, { "epoch": 0.46633303002729753, "grad_norm": 3.9326858520507812, "learning_rate": 9.326660600545952e-06, "loss": 6.8648, "step": 12300 }, { "epoch": 0.47012435547467396, "grad_norm": 3.521275520324707, "learning_rate": 9.40248710949348e-06, "loss": 6.8291, "step": 12400 }, { "epoch": 0.47391568092205033, "grad_norm": 3.74531888961792, "learning_rate": 9.478313618441007e-06, "loss": 6.84, "step": 12500 }, { "epoch": 0.47770700636942676, "grad_norm": 3.803335189819336, "learning_rate": 9.554140127388536e-06, "loss": 6.8106, "step": 12600 }, { "epoch": 0.4814983318168031, "grad_norm": 3.738898515701294, "learning_rate": 9.629966636336063e-06, "loss": 6.7891, "step": 12700 }, { "epoch": 0.48528965726417955, "grad_norm": 3.299368143081665, "learning_rate": 9.705793145283591e-06, "loss": 6.7746, "step": 12800 }, { "epoch": 0.489080982711556, "grad_norm": 3.836482286453247, "learning_rate": 9.78161965423112e-06, "loss": 6.7617, "step": 12900 }, { "epoch": 0.49287230815893235, "grad_norm": 3.7235238552093506, "learning_rate": 9.857446163178648e-06, "loss": 6.7473, "step": 13000 }, { "epoch": 0.4966636336063088, "grad_norm": 3.7794718742370605, "learning_rate": 9.933272672126177e-06, "loss": 6.7391, "step": 13100 }, { "epoch": 0.5004549590536852, "grad_norm": 4.1656928062438965, "learning_rate": 1.0009099181073705e-05, "loss": 6.7175, "step": 13200 }, { "epoch": 0.5042462845010616, "grad_norm": 3.850841760635376, "learning_rate": 1.0084925690021232e-05, "loss": 6.6905, "step": 13300 }, { "epoch": 0.5080376099484379, "grad_norm": 4.364170074462891, "learning_rate": 1.0160752198968759e-05, "loss": 6.6883, "step": 13400 }, { "epoch": 0.5118289353958144, "grad_norm": 3.6860668659210205, "learning_rate": 1.023657870791629e-05, "loss": 6.6626, "step": 13500 }, { "epoch": 0.5156202608431908, "grad_norm": 5.069449424743652, "learning_rate": 1.0312405216863816e-05, "loss": 6.6438, "step": 13600 }, { "epoch": 0.5194115862905672, "grad_norm": 4.301501750946045, "learning_rate": 1.0388231725811345e-05, "loss": 6.6294, "step": 13700 }, { "epoch": 0.5232029117379435, "grad_norm": 3.9061102867126465, "learning_rate": 1.0464058234758871e-05, "loss": 6.62, "step": 13800 }, { "epoch": 0.52699423718532, "grad_norm": 4.278404235839844, "learning_rate": 1.0539884743706402e-05, "loss": 6.602, "step": 13900 }, { "epoch": 0.5307855626326964, "grad_norm": 3.9584097862243652, "learning_rate": 1.0615711252653929e-05, "loss": 6.5839, "step": 14000 }, { "epoch": 0.5345768880800728, "grad_norm": 3.9116945266723633, "learning_rate": 1.0691537761601455e-05, "loss": 6.5652, "step": 14100 }, { "epoch": 0.5383682135274492, "grad_norm": 4.966970443725586, "learning_rate": 1.0767364270548986e-05, "loss": 6.5473, "step": 14200 }, { "epoch": 0.5421595389748256, "grad_norm": 5.10191535949707, "learning_rate": 1.0843190779496512e-05, "loss": 6.5468, "step": 14300 }, { "epoch": 0.545950864422202, "grad_norm": 4.128649711608887, "learning_rate": 1.0919017288444041e-05, "loss": 6.5411, "step": 14400 }, { "epoch": 0.5497421898695785, "grad_norm": 4.651221752166748, "learning_rate": 1.099484379739157e-05, "loss": 6.5087, "step": 14500 }, { "epoch": 0.5535335153169548, "grad_norm": 5.050411701202393, "learning_rate": 1.1070670306339098e-05, "loss": 6.5005, "step": 14600 }, { "epoch": 0.5573248407643312, "grad_norm": 4.873382568359375, "learning_rate": 1.1146496815286625e-05, "loss": 6.4652, "step": 14700 }, { "epoch": 0.5611161662117076, "grad_norm": 4.002323627471924, "learning_rate": 1.1222323324234152e-05, "loss": 6.456, "step": 14800 }, { "epoch": 0.564907491659084, "grad_norm": 4.28359317779541, "learning_rate": 1.1298149833181682e-05, "loss": 6.4369, "step": 14900 }, { "epoch": 0.5686988171064604, "grad_norm": 4.153304100036621, "learning_rate": 1.1373976342129209e-05, "loss": 6.4205, "step": 15000 }, { "epoch": 0.5724901425538368, "grad_norm": 4.286805629730225, "learning_rate": 1.1449802851076737e-05, "loss": 6.4036, "step": 15100 }, { "epoch": 0.5762814680012133, "grad_norm": 4.225104808807373, "learning_rate": 1.1525629360024266e-05, "loss": 6.3932, "step": 15200 }, { "epoch": 0.5800727934485896, "grad_norm": 4.62565803527832, "learning_rate": 1.1601455868971794e-05, "loss": 6.3898, "step": 15300 }, { "epoch": 0.583864118895966, "grad_norm": 3.8628976345062256, "learning_rate": 1.1677282377919321e-05, "loss": 6.3627, "step": 15400 }, { "epoch": 0.5876554443433424, "grad_norm": 4.33087158203125, "learning_rate": 1.1753108886866848e-05, "loss": 6.3415, "step": 15500 }, { "epoch": 0.5914467697907189, "grad_norm": 4.725861549377441, "learning_rate": 1.1828935395814378e-05, "loss": 6.3427, "step": 15600 }, { "epoch": 0.5952380952380952, "grad_norm": 4.341407299041748, "learning_rate": 1.1904761904761905e-05, "loss": 6.306, "step": 15700 }, { "epoch": 0.5990294206854716, "grad_norm": 4.900088787078857, "learning_rate": 1.1980588413709434e-05, "loss": 6.2926, "step": 15800 }, { "epoch": 0.6028207461328481, "grad_norm": 3.9068453311920166, "learning_rate": 1.2056414922656962e-05, "loss": 6.296, "step": 15900 }, { "epoch": 0.6066120715802245, "grad_norm": 5.647632122039795, "learning_rate": 1.213224143160449e-05, "loss": 6.2605, "step": 16000 }, { "epoch": 0.6104033970276008, "grad_norm": 4.339327812194824, "learning_rate": 1.2208067940552018e-05, "loss": 6.2411, "step": 16100 }, { "epoch": 0.6141947224749773, "grad_norm": 4.017951011657715, "learning_rate": 1.2283894449499548e-05, "loss": 6.2322, "step": 16200 }, { "epoch": 0.6179860479223537, "grad_norm": 5.77239465713501, "learning_rate": 1.2359720958447075e-05, "loss": 6.2224, "step": 16300 }, { "epoch": 0.62177737336973, "grad_norm": 5.61624813079834, "learning_rate": 1.2435547467394601e-05, "loss": 6.2087, "step": 16400 }, { "epoch": 0.6255686988171064, "grad_norm": 4.294707775115967, "learning_rate": 1.251137397634213e-05, "loss": 6.1838, "step": 16500 }, { "epoch": 0.6293600242644829, "grad_norm": 4.497371673583984, "learning_rate": 1.2587200485289658e-05, "loss": 6.1765, "step": 16600 }, { "epoch": 0.6331513497118593, "grad_norm": 4.804287433624268, "learning_rate": 1.2663026994237187e-05, "loss": 6.1457, "step": 16700 }, { "epoch": 0.6369426751592356, "grad_norm": 5.072319030761719, "learning_rate": 1.2738853503184714e-05, "loss": 6.1274, "step": 16800 }, { "epoch": 0.6407340006066121, "grad_norm": 4.513099670410156, "learning_rate": 1.2814680012132244e-05, "loss": 6.1071, "step": 16900 }, { "epoch": 0.6445253260539885, "grad_norm": 4.653482913970947, "learning_rate": 1.2890506521079771e-05, "loss": 6.1188, "step": 17000 }, { "epoch": 0.6483166515013649, "grad_norm": 6.5934014320373535, "learning_rate": 1.2966333030027298e-05, "loss": 6.0903, "step": 17100 }, { "epoch": 0.6521079769487412, "grad_norm": 5.7481207847595215, "learning_rate": 1.3042159538974826e-05, "loss": 6.0689, "step": 17200 }, { "epoch": 0.6558993023961177, "grad_norm": 6.08207893371582, "learning_rate": 1.3117986047922355e-05, "loss": 6.0557, "step": 17300 }, { "epoch": 0.6596906278434941, "grad_norm": 4.946324825286865, "learning_rate": 1.3193812556869883e-05, "loss": 6.0469, "step": 17400 }, { "epoch": 0.6634819532908705, "grad_norm": 5.613690376281738, "learning_rate": 1.326963906581741e-05, "loss": 6.0309, "step": 17500 }, { "epoch": 0.6672732787382469, "grad_norm": 4.984470367431641, "learning_rate": 1.334546557476494e-05, "loss": 5.9897, "step": 17600 }, { "epoch": 0.6710646041856233, "grad_norm": 5.933009147644043, "learning_rate": 1.3421292083712467e-05, "loss": 5.9981, "step": 17700 }, { "epoch": 0.6748559296329997, "grad_norm": 5.027645111083984, "learning_rate": 1.3497118592659994e-05, "loss": 5.974, "step": 17800 }, { "epoch": 0.6786472550803762, "grad_norm": 5.055216312408447, "learning_rate": 1.3572945101607524e-05, "loss": 5.9582, "step": 17900 }, { "epoch": 0.6824385805277525, "grad_norm": 5.123015403747559, "learning_rate": 1.3648771610555051e-05, "loss": 5.9308, "step": 18000 }, { "epoch": 0.6862299059751289, "grad_norm": 5.168835163116455, "learning_rate": 1.372459811950258e-05, "loss": 5.9273, "step": 18100 }, { "epoch": 0.6900212314225053, "grad_norm": 4.577189922332764, "learning_rate": 1.3800424628450107e-05, "loss": 5.9093, "step": 18200 }, { "epoch": 0.6938125568698817, "grad_norm": 4.787968158721924, "learning_rate": 1.3876251137397637e-05, "loss": 5.8954, "step": 18300 }, { "epoch": 0.6976038823172581, "grad_norm": 4.792823314666748, "learning_rate": 1.3952077646345164e-05, "loss": 5.8672, "step": 18400 }, { "epoch": 0.7013952077646345, "grad_norm": 5.701659202575684, "learning_rate": 1.402790415529269e-05, "loss": 5.8791, "step": 18500 }, { "epoch": 0.705186533212011, "grad_norm": 5.087223529815674, "learning_rate": 1.410373066424022e-05, "loss": 5.8679, "step": 18600 }, { "epoch": 0.7089778586593873, "grad_norm": 6.281311988830566, "learning_rate": 1.4179557173187748e-05, "loss": 5.818, "step": 18700 }, { "epoch": 0.7127691841067637, "grad_norm": 5.398939609527588, "learning_rate": 1.4255383682135276e-05, "loss": 5.8175, "step": 18800 }, { "epoch": 0.7165605095541401, "grad_norm": 5.402658939361572, "learning_rate": 1.4331210191082803e-05, "loss": 5.7886, "step": 18900 }, { "epoch": 0.7203518350015166, "grad_norm": 4.911555290222168, "learning_rate": 1.4407036700030333e-05, "loss": 5.7854, "step": 19000 }, { "epoch": 0.7241431604488929, "grad_norm": 4.4705424308776855, "learning_rate": 1.448286320897786e-05, "loss": 5.7841, "step": 19100 }, { "epoch": 0.7279344858962693, "grad_norm": 4.733659267425537, "learning_rate": 1.4558689717925387e-05, "loss": 5.7673, "step": 19200 }, { "epoch": 0.7317258113436458, "grad_norm": 5.409914970397949, "learning_rate": 1.4634516226872917e-05, "loss": 5.722, "step": 19300 }, { "epoch": 0.7355171367910222, "grad_norm": 4.3083062171936035, "learning_rate": 1.4710342735820444e-05, "loss": 5.7102, "step": 19400 }, { "epoch": 0.7393084622383985, "grad_norm": 5.239251136779785, "learning_rate": 1.4786169244767972e-05, "loss": 5.7048, "step": 19500 }, { "epoch": 0.7430997876857749, "grad_norm": 5.65474796295166, "learning_rate": 1.48619957537155e-05, "loss": 5.6909, "step": 19600 }, { "epoch": 0.7468911131331514, "grad_norm": 5.377579212188721, "learning_rate": 1.493782226266303e-05, "loss": 5.6665, "step": 19700 }, { "epoch": 0.7506824385805277, "grad_norm": 5.3440728187561035, "learning_rate": 1.5013648771610556e-05, "loss": 5.66, "step": 19800 }, { "epoch": 0.7544737640279041, "grad_norm": 4.964608192443848, "learning_rate": 1.5089475280558083e-05, "loss": 5.6265, "step": 19900 }, { "epoch": 0.7582650894752806, "grad_norm": 6.132308483123779, "learning_rate": 1.5165301789505613e-05, "loss": 5.6303, "step": 20000 }, { "epoch": 0.762056414922657, "grad_norm": 5.548910140991211, "learning_rate": 1.524112829845314e-05, "loss": 5.6253, "step": 20100 }, { "epoch": 0.7658477403700333, "grad_norm": 5.697509765625, "learning_rate": 1.531695480740067e-05, "loss": 5.5756, "step": 20200 }, { "epoch": 0.7696390658174098, "grad_norm": 6.124240398406982, "learning_rate": 1.53927813163482e-05, "loss": 5.5825, "step": 20300 }, { "epoch": 0.7734303912647862, "grad_norm": 5.401780128479004, "learning_rate": 1.5468607825295726e-05, "loss": 5.558, "step": 20400 }, { "epoch": 0.7772217167121626, "grad_norm": 5.2996826171875, "learning_rate": 1.5544434334243253e-05, "loss": 5.5451, "step": 20500 }, { "epoch": 0.7810130421595389, "grad_norm": 5.600028038024902, "learning_rate": 1.562026084319078e-05, "loss": 5.5305, "step": 20600 }, { "epoch": 0.7848043676069154, "grad_norm": 6.0726094245910645, "learning_rate": 1.569608735213831e-05, "loss": 5.5419, "step": 20700 }, { "epoch": 0.7885956930542918, "grad_norm": 4.8809380531311035, "learning_rate": 1.5771913861085837e-05, "loss": 5.4963, "step": 20800 }, { "epoch": 0.7923870185016681, "grad_norm": 5.681726455688477, "learning_rate": 1.5847740370033363e-05, "loss": 5.4929, "step": 20900 }, { "epoch": 0.7961783439490446, "grad_norm": 5.8515496253967285, "learning_rate": 1.5923566878980894e-05, "loss": 5.4792, "step": 21000 }, { "epoch": 0.799969669396421, "grad_norm": 5.939082622528076, "learning_rate": 1.599939338792842e-05, "loss": 5.4641, "step": 21100 }, { "epoch": 0.8037609948437974, "grad_norm": 6.101682186126709, "learning_rate": 1.6075219896875947e-05, "loss": 5.4448, "step": 21200 }, { "epoch": 0.8075523202911737, "grad_norm": 5.826840877532959, "learning_rate": 1.6151046405823478e-05, "loss": 5.4322, "step": 21300 }, { "epoch": 0.8113436457385502, "grad_norm": 5.972003936767578, "learning_rate": 1.6226872914771004e-05, "loss": 5.3864, "step": 21400 }, { "epoch": 0.8151349711859266, "grad_norm": 5.972353935241699, "learning_rate": 1.6302699423718535e-05, "loss": 5.4122, "step": 21500 }, { "epoch": 0.818926296633303, "grad_norm": 6.203449249267578, "learning_rate": 1.637852593266606e-05, "loss": 5.3475, "step": 21600 }, { "epoch": 0.8227176220806794, "grad_norm": 5.791791915893555, "learning_rate": 1.645435244161359e-05, "loss": 5.3895, "step": 21700 }, { "epoch": 0.8265089475280558, "grad_norm": 5.368638515472412, "learning_rate": 1.653017895056112e-05, "loss": 5.3586, "step": 21800 }, { "epoch": 0.8303002729754322, "grad_norm": 5.299839019775391, "learning_rate": 1.6606005459508645e-05, "loss": 5.3445, "step": 21900 }, { "epoch": 0.8340915984228087, "grad_norm": 6.046605587005615, "learning_rate": 1.6681831968456176e-05, "loss": 5.3363, "step": 22000 }, { "epoch": 0.837882923870185, "grad_norm": 5.848520755767822, "learning_rate": 1.6757658477403702e-05, "loss": 5.3092, "step": 22100 }, { "epoch": 0.8416742493175614, "grad_norm": 6.135545253753662, "learning_rate": 1.683348498635123e-05, "loss": 5.2706, "step": 22200 }, { "epoch": 0.8454655747649378, "grad_norm": 6.415537357330322, "learning_rate": 1.6909311495298756e-05, "loss": 5.266, "step": 22300 }, { "epoch": 0.8492569002123143, "grad_norm": 4.9459710121154785, "learning_rate": 1.6985138004246286e-05, "loss": 5.254, "step": 22400 }, { "epoch": 0.8530482256596906, "grad_norm": 5.925342082977295, "learning_rate": 1.7060964513193813e-05, "loss": 5.2572, "step": 22500 }, { "epoch": 0.856839551107067, "grad_norm": 6.176941394805908, "learning_rate": 1.713679102214134e-05, "loss": 5.2263, "step": 22600 }, { "epoch": 0.8606308765544435, "grad_norm": 6.169352054595947, "learning_rate": 1.721261753108887e-05, "loss": 5.23, "step": 22700 }, { "epoch": 0.8644222020018199, "grad_norm": 6.158240795135498, "learning_rate": 1.7288444040036397e-05, "loss": 5.18, "step": 22800 }, { "epoch": 0.8682135274491962, "grad_norm": 6.035087585449219, "learning_rate": 1.7364270548983927e-05, "loss": 5.1843, "step": 22900 }, { "epoch": 0.8720048528965726, "grad_norm": 5.852565765380859, "learning_rate": 1.7440097057931454e-05, "loss": 5.1618, "step": 23000 }, { "epoch": 0.8757961783439491, "grad_norm": 6.5992431640625, "learning_rate": 1.7515923566878984e-05, "loss": 5.1747, "step": 23100 }, { "epoch": 0.8795875037913254, "grad_norm": 5.80078649520874, "learning_rate": 1.759175007582651e-05, "loss": 5.1365, "step": 23200 }, { "epoch": 0.8833788292387018, "grad_norm": 6.44807767868042, "learning_rate": 1.7667576584774038e-05, "loss": 5.1336, "step": 23300 }, { "epoch": 0.8871701546860783, "grad_norm": 6.59194803237915, "learning_rate": 1.7743403093721568e-05, "loss": 5.1361, "step": 23400 }, { "epoch": 0.8909614801334547, "grad_norm": 5.842111110687256, "learning_rate": 1.7819229602669095e-05, "loss": 5.1104, "step": 23500 }, { "epoch": 0.894752805580831, "grad_norm": 5.542649269104004, "learning_rate": 1.7895056111616622e-05, "loss": 5.0991, "step": 23600 }, { "epoch": 0.8985441310282075, "grad_norm": 6.7896552085876465, "learning_rate": 1.7970882620564152e-05, "loss": 5.0508, "step": 23700 }, { "epoch": 0.9023354564755839, "grad_norm": 6.471670627593994, "learning_rate": 1.804670912951168e-05, "loss": 5.0623, "step": 23800 }, { "epoch": 0.9061267819229603, "grad_norm": 6.739445209503174, "learning_rate": 1.8122535638459206e-05, "loss": 5.0376, "step": 23900 }, { "epoch": 0.9099181073703366, "grad_norm": 6.240634918212891, "learning_rate": 1.8198362147406733e-05, "loss": 5.0084, "step": 24000 }, { "epoch": 0.9137094328177131, "grad_norm": 6.532129764556885, "learning_rate": 1.8274188656354263e-05, "loss": 5.015, "step": 24100 }, { "epoch": 0.9175007582650895, "grad_norm": 5.787896156311035, "learning_rate": 1.835001516530179e-05, "loss": 5.0001, "step": 24200 }, { "epoch": 0.9212920837124658, "grad_norm": 7.697067737579346, "learning_rate": 1.842584167424932e-05, "loss": 4.985, "step": 24300 }, { "epoch": 0.9250834091598423, "grad_norm": 6.250032424926758, "learning_rate": 1.8501668183196847e-05, "loss": 4.9564, "step": 24400 }, { "epoch": 0.9288747346072187, "grad_norm": 6.607831001281738, "learning_rate": 1.8577494692144377e-05, "loss": 4.954, "step": 24500 }, { "epoch": 0.9326660600545951, "grad_norm": 7.3180766105651855, "learning_rate": 1.8653321201091904e-05, "loss": 4.9326, "step": 24600 }, { "epoch": 0.9364573855019714, "grad_norm": 6.219171524047852, "learning_rate": 1.872914771003943e-05, "loss": 4.9493, "step": 24700 }, { "epoch": 0.9402487109493479, "grad_norm": 6.749953269958496, "learning_rate": 1.880497421898696e-05, "loss": 4.9328, "step": 24800 }, { "epoch": 0.9440400363967243, "grad_norm": 6.906899452209473, "learning_rate": 1.8880800727934488e-05, "loss": 4.9101, "step": 24900 }, { "epoch": 0.9478313618441007, "grad_norm": 7.212630748748779, "learning_rate": 1.8956627236882015e-05, "loss": 4.8623, "step": 25000 }, { "epoch": 0.9516226872914771, "grad_norm": 6.847315311431885, "learning_rate": 1.9032453745829545e-05, "loss": 4.8527, "step": 25100 }, { "epoch": 0.9554140127388535, "grad_norm": 7.239388942718506, "learning_rate": 1.910828025477707e-05, "loss": 4.8437, "step": 25200 }, { "epoch": 0.9592053381862299, "grad_norm": 6.992358684539795, "learning_rate": 1.91841067637246e-05, "loss": 4.8545, "step": 25300 }, { "epoch": 0.9629966636336063, "grad_norm": 7.048510551452637, "learning_rate": 1.9259933272672125e-05, "loss": 4.8119, "step": 25400 }, { "epoch": 0.9667879890809827, "grad_norm": 7.969037055969238, "learning_rate": 1.9335759781619656e-05, "loss": 4.849, "step": 25500 }, { "epoch": 0.9705793145283591, "grad_norm": 7.874882221221924, "learning_rate": 1.9411586290567182e-05, "loss": 4.8179, "step": 25600 }, { "epoch": 0.9743706399757355, "grad_norm": 6.164852142333984, "learning_rate": 1.9487412799514713e-05, "loss": 4.7876, "step": 25700 }, { "epoch": 0.978161965423112, "grad_norm": 7.158965587615967, "learning_rate": 1.956323930846224e-05, "loss": 4.7707, "step": 25800 }, { "epoch": 0.9819532908704883, "grad_norm": 6.17612886428833, "learning_rate": 1.963906581740977e-05, "loss": 4.7773, "step": 25900 }, { "epoch": 0.9857446163178647, "grad_norm": 9.235833168029785, "learning_rate": 1.9714892326357297e-05, "loss": 4.7584, "step": 26000 }, { "epoch": 0.9895359417652412, "grad_norm": 6.95273494720459, "learning_rate": 1.9790718835304827e-05, "loss": 4.7381, "step": 26100 }, { "epoch": 0.9933272672126175, "grad_norm": 7.556499481201172, "learning_rate": 1.9866545344252354e-05, "loss": 4.7441, "step": 26200 }, { "epoch": 0.9971185926599939, "grad_norm": 7.051175594329834, "learning_rate": 1.994237185319988e-05, "loss": 4.7089, "step": 26300 }, { "epoch": 1.0, "eval_accuracy": 0.19204311725732917, "eval_loss": 4.6093902587890625, "eval_runtime": 1226.3453, "eval_samples_per_second": 688.243, "eval_steps_per_second": 5.377, "step": 26376 }, { "epoch": 1.0009099181073704, "grad_norm": 7.434732437133789, "learning_rate": 1.9999999495583435e-05, "loss": 4.6902, "step": 26400 }, { "epoch": 1.0047012435547467, "grad_norm": 6.476230144500732, "learning_rate": 1.9999986534882878e-05, "loss": 4.6364, "step": 26500 }, { "epoch": 1.0084925690021231, "grad_norm": 6.2854084968566895, "learning_rate": 1.999995605974417e-05, "loss": 4.6121, "step": 26600 }, { "epoch": 1.0122838944494996, "grad_norm": 6.504358291625977, "learning_rate": 1.9999908070220685e-05, "loss": 4.6069, "step": 26700 }, { "epoch": 1.0160752198968759, "grad_norm": 7.319517612457275, "learning_rate": 1.9999842566396474e-05, "loss": 4.6163, "step": 26800 }, { "epoch": 1.0198665453442524, "grad_norm": 6.980406284332275, "learning_rate": 1.9999759548386266e-05, "loss": 4.5514, "step": 26900 }, { "epoch": 1.0236578707916288, "grad_norm": 7.115331649780273, "learning_rate": 1.9999659016335463e-05, "loss": 4.5636, "step": 27000 }, { "epoch": 1.027449196239005, "grad_norm": 7.8184967041015625, "learning_rate": 1.9999540970420135e-05, "loss": 4.5771, "step": 27100 }, { "epoch": 1.0312405216863816, "grad_norm": 7.0858893394470215, "learning_rate": 1.9999405410847043e-05, "loss": 4.5522, "step": 27200 }, { "epoch": 1.035031847133758, "grad_norm": 8.256982803344727, "learning_rate": 1.9999252337853605e-05, "loss": 4.5228, "step": 27300 }, { "epoch": 1.0388231725811343, "grad_norm": 6.4392218589782715, "learning_rate": 1.999908175170792e-05, "loss": 4.5276, "step": 27400 }, { "epoch": 1.0426144980285108, "grad_norm": 6.790013790130615, "learning_rate": 1.999889365270877e-05, "loss": 4.5158, "step": 27500 }, { "epoch": 1.046405823475887, "grad_norm": 6.7690606117248535, "learning_rate": 1.9998688041185584e-05, "loss": 4.4907, "step": 27600 }, { "epoch": 1.0501971489232635, "grad_norm": 7.427313804626465, "learning_rate": 1.9998464917498496e-05, "loss": 4.4819, "step": 27700 }, { "epoch": 1.05398847437064, "grad_norm": 7.126349925994873, "learning_rate": 1.999822428203828e-05, "loss": 4.4412, "step": 27800 }, { "epoch": 1.0577797998180163, "grad_norm": 7.040098190307617, "learning_rate": 1.9997966135226412e-05, "loss": 4.4458, "step": 27900 }, { "epoch": 1.0615711252653928, "grad_norm": 8.258943557739258, "learning_rate": 1.999769047751501e-05, "loss": 4.4643, "step": 28000 }, { "epoch": 1.0653624507127692, "grad_norm": 7.388154983520508, "learning_rate": 1.9997397309386876e-05, "loss": 4.4269, "step": 28100 }, { "epoch": 1.0691537761601455, "grad_norm": 7.948140621185303, "learning_rate": 1.9997086631355483e-05, "loss": 4.4294, "step": 28200 }, { "epoch": 1.072945101607522, "grad_norm": 8.284055709838867, "learning_rate": 1.999675844396496e-05, "loss": 4.3871, "step": 28300 }, { "epoch": 1.0767364270548985, "grad_norm": 8.075194358825684, "learning_rate": 1.999641274779012e-05, "loss": 4.4079, "step": 28400 }, { "epoch": 1.0805277525022747, "grad_norm": 8.403909683227539, "learning_rate": 1.9996049543436417e-05, "loss": 4.4228, "step": 28500 }, { "epoch": 1.0843190779496512, "grad_norm": 6.7134690284729, "learning_rate": 1.999566883153999e-05, "loss": 4.412, "step": 28600 }, { "epoch": 1.0881104033970277, "grad_norm": 8.342605590820312, "learning_rate": 1.9995270612767642e-05, "loss": 4.3706, "step": 28700 }, { "epoch": 1.091901728844404, "grad_norm": 8.538543701171875, "learning_rate": 1.999485488781683e-05, "loss": 4.3657, "step": 28800 }, { "epoch": 1.0956930542917804, "grad_norm": 9.306371688842773, "learning_rate": 1.999442165741566e-05, "loss": 4.3653, "step": 28900 }, { "epoch": 1.099484379739157, "grad_norm": 7.590816020965576, "learning_rate": 1.999397092232293e-05, "loss": 4.3569, "step": 29000 }, { "epoch": 1.1032757051865332, "grad_norm": 8.061256408691406, "learning_rate": 1.999350268332807e-05, "loss": 4.3513, "step": 29100 }, { "epoch": 1.1070670306339097, "grad_norm": 8.024669647216797, "learning_rate": 1.999301694125117e-05, "loss": 4.3545, "step": 29200 }, { "epoch": 1.110858356081286, "grad_norm": 9.196686744689941, "learning_rate": 1.999251369694299e-05, "loss": 4.3021, "step": 29300 }, { "epoch": 1.1146496815286624, "grad_norm": 8.209677696228027, "learning_rate": 1.999199295128493e-05, "loss": 4.2894, "step": 29400 }, { "epoch": 1.1184410069760389, "grad_norm": 8.374725341796875, "learning_rate": 1.9991454705189055e-05, "loss": 4.2916, "step": 29500 }, { "epoch": 1.1222323324234151, "grad_norm": 7.0996174812316895, "learning_rate": 1.9990898959598066e-05, "loss": 4.2543, "step": 29600 }, { "epoch": 1.1260236578707916, "grad_norm": 8.065669059753418, "learning_rate": 1.9990325715485325e-05, "loss": 4.2383, "step": 29700 }, { "epoch": 1.129814983318168, "grad_norm": 8.074858665466309, "learning_rate": 1.998973497385484e-05, "loss": 4.2374, "step": 29800 }, { "epoch": 1.1336063087655444, "grad_norm": 8.171634674072266, "learning_rate": 1.9989126735741257e-05, "loss": 4.2326, "step": 29900 }, { "epoch": 1.1373976342129208, "grad_norm": 7.593137741088867, "learning_rate": 1.9988501002209878e-05, "loss": 4.2238, "step": 30000 }, { "epoch": 1.1411889596602973, "grad_norm": 7.71358060836792, "learning_rate": 1.9987857774356644e-05, "loss": 4.2281, "step": 30100 }, { "epoch": 1.1449802851076736, "grad_norm": 7.5212483406066895, "learning_rate": 1.9987197053308126e-05, "loss": 4.1984, "step": 30200 }, { "epoch": 1.14877161055505, "grad_norm": 7.653642654418945, "learning_rate": 1.9986518840221544e-05, "loss": 4.1985, "step": 30300 }, { "epoch": 1.1525629360024265, "grad_norm": 8.598163604736328, "learning_rate": 1.9985823136284752e-05, "loss": 4.1941, "step": 30400 }, { "epoch": 1.1563542614498028, "grad_norm": 8.4173002243042, "learning_rate": 1.9985109942716243e-05, "loss": 4.1978, "step": 30500 }, { "epoch": 1.1601455868971793, "grad_norm": 7.832417011260986, "learning_rate": 1.998437926076513e-05, "loss": 4.1293, "step": 30600 }, { "epoch": 1.1639369123445555, "grad_norm": 7.49244499206543, "learning_rate": 1.998363109171117e-05, "loss": 4.1775, "step": 30700 }, { "epoch": 1.167728237791932, "grad_norm": 8.137709617614746, "learning_rate": 1.9982865436864733e-05, "loss": 4.1407, "step": 30800 }, { "epoch": 1.1715195632393085, "grad_norm": 7.725815773010254, "learning_rate": 1.998208229756683e-05, "loss": 4.1483, "step": 30900 }, { "epoch": 1.1753108886866848, "grad_norm": 9.529303550720215, "learning_rate": 1.998128167518908e-05, "loss": 4.116, "step": 31000 }, { "epoch": 1.1791022141340612, "grad_norm": 9.311180114746094, "learning_rate": 1.998046357113374e-05, "loss": 4.1176, "step": 31100 }, { "epoch": 1.1828935395814377, "grad_norm": 7.9721598625183105, "learning_rate": 1.997962798683366e-05, "loss": 4.0813, "step": 31200 }, { "epoch": 1.186684865028814, "grad_norm": 7.690219402313232, "learning_rate": 1.9978774923752334e-05, "loss": 4.0805, "step": 31300 }, { "epoch": 1.1904761904761905, "grad_norm": 9.044037818908691, "learning_rate": 1.997790438338385e-05, "loss": 4.0909, "step": 31400 }, { "epoch": 1.194267515923567, "grad_norm": 8.840503692626953, "learning_rate": 1.9977016367252916e-05, "loss": 4.0967, "step": 31500 }, { "epoch": 1.1980588413709432, "grad_norm": 7.727489948272705, "learning_rate": 1.9976110876914844e-05, "loss": 4.0695, "step": 31600 }, { "epoch": 1.2018501668183197, "grad_norm": 8.020549774169922, "learning_rate": 1.997518791395555e-05, "loss": 4.0555, "step": 31700 }, { "epoch": 1.2056414922656962, "grad_norm": 9.169303894042969, "learning_rate": 1.9974247479991553e-05, "loss": 4.0506, "step": 31800 }, { "epoch": 1.2094328177130724, "grad_norm": 7.542046070098877, "learning_rate": 1.9973289576669975e-05, "loss": 4.0259, "step": 31900 }, { "epoch": 1.213224143160449, "grad_norm": 8.61517333984375, "learning_rate": 1.9972314205668534e-05, "loss": 4.0613, "step": 32000 }, { "epoch": 1.2170154686078254, "grad_norm": 9.990470886230469, "learning_rate": 1.997132136869553e-05, "loss": 4.0672, "step": 32100 }, { "epoch": 1.2208067940552016, "grad_norm": 8.95771598815918, "learning_rate": 1.9970311067489875e-05, "loss": 4.0198, "step": 32200 }, { "epoch": 1.2245981195025781, "grad_norm": 8.106546401977539, "learning_rate": 1.996928330382105e-05, "loss": 3.9783, "step": 32300 }, { "epoch": 1.2283894449499546, "grad_norm": 9.15388298034668, "learning_rate": 1.9968238079489133e-05, "loss": 4.0098, "step": 32400 }, { "epoch": 1.2321807703973309, "grad_norm": 7.201200485229492, "learning_rate": 1.9967175396324774e-05, "loss": 3.9904, "step": 32500 }, { "epoch": 1.2359720958447074, "grad_norm": 8.28470230102539, "learning_rate": 1.9966095256189206e-05, "loss": 3.9963, "step": 32600 }, { "epoch": 1.2397634212920838, "grad_norm": 8.642807006835938, "learning_rate": 1.996499766097424e-05, "loss": 3.9469, "step": 32700 }, { "epoch": 1.24355474673946, "grad_norm": 7.6962890625, "learning_rate": 1.996388261260225e-05, "loss": 3.9612, "step": 32800 }, { "epoch": 1.2473460721868366, "grad_norm": 8.612123489379883, "learning_rate": 1.9962750113026184e-05, "loss": 3.9656, "step": 32900 }, { "epoch": 1.251137397634213, "grad_norm": 8.585956573486328, "learning_rate": 1.9961600164229557e-05, "loss": 3.9639, "step": 33000 }, { "epoch": 1.2549287230815893, "grad_norm": 8.958784103393555, "learning_rate": 1.996043276822644e-05, "loss": 3.9338, "step": 33100 }, { "epoch": 1.2587200485289658, "grad_norm": 8.623564720153809, "learning_rate": 1.9959247927061463e-05, "loss": 3.9034, "step": 33200 }, { "epoch": 1.262511373976342, "grad_norm": 8.047469139099121, "learning_rate": 1.9958045642809818e-05, "loss": 3.9388, "step": 33300 }, { "epoch": 1.2663026994237185, "grad_norm": 9.270243644714355, "learning_rate": 1.9956825917577233e-05, "loss": 3.8977, "step": 33400 }, { "epoch": 1.270094024871095, "grad_norm": 7.927067279815674, "learning_rate": 1.9955588753499998e-05, "loss": 3.8887, "step": 33500 }, { "epoch": 1.2738853503184713, "grad_norm": 8.39449691772461, "learning_rate": 1.995433415274493e-05, "loss": 3.9123, "step": 33600 }, { "epoch": 1.2776766757658478, "grad_norm": 7.916572570800781, "learning_rate": 1.9953062117509406e-05, "loss": 3.9038, "step": 33700 }, { "epoch": 1.281468001213224, "grad_norm": 8.871452331542969, "learning_rate": 1.9951772650021318e-05, "loss": 3.8732, "step": 33800 }, { "epoch": 1.2852593266606005, "grad_norm": 9.258794784545898, "learning_rate": 1.9950465752539106e-05, "loss": 3.9102, "step": 33900 }, { "epoch": 1.289050652107977, "grad_norm": 8.147697448730469, "learning_rate": 1.9949141427351728e-05, "loss": 3.8495, "step": 34000 }, { "epoch": 1.2928419775553532, "grad_norm": 10.429221153259277, "learning_rate": 1.9947799676778667e-05, "loss": 3.8513, "step": 34100 }, { "epoch": 1.2966333030027297, "grad_norm": 8.67578411102295, "learning_rate": 1.9946440503169926e-05, "loss": 3.8537, "step": 34200 }, { "epoch": 1.3004246284501062, "grad_norm": 8.415848731994629, "learning_rate": 1.9945063908906026e-05, "loss": 3.8005, "step": 34300 }, { "epoch": 1.3042159538974825, "grad_norm": 8.97568130493164, "learning_rate": 1.9943669896398e-05, "loss": 3.8228, "step": 34400 }, { "epoch": 1.308007279344859, "grad_norm": 8.27374267578125, "learning_rate": 1.9942258468087383e-05, "loss": 3.8245, "step": 34500 }, { "epoch": 1.3117986047922354, "grad_norm": 9.306171417236328, "learning_rate": 1.9940829626446218e-05, "loss": 3.8086, "step": 34600 }, { "epoch": 1.3155899302396117, "grad_norm": 9.582313537597656, "learning_rate": 1.9939383373977038e-05, "loss": 3.7739, "step": 34700 }, { "epoch": 1.3193812556869882, "grad_norm": 9.462979316711426, "learning_rate": 1.9937919713212885e-05, "loss": 3.8068, "step": 34800 }, { "epoch": 1.3231725811343646, "grad_norm": 8.853312492370605, "learning_rate": 1.9936438646717277e-05, "loss": 3.7952, "step": 34900 }, { "epoch": 1.326963906581741, "grad_norm": 8.140557289123535, "learning_rate": 1.9934940177084222e-05, "loss": 3.7877, "step": 35000 }, { "epoch": 1.3307552320291174, "grad_norm": 8.893766403198242, "learning_rate": 1.9933424306938208e-05, "loss": 3.7814, "step": 35100 }, { "epoch": 1.3345465574764939, "grad_norm": 8.464703559875488, "learning_rate": 1.993189103893421e-05, "loss": 3.7735, "step": 35200 }, { "epoch": 1.3383378829238701, "grad_norm": 8.840137481689453, "learning_rate": 1.993034037575765e-05, "loss": 3.7675, "step": 35300 }, { "epoch": 1.3421292083712466, "grad_norm": 9.416961669921875, "learning_rate": 1.9928772320124435e-05, "loss": 3.7538, "step": 35400 }, { "epoch": 1.345920533818623, "grad_norm": 8.475790977478027, "learning_rate": 1.992718687478094e-05, "loss": 3.7426, "step": 35500 }, { "epoch": 1.3497118592659993, "grad_norm": 8.619400978088379, "learning_rate": 1.9925584042503975e-05, "loss": 3.7246, "step": 35600 }, { "epoch": 1.3535031847133758, "grad_norm": 8.692911148071289, "learning_rate": 1.992396382610082e-05, "loss": 3.7481, "step": 35700 }, { "epoch": 1.3572945101607523, "grad_norm": 9.832612037658691, "learning_rate": 1.99223262284092e-05, "loss": 3.729, "step": 35800 }, { "epoch": 1.3610858356081286, "grad_norm": 9.229455947875977, "learning_rate": 1.992067125229727e-05, "loss": 3.7453, "step": 35900 }, { "epoch": 1.364877161055505, "grad_norm": 8.48790168762207, "learning_rate": 1.991899890066364e-05, "loss": 3.6917, "step": 36000 }, { "epoch": 1.3686684865028815, "grad_norm": 8.7225341796875, "learning_rate": 1.9917309176437337e-05, "loss": 3.7084, "step": 36100 }, { "epoch": 1.3724598119502578, "grad_norm": 8.597464561462402, "learning_rate": 1.991560208257783e-05, "loss": 3.6883, "step": 36200 }, { "epoch": 1.3762511373976343, "grad_norm": 9.846688270568848, "learning_rate": 1.9913877622074996e-05, "loss": 3.6579, "step": 36300 }, { "epoch": 1.3800424628450108, "grad_norm": 8.503822326660156, "learning_rate": 1.9912135797949137e-05, "loss": 3.6983, "step": 36400 }, { "epoch": 1.383833788292387, "grad_norm": 9.0000581741333, "learning_rate": 1.9910376613250962e-05, "loss": 3.6541, "step": 36500 }, { "epoch": 1.3876251137397635, "grad_norm": 7.895684242248535, "learning_rate": 1.9908600071061593e-05, "loss": 3.6368, "step": 36600 }, { "epoch": 1.3914164391871398, "grad_norm": 9.519074440002441, "learning_rate": 1.9906806174492545e-05, "loss": 3.6593, "step": 36700 }, { "epoch": 1.3952077646345162, "grad_norm": 9.424005508422852, "learning_rate": 1.990499492668573e-05, "loss": 3.6418, "step": 36800 }, { "epoch": 1.3989990900818925, "grad_norm": 8.73523998260498, "learning_rate": 1.9903166330813454e-05, "loss": 3.6175, "step": 36900 }, { "epoch": 1.402790415529269, "grad_norm": 8.736367225646973, "learning_rate": 1.9901320390078398e-05, "loss": 3.6221, "step": 37000 }, { "epoch": 1.4065817409766455, "grad_norm": 8.567627906799316, "learning_rate": 1.9899457107713636e-05, "loss": 3.6066, "step": 37100 }, { "epoch": 1.4103730664240217, "grad_norm": 8.445911407470703, "learning_rate": 1.98975764869826e-05, "loss": 3.5954, "step": 37200 }, { "epoch": 1.4141643918713982, "grad_norm": 8.579058647155762, "learning_rate": 1.9895678531179104e-05, "loss": 3.6303, "step": 37300 }, { "epoch": 1.4179557173187747, "grad_norm": 8.963314056396484, "learning_rate": 1.9893763243627307e-05, "loss": 3.6285, "step": 37400 }, { "epoch": 1.421747042766151, "grad_norm": 9.01703929901123, "learning_rate": 1.9891830627681738e-05, "loss": 3.5916, "step": 37500 }, { "epoch": 1.4255383682135274, "grad_norm": 9.465667724609375, "learning_rate": 1.988988068672727e-05, "loss": 3.6069, "step": 37600 }, { "epoch": 1.429329693660904, "grad_norm": 8.692371368408203, "learning_rate": 1.988791342417911e-05, "loss": 3.5754, "step": 37700 }, { "epoch": 1.4331210191082802, "grad_norm": 8.821332931518555, "learning_rate": 1.9885928843482824e-05, "loss": 3.6187, "step": 37800 }, { "epoch": 1.4369123445556566, "grad_norm": 10.146221160888672, "learning_rate": 1.9883926948114294e-05, "loss": 3.5864, "step": 37900 }, { "epoch": 1.4407036700030331, "grad_norm": 8.599617958068848, "learning_rate": 1.988190774157973e-05, "loss": 3.57, "step": 38000 }, { "epoch": 1.4444949954504094, "grad_norm": 8.331599235534668, "learning_rate": 1.9879871227415668e-05, "loss": 3.5705, "step": 38100 }, { "epoch": 1.4482863208977859, "grad_norm": 8.810132026672363, "learning_rate": 1.987781740918895e-05, "loss": 3.5597, "step": 38200 }, { "epoch": 1.4520776463451623, "grad_norm": 9.46640682220459, "learning_rate": 1.9875746290496725e-05, "loss": 3.5317, "step": 38300 }, { "epoch": 1.4558689717925386, "grad_norm": 8.556076049804688, "learning_rate": 1.987365787496645e-05, "loss": 3.5745, "step": 38400 }, { "epoch": 1.459660297239915, "grad_norm": 8.548824310302734, "learning_rate": 1.9871552166255874e-05, "loss": 3.527, "step": 38500 }, { "epoch": 1.4634516226872916, "grad_norm": 7.90036153793335, "learning_rate": 1.986942916805303e-05, "loss": 3.5709, "step": 38600 }, { "epoch": 1.4672429481346678, "grad_norm": 9.092789649963379, "learning_rate": 1.9867288884076235e-05, "loss": 3.5195, "step": 38700 }, { "epoch": 1.4710342735820443, "grad_norm": 10.739933967590332, "learning_rate": 1.986513131807408e-05, "loss": 3.5069, "step": 38800 }, { "epoch": 1.4748255990294208, "grad_norm": 8.57911205291748, "learning_rate": 1.986295647382543e-05, "loss": 3.5589, "step": 38900 }, { "epoch": 1.478616924476797, "grad_norm": 8.822707176208496, "learning_rate": 1.98607643551394e-05, "loss": 3.5209, "step": 39000 }, { "epoch": 1.4824082499241735, "grad_norm": 9.19753646850586, "learning_rate": 1.9858554965855377e-05, "loss": 3.516, "step": 39100 }, { "epoch": 1.48619957537155, "grad_norm": 9.257062911987305, "learning_rate": 1.985632830984298e-05, "loss": 3.5188, "step": 39200 }, { "epoch": 1.4899909008189263, "grad_norm": 9.091058731079102, "learning_rate": 1.9854084391002084e-05, "loss": 3.504, "step": 39300 }, { "epoch": 1.4937822262663027, "grad_norm": 9.342292785644531, "learning_rate": 1.9851823213262786e-05, "loss": 3.4943, "step": 39400 }, { "epoch": 1.4975735517136792, "grad_norm": 9.467326164245605, "learning_rate": 1.984954478058542e-05, "loss": 3.4756, "step": 39500 }, { "epoch": 1.5013648771610555, "grad_norm": 9.375906944274902, "learning_rate": 1.984724909696054e-05, "loss": 3.4876, "step": 39600 }, { "epoch": 1.5051562026084317, "grad_norm": 9.445342063903809, "learning_rate": 1.98449361664089e-05, "loss": 3.4866, "step": 39700 }, { "epoch": 1.5089475280558085, "grad_norm": 8.7040433883667, "learning_rate": 1.9842605992981494e-05, "loss": 3.4658, "step": 39800 }, { "epoch": 1.5127388535031847, "grad_norm": 8.078710556030273, "learning_rate": 1.984025858075948e-05, "loss": 3.4632, "step": 39900 }, { "epoch": 1.516530178950561, "grad_norm": 8.885797500610352, "learning_rate": 1.983789393385423e-05, "loss": 3.4647, "step": 40000 }, { "epoch": 1.5203215043979377, "grad_norm": 9.236519813537598, "learning_rate": 1.9835512056407295e-05, "loss": 3.4296, "step": 40100 }, { "epoch": 1.524112829845314, "grad_norm": 10.012004852294922, "learning_rate": 1.98331129525904e-05, "loss": 3.4014, "step": 40200 }, { "epoch": 1.5279041552926902, "grad_norm": 8.906109809875488, "learning_rate": 1.9830696626605456e-05, "loss": 3.4375, "step": 40300 }, { "epoch": 1.5316954807400667, "grad_norm": 9.195486068725586, "learning_rate": 1.982826308268452e-05, "loss": 3.4238, "step": 40400 }, { "epoch": 1.5354868061874432, "grad_norm": 8.66920280456543, "learning_rate": 1.9825812325089817e-05, "loss": 3.4029, "step": 40500 }, { "epoch": 1.5392781316348194, "grad_norm": 9.20577335357666, "learning_rate": 1.982334435811372e-05, "loss": 3.4073, "step": 40600 }, { "epoch": 1.543069457082196, "grad_norm": 9.310471534729004, "learning_rate": 1.9820859186078732e-05, "loss": 3.4062, "step": 40700 }, { "epoch": 1.5468607825295724, "grad_norm": 8.580398559570312, "learning_rate": 1.9818356813337502e-05, "loss": 3.4246, "step": 40800 }, { "epoch": 1.5506521079769486, "grad_norm": 9.721495628356934, "learning_rate": 1.98158372442728e-05, "loss": 3.4315, "step": 40900 }, { "epoch": 1.5544434334243251, "grad_norm": 8.983660697937012, "learning_rate": 1.981330048329752e-05, "loss": 3.3885, "step": 41000 }, { "epoch": 1.5582347588717016, "grad_norm": 9.241628646850586, "learning_rate": 1.9810746534854657e-05, "loss": 3.382, "step": 41100 }, { "epoch": 1.5620260843190779, "grad_norm": 9.482072830200195, "learning_rate": 1.980817540341732e-05, "loss": 3.3483, "step": 41200 }, { "epoch": 1.5658174097664543, "grad_norm": 9.437420845031738, "learning_rate": 1.9805587093488704e-05, "loss": 3.3888, "step": 41300 }, { "epoch": 1.5696087352138308, "grad_norm": 10.110761642456055, "learning_rate": 1.9802981609602092e-05, "loss": 3.3914, "step": 41400 }, { "epoch": 1.573400060661207, "grad_norm": 9.664725303649902, "learning_rate": 1.980035895632085e-05, "loss": 3.3446, "step": 41500 }, { "epoch": 1.5771913861085836, "grad_norm": 9.232138633728027, "learning_rate": 1.9797719138238417e-05, "loss": 3.4012, "step": 41600 }, { "epoch": 1.58098271155596, "grad_norm": 9.467063903808594, "learning_rate": 1.9795062159978292e-05, "loss": 3.3396, "step": 41700 }, { "epoch": 1.5847740370033363, "grad_norm": 8.991984367370605, "learning_rate": 1.9792388026194027e-05, "loss": 3.3687, "step": 41800 }, { "epoch": 1.5885653624507128, "grad_norm": 9.068346977233887, "learning_rate": 1.978969674156922e-05, "loss": 3.3553, "step": 41900 }, { "epoch": 1.5923566878980893, "grad_norm": 9.739985466003418, "learning_rate": 1.9786988310817523e-05, "loss": 3.3233, "step": 42000 }, { "epoch": 1.5961480133454655, "grad_norm": 8.56322956085205, "learning_rate": 1.9784262738682596e-05, "loss": 3.3338, "step": 42100 }, { "epoch": 1.599939338792842, "grad_norm": 9.217336654663086, "learning_rate": 1.9781520029938134e-05, "loss": 3.3169, "step": 42200 }, { "epoch": 1.6037306642402185, "grad_norm": 9.182036399841309, "learning_rate": 1.9778760189387848e-05, "loss": 3.329, "step": 42300 }, { "epoch": 1.6075219896875947, "grad_norm": 8.042176246643066, "learning_rate": 1.9775983221865442e-05, "loss": 3.3206, "step": 42400 }, { "epoch": 1.6113133151349712, "grad_norm": 9.8140230178833, "learning_rate": 1.9773189132234632e-05, "loss": 3.309, "step": 42500 }, { "epoch": 1.6151046405823477, "grad_norm": 9.491947174072266, "learning_rate": 1.9770377925389116e-05, "loss": 3.3464, "step": 42600 }, { "epoch": 1.618895966029724, "grad_norm": 9.23323917388916, "learning_rate": 1.9767549606252564e-05, "loss": 3.2843, "step": 42700 }, { "epoch": 1.6226872914771004, "grad_norm": 9.023582458496094, "learning_rate": 1.9764704179778635e-05, "loss": 3.3141, "step": 42800 }, { "epoch": 1.626478616924477, "grad_norm": 10.815488815307617, "learning_rate": 1.9761841650950935e-05, "loss": 3.2959, "step": 42900 }, { "epoch": 1.6302699423718532, "grad_norm": 9.213822364807129, "learning_rate": 1.975896202478303e-05, "loss": 3.2885, "step": 43000 }, { "epoch": 1.6340612678192294, "grad_norm": 10.04034423828125, "learning_rate": 1.9756065306318427e-05, "loss": 3.2901, "step": 43100 }, { "epoch": 1.6378525932666061, "grad_norm": 8.469430923461914, "learning_rate": 1.9753151500630575e-05, "loss": 3.317, "step": 43200 }, { "epoch": 1.6416439187139824, "grad_norm": 10.34746265411377, "learning_rate": 1.975022061282285e-05, "loss": 3.2995, "step": 43300 }, { "epoch": 1.6454352441613587, "grad_norm": 8.371482849121094, "learning_rate": 1.974727264802854e-05, "loss": 3.274, "step": 43400 }, { "epoch": 1.6492265696087354, "grad_norm": 9.099817276000977, "learning_rate": 1.974430761141085e-05, "loss": 3.2771, "step": 43500 }, { "epoch": 1.6530178950561116, "grad_norm": 9.906704902648926, "learning_rate": 1.974132550816288e-05, "loss": 3.3168, "step": 43600 }, { "epoch": 1.656809220503488, "grad_norm": 9.77303409576416, "learning_rate": 1.9738326343507623e-05, "loss": 3.2937, "step": 43700 }, { "epoch": 1.6606005459508644, "grad_norm": 9.168594360351562, "learning_rate": 1.973531012269796e-05, "loss": 3.2659, "step": 43800 }, { "epoch": 1.6643918713982409, "grad_norm": 10.06536865234375, "learning_rate": 1.973227685101663e-05, "loss": 3.2529, "step": 43900 }, { "epoch": 1.6681831968456171, "grad_norm": 9.138151168823242, "learning_rate": 1.9729226533776252e-05, "loss": 3.2699, "step": 44000 }, { "epoch": 1.6719745222929936, "grad_norm": 9.641526222229004, "learning_rate": 1.9726159176319292e-05, "loss": 3.2616, "step": 44100 }, { "epoch": 1.67576584774037, "grad_norm": 9.27478313446045, "learning_rate": 1.9723074784018056e-05, "loss": 3.2262, "step": 44200 }, { "epoch": 1.6795571731877463, "grad_norm": 8.915653228759766, "learning_rate": 1.97199733622747e-05, "loss": 3.1745, "step": 44300 }, { "epoch": 1.6833484986351228, "grad_norm": 10.35103988647461, "learning_rate": 1.971685491652119e-05, "loss": 3.2636, "step": 44400 }, { "epoch": 1.6871398240824993, "grad_norm": 8.449880599975586, "learning_rate": 1.971371945221932e-05, "loss": 3.2247, "step": 44500 }, { "epoch": 1.6909311495298756, "grad_norm": 9.980459213256836, "learning_rate": 1.9710566974860683e-05, "loss": 3.2229, "step": 44600 }, { "epoch": 1.694722474977252, "grad_norm": 8.53636646270752, "learning_rate": 1.9707397489966675e-05, "loss": 3.201, "step": 44700 }, { "epoch": 1.6985138004246285, "grad_norm": 9.442465782165527, "learning_rate": 1.970421100308848e-05, "loss": 3.202, "step": 44800 }, { "epoch": 1.7023051258720048, "grad_norm": 9.770416259765625, "learning_rate": 1.970100751980706e-05, "loss": 3.1965, "step": 44900 }, { "epoch": 1.7060964513193813, "grad_norm": 10.412403106689453, "learning_rate": 1.969778704573314e-05, "loss": 3.2065, "step": 45000 }, { "epoch": 1.7098877767667577, "grad_norm": 10.858807563781738, "learning_rate": 1.9694549586507202e-05, "loss": 3.2252, "step": 45100 }, { "epoch": 1.713679102214134, "grad_norm": 9.193475723266602, "learning_rate": 1.9691295147799492e-05, "loss": 3.2086, "step": 45200 }, { "epoch": 1.7174704276615105, "grad_norm": 9.386664390563965, "learning_rate": 1.9688023735309977e-05, "loss": 3.1981, "step": 45300 }, { "epoch": 1.721261753108887, "grad_norm": 9.21669864654541, "learning_rate": 1.9684735354768363e-05, "loss": 3.1453, "step": 45400 }, { "epoch": 1.7250530785562632, "grad_norm": 8.74583911895752, "learning_rate": 1.9681430011934074e-05, "loss": 3.1742, "step": 45500 }, { "epoch": 1.7288444040036397, "grad_norm": 8.638651847839355, "learning_rate": 1.967810771259623e-05, "loss": 3.192, "step": 45600 }, { "epoch": 1.7326357294510162, "grad_norm": 10.432538986206055, "learning_rate": 1.9674768462573674e-05, "loss": 3.2047, "step": 45700 }, { "epoch": 1.7364270548983924, "grad_norm": 9.020901679992676, "learning_rate": 1.967141226771491e-05, "loss": 3.1885, "step": 45800 }, { "epoch": 1.740218380345769, "grad_norm": 9.195768356323242, "learning_rate": 1.9668039133898137e-05, "loss": 3.1514, "step": 45900 }, { "epoch": 1.7440097057931454, "grad_norm": 10.438103675842285, "learning_rate": 1.966464906703122e-05, "loss": 3.1891, "step": 46000 }, { "epoch": 1.7478010312405217, "grad_norm": 9.233426094055176, "learning_rate": 1.9661242073051678e-05, "loss": 3.1455, "step": 46100 }, { "epoch": 1.7515923566878981, "grad_norm": 10.126344680786133, "learning_rate": 1.9657818157926677e-05, "loss": 3.1713, "step": 46200 }, { "epoch": 1.7553836821352746, "grad_norm": 9.437684059143066, "learning_rate": 1.9654377327653015e-05, "loss": 3.1365, "step": 46300 }, { "epoch": 1.7591750075826509, "grad_norm": 10.739609718322754, "learning_rate": 1.9650919588257125e-05, "loss": 3.1549, "step": 46400 }, { "epoch": 1.7629663330300271, "grad_norm": 9.439779281616211, "learning_rate": 1.9647444945795056e-05, "loss": 3.1578, "step": 46500 }, { "epoch": 1.7667576584774038, "grad_norm": 9.716024398803711, "learning_rate": 1.964395340635245e-05, "loss": 3.1821, "step": 46600 }, { "epoch": 1.77054898392478, "grad_norm": 9.943897247314453, "learning_rate": 1.9640444976044552e-05, "loss": 3.1812, "step": 46700 }, { "epoch": 1.7743403093721564, "grad_norm": 10.687925338745117, "learning_rate": 1.963691966101619e-05, "loss": 3.184, "step": 46800 }, { "epoch": 1.778131634819533, "grad_norm": 9.503206253051758, "learning_rate": 1.9633377467441767e-05, "loss": 3.1466, "step": 46900 }, { "epoch": 1.7819229602669093, "grad_norm": 10.196721076965332, "learning_rate": 1.962981840152524e-05, "loss": 3.1523, "step": 47000 }, { "epoch": 1.7857142857142856, "grad_norm": 10.124358177185059, "learning_rate": 1.962624246950012e-05, "loss": 3.1522, "step": 47100 }, { "epoch": 1.789505611161662, "grad_norm": 9.147980690002441, "learning_rate": 1.9622649677629466e-05, "loss": 3.1592, "step": 47200 }, { "epoch": 1.7932969366090385, "grad_norm": 9.038829803466797, "learning_rate": 1.9619040032205855e-05, "loss": 3.1096, "step": 47300 }, { "epoch": 1.7970882620564148, "grad_norm": 9.316182136535645, "learning_rate": 1.9615413539551382e-05, "loss": 3.1137, "step": 47400 }, { "epoch": 1.8008795875037913, "grad_norm": 9.435154914855957, "learning_rate": 1.9611770206017663e-05, "loss": 3.1072, "step": 47500 }, { "epoch": 1.8046709129511678, "grad_norm": 10.25540828704834, "learning_rate": 1.9608110037985796e-05, "loss": 3.1575, "step": 47600 }, { "epoch": 1.808462238398544, "grad_norm": 9.306611061096191, "learning_rate": 1.9604433041866367e-05, "loss": 3.1252, "step": 47700 }, { "epoch": 1.8122535638459205, "grad_norm": 8.667296409606934, "learning_rate": 1.960073922409944e-05, "loss": 3.0843, "step": 47800 }, { "epoch": 1.816044889293297, "grad_norm": 10.393031120300293, "learning_rate": 1.9597028591154535e-05, "loss": 3.1267, "step": 47900 }, { "epoch": 1.8198362147406733, "grad_norm": 9.919468879699707, "learning_rate": 1.959330114953063e-05, "loss": 3.0886, "step": 48000 }, { "epoch": 1.8236275401880497, "grad_norm": 10.632074356079102, "learning_rate": 1.958955690575613e-05, "loss": 3.1031, "step": 48100 }, { "epoch": 1.8274188656354262, "grad_norm": 9.928255081176758, "learning_rate": 1.958579586638889e-05, "loss": 3.0818, "step": 48200 }, { "epoch": 1.8312101910828025, "grad_norm": 9.859417915344238, "learning_rate": 1.9582018038016156e-05, "loss": 3.0983, "step": 48300 }, { "epoch": 1.835001516530179, "grad_norm": 9.056523323059082, "learning_rate": 1.9578223427254594e-05, "loss": 3.1168, "step": 48400 }, { "epoch": 1.8387928419775554, "grad_norm": 10.139375686645508, "learning_rate": 1.9574412040750267e-05, "loss": 3.0915, "step": 48500 }, { "epoch": 1.8425841674249317, "grad_norm": 9.003227233886719, "learning_rate": 1.9570583885178604e-05, "loss": 3.052, "step": 48600 }, { "epoch": 1.8463754928723082, "grad_norm": 10.701089859008789, "learning_rate": 1.9566738967244413e-05, "loss": 3.0786, "step": 48700 }, { "epoch": 1.8501668183196847, "grad_norm": 10.447517395019531, "learning_rate": 1.956287729368187e-05, "loss": 3.0703, "step": 48800 }, { "epoch": 1.853958143767061, "grad_norm": 9.662825584411621, "learning_rate": 1.9558998871254483e-05, "loss": 3.0155, "step": 48900 }, { "epoch": 1.8577494692144374, "grad_norm": 8.822328567504883, "learning_rate": 1.9555103706755103e-05, "loss": 3.0624, "step": 49000 }, { "epoch": 1.8615407946618139, "grad_norm": 10.724114418029785, "learning_rate": 1.9551191807005896e-05, "loss": 3.0574, "step": 49100 }, { "epoch": 1.8653321201091901, "grad_norm": 9.055379867553711, "learning_rate": 1.9547263178858346e-05, "loss": 3.0203, "step": 49200 }, { "epoch": 1.8691234455565666, "grad_norm": 8.167655944824219, "learning_rate": 1.9543317829193237e-05, "loss": 3.0425, "step": 49300 }, { "epoch": 1.872914771003943, "grad_norm": 10.045111656188965, "learning_rate": 1.953935576492063e-05, "loss": 3.0356, "step": 49400 }, { "epoch": 1.8767060964513194, "grad_norm": 8.582369804382324, "learning_rate": 1.9535376992979872e-05, "loss": 3.0425, "step": 49500 }, { "epoch": 1.8804974218986956, "grad_norm": 9.363085746765137, "learning_rate": 1.953138152033957e-05, "loss": 3.0549, "step": 49600 }, { "epoch": 1.8842887473460723, "grad_norm": 9.094735145568848, "learning_rate": 1.9527369353997573e-05, "loss": 3.0167, "step": 49700 }, { "epoch": 1.8880800727934486, "grad_norm": 8.768986701965332, "learning_rate": 1.952334050098098e-05, "loss": 3.0416, "step": 49800 }, { "epoch": 1.8918713982408248, "grad_norm": 9.712780952453613, "learning_rate": 1.951929496834611e-05, "loss": 3.0354, "step": 49900 }, { "epoch": 1.8956627236882015, "grad_norm": 8.942931175231934, "learning_rate": 1.9515232763178488e-05, "loss": 3.0256, "step": 50000 }, { "epoch": 1.8994540491355778, "grad_norm": 10.055891036987305, "learning_rate": 1.9511153892592856e-05, "loss": 3.0218, "step": 50100 }, { "epoch": 1.903245374582954, "grad_norm": 9.622611999511719, "learning_rate": 1.9507058363733132e-05, "loss": 3.0293, "step": 50200 }, { "epoch": 1.9070367000303308, "grad_norm": 10.56961441040039, "learning_rate": 1.950294618377242e-05, "loss": 3.0059, "step": 50300 }, { "epoch": 1.910828025477707, "grad_norm": 9.72450065612793, "learning_rate": 1.9498817359912973e-05, "loss": 2.993, "step": 50400 }, { "epoch": 1.9146193509250833, "grad_norm": 10.062677383422852, "learning_rate": 1.949467189938621e-05, "loss": 2.9962, "step": 50500 }, { "epoch": 1.9184106763724598, "grad_norm": 10.106364250183105, "learning_rate": 1.949050980945268e-05, "loss": 3.0142, "step": 50600 }, { "epoch": 1.9222020018198362, "grad_norm": 10.315230369567871, "learning_rate": 1.9486331097402062e-05, "loss": 3.0252, "step": 50700 }, { "epoch": 1.9259933272672125, "grad_norm": 10.710204124450684, "learning_rate": 1.948213577055314e-05, "loss": 2.9636, "step": 50800 }, { "epoch": 1.929784652714589, "grad_norm": 9.532504081726074, "learning_rate": 1.9477923836253807e-05, "loss": 2.9782, "step": 50900 }, { "epoch": 1.9335759781619655, "grad_norm": 9.351314544677734, "learning_rate": 1.947369530188104e-05, "loss": 2.9872, "step": 51000 }, { "epoch": 1.9373673036093417, "grad_norm": 10.265904426574707, "learning_rate": 1.946945017484089e-05, "loss": 2.9684, "step": 51100 }, { "epoch": 1.9411586290567182, "grad_norm": 10.1696195602417, "learning_rate": 1.9465188462568462e-05, "loss": 3.0338, "step": 51200 }, { "epoch": 1.9449499545040947, "grad_norm": 10.403696060180664, "learning_rate": 1.9460910172527922e-05, "loss": 2.9885, "step": 51300 }, { "epoch": 1.948741279951471, "grad_norm": 9.708459854125977, "learning_rate": 1.9456615312212462e-05, "loss": 2.9632, "step": 51400 }, { "epoch": 1.9525326053988474, "grad_norm": 9.33719253540039, "learning_rate": 1.94523038891443e-05, "loss": 2.9812, "step": 51500 }, { "epoch": 1.956323930846224, "grad_norm": 10.63798999786377, "learning_rate": 1.944797591087466e-05, "loss": 2.9641, "step": 51600 }, { "epoch": 1.9601152562936002, "grad_norm": 9.456490516662598, "learning_rate": 1.9443631384983765e-05, "loss": 2.9417, "step": 51700 }, { "epoch": 1.9639065817409767, "grad_norm": 10.023907661437988, "learning_rate": 1.9439270319080818e-05, "loss": 2.9847, "step": 51800 }, { "epoch": 1.9676979071883531, "grad_norm": 9.885372161865234, "learning_rate": 1.943489272080399e-05, "loss": 2.9869, "step": 51900 }, { "epoch": 1.9714892326357294, "grad_norm": 9.402313232421875, "learning_rate": 1.9430498597820405e-05, "loss": 2.9752, "step": 52000 }, { "epoch": 1.9752805580831059, "grad_norm": 10.209196090698242, "learning_rate": 1.9426087957826142e-05, "loss": 2.9701, "step": 52100 }, { "epoch": 1.9790718835304824, "grad_norm": 9.64765739440918, "learning_rate": 1.942166080854619e-05, "loss": 2.9717, "step": 52200 }, { "epoch": 1.9828632089778586, "grad_norm": 8.506855964660645, "learning_rate": 1.941721715773447e-05, "loss": 2.9359, "step": 52300 }, { "epoch": 1.986654534425235, "grad_norm": 10.606724739074707, "learning_rate": 1.9412757013173792e-05, "loss": 2.9269, "step": 52400 }, { "epoch": 1.9904458598726116, "grad_norm": 9.08190631866455, "learning_rate": 1.9408280382675862e-05, "loss": 2.9644, "step": 52500 }, { "epoch": 1.9942371853199878, "grad_norm": 9.14134693145752, "learning_rate": 1.9403787274081257e-05, "loss": 2.9683, "step": 52600 }, { "epoch": 1.9980285107673643, "grad_norm": 9.692643165588379, "learning_rate": 1.9399277695259418e-05, "loss": 2.9397, "step": 52700 }, { "epoch": 2.0, "eval_accuracy": 0.4195319090452404, "eval_loss": 2.850438117980957, "eval_runtime": 934.9167, "eval_samples_per_second": 902.78, "eval_steps_per_second": 7.053, "step": 52752 }, { "epoch": 2.001819836214741, "grad_norm": 9.1592435836792, "learning_rate": 1.9394751654108622e-05, "loss": 2.8454, "step": 52800 }, { "epoch": 2.005611161662117, "grad_norm": 10.475399017333984, "learning_rate": 1.9390209158555993e-05, "loss": 2.8006, "step": 52900 }, { "epoch": 2.0094024871094933, "grad_norm": 9.16231918334961, "learning_rate": 1.9385650216557464e-05, "loss": 2.7779, "step": 53000 }, { "epoch": 2.01319381255687, "grad_norm": 9.519156455993652, "learning_rate": 1.938107483609778e-05, "loss": 2.7734, "step": 53100 }, { "epoch": 2.0169851380042463, "grad_norm": 9.14332103729248, "learning_rate": 1.937648302519047e-05, "loss": 2.7672, "step": 53200 }, { "epoch": 2.0207764634516225, "grad_norm": 9.102188110351562, "learning_rate": 1.9371874791877843e-05, "loss": 2.7719, "step": 53300 }, { "epoch": 2.0245677888989992, "grad_norm": 9.996235847473145, "learning_rate": 1.9367250144230974e-05, "loss": 2.7586, "step": 53400 }, { "epoch": 2.0283591143463755, "grad_norm": 9.926887512207031, "learning_rate": 1.9362609090349685e-05, "loss": 2.7795, "step": 53500 }, { "epoch": 2.0321504397937518, "grad_norm": 10.159876823425293, "learning_rate": 1.9357951638362534e-05, "loss": 2.7434, "step": 53600 }, { "epoch": 2.0359417652411285, "grad_norm": 10.211610794067383, "learning_rate": 1.9353277796426788e-05, "loss": 2.7669, "step": 53700 }, { "epoch": 2.0397330906885047, "grad_norm": 9.650534629821777, "learning_rate": 1.934858757272844e-05, "loss": 2.763, "step": 53800 }, { "epoch": 2.043524416135881, "grad_norm": 9.410533905029297, "learning_rate": 1.934388097548216e-05, "loss": 2.7898, "step": 53900 }, { "epoch": 2.0473157415832577, "grad_norm": 9.720155715942383, "learning_rate": 1.93391580129313e-05, "loss": 2.7697, "step": 54000 }, { "epoch": 2.051107067030634, "grad_norm": 10.379890441894531, "learning_rate": 1.933441869334787e-05, "loss": 2.7813, "step": 54100 }, { "epoch": 2.05489839247801, "grad_norm": 8.97773551940918, "learning_rate": 1.9329663025032545e-05, "loss": 2.7395, "step": 54200 }, { "epoch": 2.058689717925387, "grad_norm": 10.00265121459961, "learning_rate": 1.9324891016314614e-05, "loss": 2.7482, "step": 54300 }, { "epoch": 2.062481043372763, "grad_norm": 10.32107162475586, "learning_rate": 1.932010267555199e-05, "loss": 2.7445, "step": 54400 }, { "epoch": 2.0662723688201394, "grad_norm": 10.097314834594727, "learning_rate": 1.93152980111312e-05, "loss": 2.7641, "step": 54500 }, { "epoch": 2.070063694267516, "grad_norm": 9.958163261413574, "learning_rate": 1.9310477031467357e-05, "loss": 2.7456, "step": 54600 }, { "epoch": 2.0738550197148924, "grad_norm": 11.162203788757324, "learning_rate": 1.930563974500414e-05, "loss": 2.7393, "step": 54700 }, { "epoch": 2.0776463451622686, "grad_norm": 9.901957511901855, "learning_rate": 1.9300786160213804e-05, "loss": 2.7728, "step": 54800 }, { "epoch": 2.0814376706096454, "grad_norm": 10.857941627502441, "learning_rate": 1.9295916285597134e-05, "loss": 2.7354, "step": 54900 }, { "epoch": 2.0852289960570216, "grad_norm": 11.06561279296875, "learning_rate": 1.929103012968346e-05, "loss": 2.7198, "step": 55000 }, { "epoch": 2.089020321504398, "grad_norm": 10.65528392791748, "learning_rate": 1.9286127701030615e-05, "loss": 2.7376, "step": 55100 }, { "epoch": 2.092811646951774, "grad_norm": 9.537734985351562, "learning_rate": 1.9281209008224943e-05, "loss": 2.7248, "step": 55200 }, { "epoch": 2.096602972399151, "grad_norm": 9.704449653625488, "learning_rate": 1.9276274059881265e-05, "loss": 2.73, "step": 55300 }, { "epoch": 2.100394297846527, "grad_norm": 10.117986679077148, "learning_rate": 1.9271322864642884e-05, "loss": 2.7279, "step": 55400 }, { "epoch": 2.1041856232939034, "grad_norm": 10.865509986877441, "learning_rate": 1.926635543118155e-05, "loss": 2.7211, "step": 55500 }, { "epoch": 2.10797694874128, "grad_norm": 10.102231979370117, "learning_rate": 1.926137176819745e-05, "loss": 2.7053, "step": 55600 }, { "epoch": 2.1117682741886563, "grad_norm": 10.646932601928711, "learning_rate": 1.9256371884419208e-05, "loss": 2.7176, "step": 55700 }, { "epoch": 2.1155595996360326, "grad_norm": 10.735231399536133, "learning_rate": 1.9251355788603846e-05, "loss": 2.706, "step": 55800 }, { "epoch": 2.1193509250834093, "grad_norm": 9.475335121154785, "learning_rate": 1.9246323489536795e-05, "loss": 2.7263, "step": 55900 }, { "epoch": 2.1231422505307855, "grad_norm": 9.986281394958496, "learning_rate": 1.9241274996031844e-05, "loss": 2.724, "step": 56000 }, { "epoch": 2.126933575978162, "grad_norm": 11.062609672546387, "learning_rate": 1.9236210316931165e-05, "loss": 2.7309, "step": 56100 }, { "epoch": 2.1307249014255385, "grad_norm": 9.928868293762207, "learning_rate": 1.9231129461105267e-05, "loss": 2.6911, "step": 56200 }, { "epoch": 2.1345162268729148, "grad_norm": 10.622503280639648, "learning_rate": 1.9226032437452998e-05, "loss": 2.7369, "step": 56300 }, { "epoch": 2.138307552320291, "grad_norm": 9.557900428771973, "learning_rate": 1.922091925490152e-05, "loss": 2.7221, "step": 56400 }, { "epoch": 2.1420988777676677, "grad_norm": 10.240161895751953, "learning_rate": 1.92157899224063e-05, "loss": 2.7256, "step": 56500 }, { "epoch": 2.145890203215044, "grad_norm": 9.75886058807373, "learning_rate": 1.9210644448951083e-05, "loss": 2.7372, "step": 56600 }, { "epoch": 2.1496815286624202, "grad_norm": 11.09937858581543, "learning_rate": 1.920548284354789e-05, "loss": 2.7079, "step": 56700 }, { "epoch": 2.153472854109797, "grad_norm": 9.598326683044434, "learning_rate": 1.9200305115236996e-05, "loss": 2.7152, "step": 56800 }, { "epoch": 2.157264179557173, "grad_norm": 9.500566482543945, "learning_rate": 1.9195111273086914e-05, "loss": 2.7319, "step": 56900 }, { "epoch": 2.1610555050045495, "grad_norm": 10.082803726196289, "learning_rate": 1.9189901326194377e-05, "loss": 2.7115, "step": 57000 }, { "epoch": 2.164846830451926, "grad_norm": 9.727087020874023, "learning_rate": 1.918467528368433e-05, "loss": 2.6799, "step": 57100 }, { "epoch": 2.1686381558993024, "grad_norm": 11.060436248779297, "learning_rate": 1.9179433154709902e-05, "loss": 2.7279, "step": 57200 }, { "epoch": 2.1724294813466787, "grad_norm": 10.901908874511719, "learning_rate": 1.91741749484524e-05, "loss": 2.7245, "step": 57300 }, { "epoch": 2.1762208067940554, "grad_norm": 12.0674409866333, "learning_rate": 1.9168900674121287e-05, "loss": 2.7013, "step": 57400 }, { "epoch": 2.1800121322414316, "grad_norm": 9.407752990722656, "learning_rate": 1.9163610340954175e-05, "loss": 2.6972, "step": 57500 }, { "epoch": 2.183803457688808, "grad_norm": 10.755790710449219, "learning_rate": 1.9158303958216797e-05, "loss": 2.7209, "step": 57600 }, { "epoch": 2.1875947831361846, "grad_norm": 9.51340103149414, "learning_rate": 1.9152981535202995e-05, "loss": 2.6486, "step": 57700 }, { "epoch": 2.191386108583561, "grad_norm": 10.846084594726562, "learning_rate": 1.914764308123471e-05, "loss": 2.6892, "step": 57800 }, { "epoch": 2.195177434030937, "grad_norm": 10.45319652557373, "learning_rate": 1.914228860566195e-05, "loss": 2.652, "step": 57900 }, { "epoch": 2.198968759478314, "grad_norm": 9.247453689575195, "learning_rate": 1.9136918117862797e-05, "loss": 2.6651, "step": 58000 }, { "epoch": 2.20276008492569, "grad_norm": 9.506221771240234, "learning_rate": 1.9131531627243367e-05, "loss": 2.6972, "step": 58100 }, { "epoch": 2.2065514103730663, "grad_norm": 11.089963912963867, "learning_rate": 1.9126129143237814e-05, "loss": 2.665, "step": 58200 }, { "epoch": 2.2103427358204426, "grad_norm": 10.59587287902832, "learning_rate": 1.912071067530829e-05, "loss": 2.698, "step": 58300 }, { "epoch": 2.2141340612678193, "grad_norm": 10.834369659423828, "learning_rate": 1.911527623294496e-05, "loss": 2.6795, "step": 58400 }, { "epoch": 2.2179253867151956, "grad_norm": 9.742694854736328, "learning_rate": 1.9109825825665948e-05, "loss": 2.6678, "step": 58500 }, { "epoch": 2.221716712162572, "grad_norm": 10.867846488952637, "learning_rate": 1.9104359463017354e-05, "loss": 2.7003, "step": 58600 }, { "epoch": 2.2255080376099485, "grad_norm": 9.668331146240234, "learning_rate": 1.9098877154573218e-05, "loss": 2.6256, "step": 58700 }, { "epoch": 2.229299363057325, "grad_norm": 10.75864315032959, "learning_rate": 1.9093378909935503e-05, "loss": 2.6765, "step": 58800 }, { "epoch": 2.233090688504701, "grad_norm": 9.25772762298584, "learning_rate": 1.9087864738734095e-05, "loss": 2.6503, "step": 58900 }, { "epoch": 2.2368820139520778, "grad_norm": 9.09449577331543, "learning_rate": 1.9082334650626764e-05, "loss": 2.6524, "step": 59000 }, { "epoch": 2.240673339399454, "grad_norm": 10.231986045837402, "learning_rate": 1.9076788655299163e-05, "loss": 2.6579, "step": 59100 }, { "epoch": 2.2444646648468303, "grad_norm": 10.503318786621094, "learning_rate": 1.9071226762464804e-05, "loss": 2.6578, "step": 59200 }, { "epoch": 2.248255990294207, "grad_norm": 9.985982894897461, "learning_rate": 1.9065648981865045e-05, "loss": 2.6715, "step": 59300 }, { "epoch": 2.2520473157415832, "grad_norm": 10.01918888092041, "learning_rate": 1.9060055323269065e-05, "loss": 2.6559, "step": 59400 }, { "epoch": 2.2558386411889595, "grad_norm": 10.892840385437012, "learning_rate": 1.9054445796473857e-05, "loss": 2.6596, "step": 59500 }, { "epoch": 2.259629966636336, "grad_norm": 9.959122657775879, "learning_rate": 1.9048820411304208e-05, "loss": 2.6827, "step": 59600 }, { "epoch": 2.2634212920837125, "grad_norm": 10.351717948913574, "learning_rate": 1.9043179177612675e-05, "loss": 2.645, "step": 59700 }, { "epoch": 2.2672126175310887, "grad_norm": 10.030474662780762, "learning_rate": 1.9037522105279574e-05, "loss": 2.6777, "step": 59800 }, { "epoch": 2.2710039429784654, "grad_norm": 9.536321640014648, "learning_rate": 1.9031849204212963e-05, "loss": 2.6638, "step": 59900 }, { "epoch": 2.2747952684258417, "grad_norm": 9.479663848876953, "learning_rate": 1.9026160484348626e-05, "loss": 2.617, "step": 60000 }, { "epoch": 2.278586593873218, "grad_norm": 9.3283109664917, "learning_rate": 1.9020455955650044e-05, "loss": 2.6042, "step": 60100 }, { "epoch": 2.2823779193205946, "grad_norm": 11.007224082946777, "learning_rate": 1.90147356281084e-05, "loss": 2.644, "step": 60200 }, { "epoch": 2.286169244767971, "grad_norm": 10.463188171386719, "learning_rate": 1.9008999511742532e-05, "loss": 2.655, "step": 60300 }, { "epoch": 2.289960570215347, "grad_norm": 10.239069938659668, "learning_rate": 1.900324761659894e-05, "loss": 2.6796, "step": 60400 }, { "epoch": 2.293751895662724, "grad_norm": 10.59797191619873, "learning_rate": 1.8997479952751768e-05, "loss": 2.6262, "step": 60500 }, { "epoch": 2.2975432211101, "grad_norm": 10.357266426086426, "learning_rate": 1.899169653030276e-05, "loss": 2.6244, "step": 60600 }, { "epoch": 2.3013345465574764, "grad_norm": 10.252458572387695, "learning_rate": 1.898589735938127e-05, "loss": 2.6239, "step": 60700 }, { "epoch": 2.305125872004853, "grad_norm": 10.563300132751465, "learning_rate": 1.8980082450144234e-05, "loss": 2.649, "step": 60800 }, { "epoch": 2.3089171974522293, "grad_norm": 10.723817825317383, "learning_rate": 1.8974251812776155e-05, "loss": 2.6035, "step": 60900 }, { "epoch": 2.3127085228996056, "grad_norm": 9.476155281066895, "learning_rate": 1.8968405457489078e-05, "loss": 2.6062, "step": 61000 }, { "epoch": 2.3164998483469823, "grad_norm": 10.6336669921875, "learning_rate": 1.8962543394522578e-05, "loss": 2.638, "step": 61100 }, { "epoch": 2.3202911737943586, "grad_norm": 10.537781715393066, "learning_rate": 1.8956665634143744e-05, "loss": 2.6137, "step": 61200 }, { "epoch": 2.324082499241735, "grad_norm": 10.331385612487793, "learning_rate": 1.895077218664716e-05, "loss": 2.6255, "step": 61300 }, { "epoch": 2.327873824689111, "grad_norm": 10.404614448547363, "learning_rate": 1.8944863062354884e-05, "loss": 2.6724, "step": 61400 }, { "epoch": 2.331665150136488, "grad_norm": 10.337014198303223, "learning_rate": 1.8938938271616422e-05, "loss": 2.6607, "step": 61500 }, { "epoch": 2.335456475583864, "grad_norm": 10.708700180053711, "learning_rate": 1.893299782480873e-05, "loss": 2.6351, "step": 61600 }, { "epoch": 2.3392478010312407, "grad_norm": 10.159525871276855, "learning_rate": 1.8927041732336174e-05, "loss": 2.6311, "step": 61700 }, { "epoch": 2.343039126478617, "grad_norm": 9.886256217956543, "learning_rate": 1.8921070004630545e-05, "loss": 2.5867, "step": 61800 }, { "epoch": 2.3468304519259933, "grad_norm": 10.485790252685547, "learning_rate": 1.891508265215099e-05, "loss": 2.6252, "step": 61900 }, { "epoch": 2.3506217773733695, "grad_norm": 10.338577270507812, "learning_rate": 1.8909079685384032e-05, "loss": 2.6361, "step": 62000 }, { "epoch": 2.3544131028207462, "grad_norm": 11.164861679077148, "learning_rate": 1.8903061114843555e-05, "loss": 2.6344, "step": 62100 }, { "epoch": 2.3582044282681225, "grad_norm": 10.025540351867676, "learning_rate": 1.8897026951070755e-05, "loss": 2.5899, "step": 62200 }, { "epoch": 2.3619957537154987, "grad_norm": 11.263103485107422, "learning_rate": 1.8890977204634147e-05, "loss": 2.635, "step": 62300 }, { "epoch": 2.3657870791628755, "grad_norm": 10.278914451599121, "learning_rate": 1.8884911886129535e-05, "loss": 2.5991, "step": 62400 }, { "epoch": 2.3695784046102517, "grad_norm": 9.811322212219238, "learning_rate": 1.8878831006179997e-05, "loss": 2.62, "step": 62500 }, { "epoch": 2.373369730057628, "grad_norm": 10.584925651550293, "learning_rate": 1.887273457543587e-05, "loss": 2.6151, "step": 62600 }, { "epoch": 2.3771610555050047, "grad_norm": 10.335131645202637, "learning_rate": 1.8866622604574723e-05, "loss": 2.6148, "step": 62700 }, { "epoch": 2.380952380952381, "grad_norm": 11.379439353942871, "learning_rate": 1.8860495104301346e-05, "loss": 2.5925, "step": 62800 }, { "epoch": 2.384743706399757, "grad_norm": 9.846826553344727, "learning_rate": 1.885435208534772e-05, "loss": 2.6075, "step": 62900 }, { "epoch": 2.388535031847134, "grad_norm": 10.366350173950195, "learning_rate": 1.8848193558473014e-05, "loss": 2.5858, "step": 63000 }, { "epoch": 2.39232635729451, "grad_norm": 10.25044059753418, "learning_rate": 1.8842019534463562e-05, "loss": 2.6086, "step": 63100 }, { "epoch": 2.3961176827418864, "grad_norm": 9.870427131652832, "learning_rate": 1.8835830024132828e-05, "loss": 2.6283, "step": 63200 }, { "epoch": 2.399909008189263, "grad_norm": 10.823317527770996, "learning_rate": 1.8829625038321412e-05, "loss": 2.6169, "step": 63300 }, { "epoch": 2.4037003336366394, "grad_norm": 10.395232200622559, "learning_rate": 1.8823404587897004e-05, "loss": 2.5699, "step": 63400 }, { "epoch": 2.4074916590840156, "grad_norm": 10.336236953735352, "learning_rate": 1.8817168683754396e-05, "loss": 2.5773, "step": 63500 }, { "epoch": 2.4112829845313923, "grad_norm": 10.173839569091797, "learning_rate": 1.8810917336815438e-05, "loss": 2.5778, "step": 63600 }, { "epoch": 2.4150743099787686, "grad_norm": 9.004371643066406, "learning_rate": 1.8804650558029022e-05, "loss": 2.6275, "step": 63700 }, { "epoch": 2.418865635426145, "grad_norm": 9.86668872833252, "learning_rate": 1.8798368358371084e-05, "loss": 2.5892, "step": 63800 }, { "epoch": 2.4226569608735216, "grad_norm": 11.039083480834961, "learning_rate": 1.8792070748844543e-05, "loss": 2.5926, "step": 63900 }, { "epoch": 2.426448286320898, "grad_norm": 9.682880401611328, "learning_rate": 1.8785757740479334e-05, "loss": 2.6312, "step": 64000 }, { "epoch": 2.430239611768274, "grad_norm": 10.437568664550781, "learning_rate": 1.8779429344332355e-05, "loss": 2.5798, "step": 64100 }, { "epoch": 2.434030937215651, "grad_norm": 10.102533340454102, "learning_rate": 1.877308557148744e-05, "loss": 2.5579, "step": 64200 }, { "epoch": 2.437822262663027, "grad_norm": 10.573121070861816, "learning_rate": 1.8766726433055375e-05, "loss": 2.5951, "step": 64300 }, { "epoch": 2.4416135881104033, "grad_norm": 12.57343578338623, "learning_rate": 1.8760351940173842e-05, "loss": 2.5984, "step": 64400 }, { "epoch": 2.4454049135577796, "grad_norm": 9.93972396850586, "learning_rate": 1.8753962104007426e-05, "loss": 2.5862, "step": 64500 }, { "epoch": 2.4491962390051563, "grad_norm": 9.906899452209473, "learning_rate": 1.8747556935747583e-05, "loss": 2.6014, "step": 64600 }, { "epoch": 2.4529875644525325, "grad_norm": 10.353975296020508, "learning_rate": 1.8741136446612613e-05, "loss": 2.5923, "step": 64700 }, { "epoch": 2.4567788898999092, "grad_norm": 9.897231101989746, "learning_rate": 1.8734700647847666e-05, "loss": 2.5625, "step": 64800 }, { "epoch": 2.4605702153472855, "grad_norm": 10.340551376342773, "learning_rate": 1.872824955072469e-05, "loss": 2.574, "step": 64900 }, { "epoch": 2.4643615407946617, "grad_norm": 11.397128105163574, "learning_rate": 1.8721783166542437e-05, "loss": 2.5897, "step": 65000 }, { "epoch": 2.468152866242038, "grad_norm": 10.75988483428955, "learning_rate": 1.8715301506626435e-05, "loss": 2.5498, "step": 65100 }, { "epoch": 2.4719441916894147, "grad_norm": 10.069684028625488, "learning_rate": 1.8708804582328955e-05, "loss": 2.5683, "step": 65200 }, { "epoch": 2.475735517136791, "grad_norm": 10.63200855255127, "learning_rate": 1.8702292405029014e-05, "loss": 2.6129, "step": 65300 }, { "epoch": 2.4795268425841677, "grad_norm": 10.153289794921875, "learning_rate": 1.869576498613234e-05, "loss": 2.5647, "step": 65400 }, { "epoch": 2.483318168031544, "grad_norm": 9.7765531539917, "learning_rate": 1.8689222337071355e-05, "loss": 2.5875, "step": 65500 }, { "epoch": 2.48710949347892, "grad_norm": 10.844930648803711, "learning_rate": 1.868266446930516e-05, "loss": 2.5363, "step": 65600 }, { "epoch": 2.4909008189262964, "grad_norm": 9.83458423614502, "learning_rate": 1.8676091394319503e-05, "loss": 2.5626, "step": 65700 }, { "epoch": 2.494692144373673, "grad_norm": 9.094827651977539, "learning_rate": 1.8669503123626772e-05, "loss": 2.5471, "step": 65800 }, { "epoch": 2.4984834698210494, "grad_norm": 10.15859317779541, "learning_rate": 1.8662899668765967e-05, "loss": 2.5704, "step": 65900 }, { "epoch": 2.502274795268426, "grad_norm": 10.096708297729492, "learning_rate": 1.865628104130269e-05, "loss": 2.5451, "step": 66000 }, { "epoch": 2.5060661207158024, "grad_norm": 10.526820182800293, "learning_rate": 1.8649647252829106e-05, "loss": 2.5643, "step": 66100 }, { "epoch": 2.5098574461631786, "grad_norm": 10.55100154876709, "learning_rate": 1.864299831496394e-05, "loss": 2.5347, "step": 66200 }, { "epoch": 2.513648771610555, "grad_norm": 11.573785781860352, "learning_rate": 1.8636334239352445e-05, "loss": 2.6035, "step": 66300 }, { "epoch": 2.5174400970579316, "grad_norm": 10.106539726257324, "learning_rate": 1.8629655037666396e-05, "loss": 2.5658, "step": 66400 }, { "epoch": 2.521231422505308, "grad_norm": 10.151406288146973, "learning_rate": 1.862296072160405e-05, "loss": 2.5578, "step": 66500 }, { "epoch": 2.525022747952684, "grad_norm": 9.91120433807373, "learning_rate": 1.8616251302890145e-05, "loss": 2.5338, "step": 66600 }, { "epoch": 2.528814073400061, "grad_norm": 11.85058307647705, "learning_rate": 1.860952679327587e-05, "loss": 2.5626, "step": 66700 }, { "epoch": 2.532605398847437, "grad_norm": 10.59245491027832, "learning_rate": 1.860278720453883e-05, "loss": 2.5736, "step": 66800 }, { "epoch": 2.5363967242948133, "grad_norm": 11.823192596435547, "learning_rate": 1.8596032548483067e-05, "loss": 2.5478, "step": 66900 }, { "epoch": 2.54018804974219, "grad_norm": 10.727869033813477, "learning_rate": 1.8589262836938985e-05, "loss": 2.5691, "step": 67000 }, { "epoch": 2.5439793751895663, "grad_norm": 10.453368186950684, "learning_rate": 1.858247808176338e-05, "loss": 2.5458, "step": 67100 }, { "epoch": 2.5477707006369426, "grad_norm": 10.365242958068848, "learning_rate": 1.857567829483937e-05, "loss": 2.5681, "step": 67200 }, { "epoch": 2.5515620260843193, "grad_norm": 11.235737800598145, "learning_rate": 1.8568863488076437e-05, "loss": 2.5869, "step": 67300 }, { "epoch": 2.5553533515316955, "grad_norm": 10.74031925201416, "learning_rate": 1.856203367341033e-05, "loss": 2.5414, "step": 67400 }, { "epoch": 2.5591446769790718, "grad_norm": 11.334955215454102, "learning_rate": 1.8555188862803107e-05, "loss": 2.5648, "step": 67500 }, { "epoch": 2.562936002426448, "grad_norm": 9.866023063659668, "learning_rate": 1.8548329068243092e-05, "loss": 2.598, "step": 67600 }, { "epoch": 2.5667273278738247, "grad_norm": 9.952630043029785, "learning_rate": 1.8541454301744838e-05, "loss": 2.5745, "step": 67700 }, { "epoch": 2.570518653321201, "grad_norm": 10.400540351867676, "learning_rate": 1.8534564575349134e-05, "loss": 2.5477, "step": 67800 }, { "epoch": 2.5743099787685777, "grad_norm": 11.05518913269043, "learning_rate": 1.8527659901122957e-05, "loss": 2.5448, "step": 67900 }, { "epoch": 2.578101304215954, "grad_norm": 11.457334518432617, "learning_rate": 1.852074029115948e-05, "loss": 2.5528, "step": 68000 }, { "epoch": 2.58189262966333, "grad_norm": 10.250699043273926, "learning_rate": 1.8513805757578027e-05, "loss": 2.5386, "step": 68100 }, { "epoch": 2.5856839551107065, "grad_norm": 10.882891654968262, "learning_rate": 1.8506856312524055e-05, "loss": 2.5858, "step": 68200 }, { "epoch": 2.589475280558083, "grad_norm": 10.193925857543945, "learning_rate": 1.849989196816915e-05, "loss": 2.5239, "step": 68300 }, { "epoch": 2.5932666060054594, "grad_norm": 10.932377815246582, "learning_rate": 1.849291273671098e-05, "loss": 2.5165, "step": 68400 }, { "epoch": 2.597057931452836, "grad_norm": 10.330302238464355, "learning_rate": 1.8485918630373295e-05, "loss": 2.549, "step": 68500 }, { "epoch": 2.6008492569002124, "grad_norm": 11.282591819763184, "learning_rate": 1.8478909661405895e-05, "loss": 2.5424, "step": 68600 }, { "epoch": 2.6046405823475887, "grad_norm": 12.716691970825195, "learning_rate": 1.847188584208462e-05, "loss": 2.5131, "step": 68700 }, { "epoch": 2.608431907794965, "grad_norm": 10.126006126403809, "learning_rate": 1.8464847184711297e-05, "loss": 2.5098, "step": 68800 }, { "epoch": 2.6122232332423416, "grad_norm": 10.837462425231934, "learning_rate": 1.8457793701613767e-05, "loss": 2.4951, "step": 68900 }, { "epoch": 2.616014558689718, "grad_norm": 10.660540580749512, "learning_rate": 1.845072540514582e-05, "loss": 2.5274, "step": 69000 }, { "epoch": 2.6198058841370946, "grad_norm": 10.759454727172852, "learning_rate": 1.84436423076872e-05, "loss": 2.5358, "step": 69100 }, { "epoch": 2.623597209584471, "grad_norm": 10.765948295593262, "learning_rate": 1.843654442164357e-05, "loss": 2.5143, "step": 69200 }, { "epoch": 2.627388535031847, "grad_norm": 11.309243202209473, "learning_rate": 1.84294317594465e-05, "loss": 2.4924, "step": 69300 }, { "epoch": 2.6311798604792234, "grad_norm": 11.350716590881348, "learning_rate": 1.8422304333553425e-05, "loss": 2.5296, "step": 69400 }, { "epoch": 2.6349711859266, "grad_norm": 10.150238990783691, "learning_rate": 1.8415162156447655e-05, "loss": 2.5102, "step": 69500 }, { "epoch": 2.6387625113739763, "grad_norm": 11.10338020324707, "learning_rate": 1.8408005240638328e-05, "loss": 2.5217, "step": 69600 }, { "epoch": 2.6425538368213526, "grad_norm": 10.31495475769043, "learning_rate": 1.8400833598660392e-05, "loss": 2.512, "step": 69700 }, { "epoch": 2.6463451622687293, "grad_norm": 10.861299514770508, "learning_rate": 1.8393647243074598e-05, "loss": 2.5197, "step": 69800 }, { "epoch": 2.6501364877161055, "grad_norm": 10.341408729553223, "learning_rate": 1.838644618646746e-05, "loss": 2.5403, "step": 69900 }, { "epoch": 2.653927813163482, "grad_norm": 10.429519653320312, "learning_rate": 1.8379230441451238e-05, "loss": 2.5437, "step": 70000 }, { "epoch": 2.6577191386108585, "grad_norm": 10.463393211364746, "learning_rate": 1.8372000020663924e-05, "loss": 2.514, "step": 70100 }, { "epoch": 2.6615104640582348, "grad_norm": 9.859869003295898, "learning_rate": 1.836475493676921e-05, "loss": 2.525, "step": 70200 }, { "epoch": 2.665301789505611, "grad_norm": 11.388726234436035, "learning_rate": 1.8357495202456465e-05, "loss": 2.5568, "step": 70300 }, { "epoch": 2.6690931149529877, "grad_norm": 10.553744316101074, "learning_rate": 1.835022083044073e-05, "loss": 2.4746, "step": 70400 }, { "epoch": 2.672884440400364, "grad_norm": 11.124783515930176, "learning_rate": 1.8342931833462675e-05, "loss": 2.5243, "step": 70500 }, { "epoch": 2.6766757658477403, "grad_norm": 10.64106559753418, "learning_rate": 1.8335628224288586e-05, "loss": 2.5145, "step": 70600 }, { "epoch": 2.6804670912951165, "grad_norm": 10.55736255645752, "learning_rate": 1.8328310015710336e-05, "loss": 2.5148, "step": 70700 }, { "epoch": 2.684258416742493, "grad_norm": 10.906028747558594, "learning_rate": 1.8320977220545384e-05, "loss": 2.4984, "step": 70800 }, { "epoch": 2.6880497421898695, "grad_norm": 10.138858795166016, "learning_rate": 1.8313629851636717e-05, "loss": 2.4998, "step": 70900 }, { "epoch": 2.691841067637246, "grad_norm": 10.640596389770508, "learning_rate": 1.8306267921852856e-05, "loss": 2.5028, "step": 71000 }, { "epoch": 2.6956323930846224, "grad_norm": 9.935202598571777, "learning_rate": 1.8298891444087828e-05, "loss": 2.5293, "step": 71100 }, { "epoch": 2.6994237185319987, "grad_norm": 10.189600944519043, "learning_rate": 1.829150043126114e-05, "loss": 2.4988, "step": 71200 }, { "epoch": 2.703215043979375, "grad_norm": 10.917332649230957, "learning_rate": 1.8284094896317746e-05, "loss": 2.5246, "step": 71300 }, { "epoch": 2.7070063694267517, "grad_norm": 11.418713569641113, "learning_rate": 1.8276674852228044e-05, "loss": 2.5057, "step": 71400 }, { "epoch": 2.710797694874128, "grad_norm": 10.238081932067871, "learning_rate": 1.8269240311987846e-05, "loss": 2.4992, "step": 71500 }, { "epoch": 2.7145890203215046, "grad_norm": 11.222386360168457, "learning_rate": 1.8261791288618346e-05, "loss": 2.5017, "step": 71600 }, { "epoch": 2.718380345768881, "grad_norm": 11.123385429382324, "learning_rate": 1.8254327795166104e-05, "loss": 2.4623, "step": 71700 }, { "epoch": 2.722171671216257, "grad_norm": 9.721126556396484, "learning_rate": 1.824684984470303e-05, "loss": 2.5054, "step": 71800 }, { "epoch": 2.7259629966636334, "grad_norm": 10.80428695678711, "learning_rate": 1.8239357450326357e-05, "loss": 2.4755, "step": 71900 }, { "epoch": 2.72975432211101, "grad_norm": 11.705205917358398, "learning_rate": 1.8231850625158603e-05, "loss": 2.528, "step": 72000 }, { "epoch": 2.7335456475583864, "grad_norm": 11.12421989440918, "learning_rate": 1.8224329382347573e-05, "loss": 2.5196, "step": 72100 }, { "epoch": 2.737336973005763, "grad_norm": 10.163957595825195, "learning_rate": 1.8216793735066317e-05, "loss": 2.4858, "step": 72200 }, { "epoch": 2.7411282984531393, "grad_norm": 10.764104843139648, "learning_rate": 1.820924369651311e-05, "loss": 2.4795, "step": 72300 }, { "epoch": 2.7449196239005156, "grad_norm": 10.332773208618164, "learning_rate": 1.820167927991145e-05, "loss": 2.4966, "step": 72400 }, { "epoch": 2.748710949347892, "grad_norm": 10.64539909362793, "learning_rate": 1.8194100498509995e-05, "loss": 2.4667, "step": 72500 }, { "epoch": 2.7525022747952685, "grad_norm": 10.582850456237793, "learning_rate": 1.818650736558258e-05, "loss": 2.4863, "step": 72600 }, { "epoch": 2.756293600242645, "grad_norm": 10.294204711914062, "learning_rate": 1.8178899894428165e-05, "loss": 2.4823, "step": 72700 }, { "epoch": 2.7600849256900215, "grad_norm": 9.588306427001953, "learning_rate": 1.8171278098370825e-05, "loss": 2.4718, "step": 72800 }, { "epoch": 2.7638762511373978, "grad_norm": 10.462890625, "learning_rate": 1.816364199075973e-05, "loss": 2.4931, "step": 72900 }, { "epoch": 2.767667576584774, "grad_norm": 10.21527099609375, "learning_rate": 1.8155991584969104e-05, "loss": 2.4926, "step": 73000 }, { "epoch": 2.7714589020321503, "grad_norm": 10.587074279785156, "learning_rate": 1.8148326894398228e-05, "loss": 2.5378, "step": 73100 }, { "epoch": 2.775250227479527, "grad_norm": 10.747403144836426, "learning_rate": 1.8140647932471394e-05, "loss": 2.4811, "step": 73200 }, { "epoch": 2.7790415529269032, "grad_norm": 10.153411865234375, "learning_rate": 1.8132954712637887e-05, "loss": 2.479, "step": 73300 }, { "epoch": 2.7828328783742795, "grad_norm": 11.69788646697998, "learning_rate": 1.812524724837197e-05, "loss": 2.4775, "step": 73400 }, { "epoch": 2.786624203821656, "grad_norm": 11.218607902526855, "learning_rate": 1.8117525553172853e-05, "loss": 2.4527, "step": 73500 }, { "epoch": 2.7904155292690325, "grad_norm": 11.266960144042969, "learning_rate": 1.8109789640564666e-05, "loss": 2.4762, "step": 73600 }, { "epoch": 2.7942068547164087, "grad_norm": 9.961610794067383, "learning_rate": 1.8102039524096446e-05, "loss": 2.456, "step": 73700 }, { "epoch": 2.797998180163785, "grad_norm": 9.828187942504883, "learning_rate": 1.8094275217342104e-05, "loss": 2.4596, "step": 73800 }, { "epoch": 2.8017895056111617, "grad_norm": 10.045008659362793, "learning_rate": 1.8086496733900404e-05, "loss": 2.4765, "step": 73900 }, { "epoch": 2.805580831058538, "grad_norm": 10.501665115356445, "learning_rate": 1.8078704087394946e-05, "loss": 2.5069, "step": 74000 }, { "epoch": 2.8093721565059147, "grad_norm": 10.694310188293457, "learning_rate": 1.8070897291474126e-05, "loss": 2.4956, "step": 74100 }, { "epoch": 2.813163481953291, "grad_norm": 11.078299522399902, "learning_rate": 1.8063076359811127e-05, "loss": 2.4817, "step": 74200 }, { "epoch": 2.816954807400667, "grad_norm": 9.503125190734863, "learning_rate": 1.8055241306103892e-05, "loss": 2.5104, "step": 74300 }, { "epoch": 2.8207461328480434, "grad_norm": 10.430237770080566, "learning_rate": 1.8047392144075094e-05, "loss": 2.4595, "step": 74400 }, { "epoch": 2.82453745829542, "grad_norm": 9.593660354614258, "learning_rate": 1.8039528887472122e-05, "loss": 2.4915, "step": 74500 }, { "epoch": 2.8283287837427964, "grad_norm": 10.127099990844727, "learning_rate": 1.8031651550067043e-05, "loss": 2.447, "step": 74600 }, { "epoch": 2.832120109190173, "grad_norm": 11.928866386413574, "learning_rate": 1.8023760145656588e-05, "loss": 2.4417, "step": 74700 }, { "epoch": 2.8359114346375494, "grad_norm": 10.476520538330078, "learning_rate": 1.8015854688062128e-05, "loss": 2.5096, "step": 74800 }, { "epoch": 2.8397027600849256, "grad_norm": 9.948707580566406, "learning_rate": 1.8007935191129647e-05, "loss": 2.4645, "step": 74900 }, { "epoch": 2.843494085532302, "grad_norm": 9.71849250793457, "learning_rate": 1.8000001668729723e-05, "loss": 2.49, "step": 75000 }, { "epoch": 2.8472854109796786, "grad_norm": 12.20787525177002, "learning_rate": 1.7992054134757485e-05, "loss": 2.4767, "step": 75100 }, { "epoch": 2.851076736427055, "grad_norm": 9.71546459197998, "learning_rate": 1.7984092603132616e-05, "loss": 2.4536, "step": 75200 }, { "epoch": 2.8548680618744315, "grad_norm": 10.997023582458496, "learning_rate": 1.7976117087799307e-05, "loss": 2.4659, "step": 75300 }, { "epoch": 2.858659387321808, "grad_norm": 11.535093307495117, "learning_rate": 1.7968127602726245e-05, "loss": 2.4558, "step": 75400 }, { "epoch": 2.862450712769184, "grad_norm": 9.964519500732422, "learning_rate": 1.7960124161906584e-05, "loss": 2.4617, "step": 75500 }, { "epoch": 2.8662420382165603, "grad_norm": 10.597661972045898, "learning_rate": 1.7952106779357922e-05, "loss": 2.4459, "step": 75600 }, { "epoch": 2.870033363663937, "grad_norm": 10.95278549194336, "learning_rate": 1.7944075469122267e-05, "loss": 2.4585, "step": 75700 }, { "epoch": 2.8738246891113133, "grad_norm": 10.38538932800293, "learning_rate": 1.793603024526603e-05, "loss": 2.4844, "step": 75800 }, { "epoch": 2.87761601455869, "grad_norm": 11.11662483215332, "learning_rate": 1.7927971121879987e-05, "loss": 2.4504, "step": 75900 }, { "epoch": 2.8814073400060662, "grad_norm": 11.534825325012207, "learning_rate": 1.7919898113079263e-05, "loss": 2.4533, "step": 76000 }, { "epoch": 2.8851986654534425, "grad_norm": 10.056855201721191, "learning_rate": 1.791181123300329e-05, "loss": 2.438, "step": 76100 }, { "epoch": 2.8889899909008188, "grad_norm": 9.64684772491455, "learning_rate": 1.790371049581581e-05, "loss": 2.4322, "step": 76200 }, { "epoch": 2.8927813163481955, "grad_norm": 9.715087890625, "learning_rate": 1.7895595915704827e-05, "loss": 2.432, "step": 76300 }, { "epoch": 2.8965726417955717, "grad_norm": 9.955890655517578, "learning_rate": 1.788746750688259e-05, "loss": 2.433, "step": 76400 }, { "epoch": 2.900363967242948, "grad_norm": 10.797928810119629, "learning_rate": 1.7879325283585565e-05, "loss": 2.4482, "step": 76500 }, { "epoch": 2.9041552926903247, "grad_norm": 9.759758949279785, "learning_rate": 1.7871169260074427e-05, "loss": 2.4536, "step": 76600 }, { "epoch": 2.907946618137701, "grad_norm": 9.156854629516602, "learning_rate": 1.7862999450634005e-05, "loss": 2.4636, "step": 76700 }, { "epoch": 2.911737943585077, "grad_norm": 9.788328170776367, "learning_rate": 1.7854815869573285e-05, "loss": 2.4349, "step": 76800 }, { "epoch": 2.915529269032454, "grad_norm": 11.064518928527832, "learning_rate": 1.7846618531225365e-05, "loss": 2.4449, "step": 76900 }, { "epoch": 2.91932059447983, "grad_norm": 10.824737548828125, "learning_rate": 1.7838407449947447e-05, "loss": 2.4075, "step": 77000 }, { "epoch": 2.9231119199272064, "grad_norm": 10.419591903686523, "learning_rate": 1.783018264012079e-05, "loss": 2.4994, "step": 77100 }, { "epoch": 2.926903245374583, "grad_norm": 10.197402000427246, "learning_rate": 1.7821944116150714e-05, "loss": 2.428, "step": 77200 }, { "epoch": 2.9306945708219594, "grad_norm": 10.414329528808594, "learning_rate": 1.781369189246655e-05, "loss": 2.4581, "step": 77300 }, { "epoch": 2.9344858962693356, "grad_norm": 9.751555442810059, "learning_rate": 1.7805425983521613e-05, "loss": 2.4226, "step": 77400 }, { "epoch": 2.938277221716712, "grad_norm": 10.696304321289062, "learning_rate": 1.7797146403793212e-05, "loss": 2.4836, "step": 77500 }, { "epoch": 2.9420685471640886, "grad_norm": 11.065367698669434, "learning_rate": 1.7788853167782573e-05, "loss": 2.4359, "step": 77600 }, { "epoch": 2.945859872611465, "grad_norm": 9.913238525390625, "learning_rate": 1.7780546290014858e-05, "loss": 2.4311, "step": 77700 }, { "epoch": 2.9496511980588416, "grad_norm": 9.38996410369873, "learning_rate": 1.7772225785039118e-05, "loss": 2.4365, "step": 77800 }, { "epoch": 2.953442523506218, "grad_norm": 10.532707214355469, "learning_rate": 1.7763891667428264e-05, "loss": 2.4628, "step": 77900 }, { "epoch": 2.957233848953594, "grad_norm": 9.785970687866211, "learning_rate": 1.7755543951779058e-05, "loss": 2.4527, "step": 78000 }, { "epoch": 2.9610251744009704, "grad_norm": 10.999120712280273, "learning_rate": 1.7747182652712075e-05, "loss": 2.425, "step": 78100 }, { "epoch": 2.964816499848347, "grad_norm": 10.615059852600098, "learning_rate": 1.7738807784871682e-05, "loss": 2.4844, "step": 78200 }, { "epoch": 2.9686078252957233, "grad_norm": 9.266522407531738, "learning_rate": 1.7730419362926005e-05, "loss": 2.4274, "step": 78300 }, { "epoch": 2.9723991507431, "grad_norm": 10.955419540405273, "learning_rate": 1.7722017401566916e-05, "loss": 2.4429, "step": 78400 }, { "epoch": 2.9761904761904763, "grad_norm": 9.02269172668457, "learning_rate": 1.771360191551e-05, "loss": 2.4047, "step": 78500 }, { "epoch": 2.9799818016378525, "grad_norm": 10.718005180358887, "learning_rate": 1.7705172919494523e-05, "loss": 2.4404, "step": 78600 }, { "epoch": 2.983773127085229, "grad_norm": 10.003349304199219, "learning_rate": 1.769673042828342e-05, "loss": 2.4493, "step": 78700 }, { "epoch": 2.9875644525326055, "grad_norm": 11.51830768585205, "learning_rate": 1.768827445666326e-05, "loss": 2.4418, "step": 78800 }, { "epoch": 2.9913557779799818, "grad_norm": 11.086520195007324, "learning_rate": 1.7679805019444224e-05, "loss": 2.4405, "step": 78900 }, { "epoch": 2.9951471034273585, "grad_norm": 9.733136177062988, "learning_rate": 1.767132213146007e-05, "loss": 2.447, "step": 79000 }, { "epoch": 2.9989384288747347, "grad_norm": 10.287933349609375, "learning_rate": 1.7662825807568125e-05, "loss": 2.444, "step": 79100 }, { "epoch": 3.0, "eval_accuracy": 0.4763099153578571, "eval_loss": 2.4296343326568604, "eval_runtime": 935.87, "eval_samples_per_second": 901.86, "eval_steps_per_second": 7.046, "step": 79128 }, { "epoch": 3.002729754322111, "grad_norm": 10.842921257019043, "learning_rate": 1.7654316062649242e-05, "loss": 2.3224, "step": 79200 }, { "epoch": 3.0065210797694872, "grad_norm": 10.672722816467285, "learning_rate": 1.7645792911607782e-05, "loss": 2.2091, "step": 79300 }, { "epoch": 3.010312405216864, "grad_norm": 11.10242748260498, "learning_rate": 1.7637256369371584e-05, "loss": 2.2005, "step": 79400 }, { "epoch": 3.01410373066424, "grad_norm": 9.948885917663574, "learning_rate": 1.7628706450891937e-05, "loss": 2.2352, "step": 79500 }, { "epoch": 3.0178950561116165, "grad_norm": 9.7949800491333, "learning_rate": 1.7620143171143575e-05, "loss": 2.2385, "step": 79600 }, { "epoch": 3.021686381558993, "grad_norm": 10.188003540039062, "learning_rate": 1.761156654512461e-05, "loss": 2.2348, "step": 79700 }, { "epoch": 3.0254777070063694, "grad_norm": 10.501203536987305, "learning_rate": 1.7602976587856547e-05, "loss": 2.1997, "step": 79800 }, { "epoch": 3.0292690324537457, "grad_norm": 10.733768463134766, "learning_rate": 1.759437331438423e-05, "loss": 2.2164, "step": 79900 }, { "epoch": 3.0330603579011224, "grad_norm": 10.229263305664062, "learning_rate": 1.758575673977583e-05, "loss": 2.2007, "step": 80000 }, { "epoch": 3.0368516833484986, "grad_norm": 10.923152923583984, "learning_rate": 1.757712687912282e-05, "loss": 2.2248, "step": 80100 }, { "epoch": 3.040643008795875, "grad_norm": 9.65584659576416, "learning_rate": 1.7568483747539932e-05, "loss": 2.2163, "step": 80200 }, { "epoch": 3.0444343342432516, "grad_norm": 10.482006072998047, "learning_rate": 1.755982736016514e-05, "loss": 2.2458, "step": 80300 }, { "epoch": 3.048225659690628, "grad_norm": 10.584166526794434, "learning_rate": 1.755115773215965e-05, "loss": 2.2206, "step": 80400 }, { "epoch": 3.052016985138004, "grad_norm": 10.731414794921875, "learning_rate": 1.7542474878707845e-05, "loss": 2.2368, "step": 80500 }, { "epoch": 3.055808310585381, "grad_norm": 10.501351356506348, "learning_rate": 1.753377881501727e-05, "loss": 2.2446, "step": 80600 }, { "epoch": 3.059599636032757, "grad_norm": 10.767316818237305, "learning_rate": 1.7525069556318627e-05, "loss": 2.215, "step": 80700 }, { "epoch": 3.0633909614801333, "grad_norm": 10.410860061645508, "learning_rate": 1.7516347117865697e-05, "loss": 2.2193, "step": 80800 }, { "epoch": 3.06718228692751, "grad_norm": 9.975448608398438, "learning_rate": 1.7507611514935372e-05, "loss": 2.2362, "step": 80900 }, { "epoch": 3.0709736123748863, "grad_norm": 10.701716423034668, "learning_rate": 1.7498862762827588e-05, "loss": 2.2321, "step": 81000 }, { "epoch": 3.0747649378222626, "grad_norm": 11.177474021911621, "learning_rate": 1.749010087686531e-05, "loss": 2.2312, "step": 81100 }, { "epoch": 3.0785562632696393, "grad_norm": 10.597122192382812, "learning_rate": 1.7481325872394513e-05, "loss": 2.2747, "step": 81200 }, { "epoch": 3.0823475887170155, "grad_norm": 10.264633178710938, "learning_rate": 1.7472537764784144e-05, "loss": 2.2035, "step": 81300 }, { "epoch": 3.086138914164392, "grad_norm": 9.498851776123047, "learning_rate": 1.74637365694261e-05, "loss": 2.2202, "step": 81400 }, { "epoch": 3.0899302396117685, "grad_norm": 11.126235961914062, "learning_rate": 1.7454922301735204e-05, "loss": 2.2143, "step": 81500 }, { "epoch": 3.0937215650591448, "grad_norm": 10.264562606811523, "learning_rate": 1.7446094977149167e-05, "loss": 2.2458, "step": 81600 }, { "epoch": 3.097512890506521, "grad_norm": 9.844819068908691, "learning_rate": 1.7437254611128577e-05, "loss": 2.2699, "step": 81700 }, { "epoch": 3.1013042159538973, "grad_norm": 11.433173179626465, "learning_rate": 1.7428401219156856e-05, "loss": 2.2483, "step": 81800 }, { "epoch": 3.105095541401274, "grad_norm": 10.675647735595703, "learning_rate": 1.7419534816740245e-05, "loss": 2.2239, "step": 81900 }, { "epoch": 3.1088868668486502, "grad_norm": 9.860465049743652, "learning_rate": 1.741065541940777e-05, "loss": 2.2085, "step": 82000 }, { "epoch": 3.1126781922960265, "grad_norm": 11.228464126586914, "learning_rate": 1.740176304271122e-05, "loss": 2.2265, "step": 82100 }, { "epoch": 3.116469517743403, "grad_norm": 10.60427188873291, "learning_rate": 1.739285770222511e-05, "loss": 2.1892, "step": 82200 }, { "epoch": 3.1202608431907795, "grad_norm": 10.356694221496582, "learning_rate": 1.7383939413546663e-05, "loss": 2.2399, "step": 82300 }, { "epoch": 3.1240521686381557, "grad_norm": 9.981148719787598, "learning_rate": 1.737500819229579e-05, "loss": 2.2216, "step": 82400 }, { "epoch": 3.1278434940855324, "grad_norm": 11.359503746032715, "learning_rate": 1.736606405411504e-05, "loss": 2.2142, "step": 82500 }, { "epoch": 3.1316348195329087, "grad_norm": 10.581196784973145, "learning_rate": 1.735710701466958e-05, "loss": 2.1981, "step": 82600 }, { "epoch": 3.135426144980285, "grad_norm": 12.91972827911377, "learning_rate": 1.7348137089647198e-05, "loss": 2.2348, "step": 82700 }, { "epoch": 3.1392174704276616, "grad_norm": 10.036688804626465, "learning_rate": 1.733915429475823e-05, "loss": 2.1946, "step": 82800 }, { "epoch": 3.143008795875038, "grad_norm": 10.461673736572266, "learning_rate": 1.7330158645735554e-05, "loss": 2.2539, "step": 82900 }, { "epoch": 3.146800121322414, "grad_norm": 10.377921104431152, "learning_rate": 1.7321150158334567e-05, "loss": 2.2112, "step": 83000 }, { "epoch": 3.150591446769791, "grad_norm": 11.274285316467285, "learning_rate": 1.731212884833315e-05, "loss": 2.1736, "step": 83100 }, { "epoch": 3.154382772217167, "grad_norm": 10.742070198059082, "learning_rate": 1.730309473153164e-05, "loss": 2.1978, "step": 83200 }, { "epoch": 3.1581740976645434, "grad_norm": 10.376641273498535, "learning_rate": 1.729404782375281e-05, "loss": 2.2138, "step": 83300 }, { "epoch": 3.16196542311192, "grad_norm": 10.88852596282959, "learning_rate": 1.7284988140841825e-05, "loss": 2.2517, "step": 83400 }, { "epoch": 3.1657567485592963, "grad_norm": 10.41225528717041, "learning_rate": 1.7275915698666242e-05, "loss": 2.2065, "step": 83500 }, { "epoch": 3.1695480740066726, "grad_norm": 10.50618839263916, "learning_rate": 1.7266830513115948e-05, "loss": 2.2065, "step": 83600 }, { "epoch": 3.1733393994540493, "grad_norm": 10.752445220947266, "learning_rate": 1.725773260010316e-05, "loss": 2.1827, "step": 83700 }, { "epoch": 3.1771307249014256, "grad_norm": 11.265571594238281, "learning_rate": 1.724862197556238e-05, "loss": 2.2159, "step": 83800 }, { "epoch": 3.180922050348802, "grad_norm": 10.301066398620605, "learning_rate": 1.7239498655450382e-05, "loss": 2.2315, "step": 83900 }, { "epoch": 3.1847133757961785, "grad_norm": 11.312590599060059, "learning_rate": 1.723036265574616e-05, "loss": 2.222, "step": 84000 }, { "epoch": 3.188504701243555, "grad_norm": 10.670496940612793, "learning_rate": 1.7221213992450936e-05, "loss": 2.2141, "step": 84100 }, { "epoch": 3.192296026690931, "grad_norm": 9.893882751464844, "learning_rate": 1.7212052681588093e-05, "loss": 2.1978, "step": 84200 }, { "epoch": 3.1960873521383077, "grad_norm": 10.959532737731934, "learning_rate": 1.720287873920318e-05, "loss": 2.2237, "step": 84300 }, { "epoch": 3.199878677585684, "grad_norm": 10.389450073242188, "learning_rate": 1.7193692181363865e-05, "loss": 2.2132, "step": 84400 }, { "epoch": 3.2036700030330603, "grad_norm": 10.18624496459961, "learning_rate": 1.7184493024159903e-05, "loss": 2.2065, "step": 84500 }, { "epoch": 3.207461328480437, "grad_norm": 9.912386894226074, "learning_rate": 1.7175281283703124e-05, "loss": 2.2201, "step": 84600 }, { "epoch": 3.2112526539278132, "grad_norm": 10.398030281066895, "learning_rate": 1.7166056976127397e-05, "loss": 2.2144, "step": 84700 }, { "epoch": 3.2150439793751895, "grad_norm": 10.822487831115723, "learning_rate": 1.7156820117588603e-05, "loss": 2.2232, "step": 84800 }, { "epoch": 3.2188353048225657, "grad_norm": 11.098958969116211, "learning_rate": 1.7147570724264598e-05, "loss": 2.2166, "step": 84900 }, { "epoch": 3.2226266302699424, "grad_norm": 12.533442497253418, "learning_rate": 1.7138308812355198e-05, "loss": 2.2068, "step": 85000 }, { "epoch": 3.2264179557173187, "grad_norm": 9.18089485168457, "learning_rate": 1.7129034398082142e-05, "loss": 2.1981, "step": 85100 }, { "epoch": 3.2302092811646954, "grad_norm": 11.308269500732422, "learning_rate": 1.7119747497689072e-05, "loss": 2.1866, "step": 85200 }, { "epoch": 3.2340006066120717, "grad_norm": 10.231182098388672, "learning_rate": 1.7110448127441493e-05, "loss": 2.2033, "step": 85300 }, { "epoch": 3.237791932059448, "grad_norm": 10.401562690734863, "learning_rate": 1.7101136303626746e-05, "loss": 2.1914, "step": 85400 }, { "epoch": 3.241583257506824, "grad_norm": 11.473837852478027, "learning_rate": 1.709181204255399e-05, "loss": 2.1842, "step": 85500 }, { "epoch": 3.245374582954201, "grad_norm": 11.6417236328125, "learning_rate": 1.7082475360554173e-05, "loss": 2.2233, "step": 85600 }, { "epoch": 3.249165908401577, "grad_norm": 10.620051383972168, "learning_rate": 1.7073126273979988e-05, "loss": 2.1987, "step": 85700 }, { "epoch": 3.2529572338489534, "grad_norm": 10.672351837158203, "learning_rate": 1.7063764799205854e-05, "loss": 2.2187, "step": 85800 }, { "epoch": 3.25674855929633, "grad_norm": 10.030927658081055, "learning_rate": 1.7054390952627893e-05, "loss": 2.2135, "step": 85900 }, { "epoch": 3.2605398847437064, "grad_norm": 11.169106483459473, "learning_rate": 1.704500475066389e-05, "loss": 2.1902, "step": 86000 }, { "epoch": 3.2643312101910826, "grad_norm": 11.558984756469727, "learning_rate": 1.7035606209753276e-05, "loss": 2.2388, "step": 86100 }, { "epoch": 3.2681225356384593, "grad_norm": 11.556367874145508, "learning_rate": 1.7026195346357087e-05, "loss": 2.1868, "step": 86200 }, { "epoch": 3.2719138610858356, "grad_norm": 10.864015579223633, "learning_rate": 1.7016772176957945e-05, "loss": 2.2152, "step": 86300 }, { "epoch": 3.275705186533212, "grad_norm": 12.170553207397461, "learning_rate": 1.7007336718060025e-05, "loss": 2.2345, "step": 86400 }, { "epoch": 3.2794965119805886, "grad_norm": 10.055994033813477, "learning_rate": 1.6997888986189018e-05, "loss": 2.1967, "step": 86500 }, { "epoch": 3.283287837427965, "grad_norm": 12.153730392456055, "learning_rate": 1.698842899789213e-05, "loss": 2.199, "step": 86600 }, { "epoch": 3.287079162875341, "grad_norm": 10.817435264587402, "learning_rate": 1.6978956769738014e-05, "loss": 2.2006, "step": 86700 }, { "epoch": 3.290870488322718, "grad_norm": 11.610159873962402, "learning_rate": 1.6969472318316768e-05, "loss": 2.2136, "step": 86800 }, { "epoch": 3.294661813770094, "grad_norm": 10.64565372467041, "learning_rate": 1.6959975660239897e-05, "loss": 2.1801, "step": 86900 }, { "epoch": 3.2984531392174703, "grad_norm": 10.419108390808105, "learning_rate": 1.6950466812140292e-05, "loss": 2.2142, "step": 87000 }, { "epoch": 3.302244464664847, "grad_norm": 10.87584400177002, "learning_rate": 1.6940945790672187e-05, "loss": 2.1858, "step": 87100 }, { "epoch": 3.3060357901122233, "grad_norm": 9.981945991516113, "learning_rate": 1.693141261251113e-05, "loss": 2.1842, "step": 87200 }, { "epoch": 3.3098271155595995, "grad_norm": 9.814298629760742, "learning_rate": 1.692186729435398e-05, "loss": 2.1993, "step": 87300 }, { "epoch": 3.3136184410069762, "grad_norm": 10.629115104675293, "learning_rate": 1.6912309852918842e-05, "loss": 2.2014, "step": 87400 }, { "epoch": 3.3174097664543525, "grad_norm": 10.399810791015625, "learning_rate": 1.690274030494506e-05, "loss": 2.1929, "step": 87500 }, { "epoch": 3.3212010919017287, "grad_norm": 9.957448959350586, "learning_rate": 1.6893158667193193e-05, "loss": 2.2208, "step": 87600 }, { "epoch": 3.3249924173491054, "grad_norm": 10.052605628967285, "learning_rate": 1.688356495644495e-05, "loss": 2.183, "step": 87700 }, { "epoch": 3.3287837427964817, "grad_norm": 10.550665855407715, "learning_rate": 1.6873959189503204e-05, "loss": 2.2454, "step": 87800 }, { "epoch": 3.332575068243858, "grad_norm": 10.793706893920898, "learning_rate": 1.6864341383191938e-05, "loss": 2.2031, "step": 87900 }, { "epoch": 3.3363663936912342, "grad_norm": 11.190234184265137, "learning_rate": 1.6854711554356223e-05, "loss": 2.1857, "step": 88000 }, { "epoch": 3.340157719138611, "grad_norm": 10.46500301361084, "learning_rate": 1.6845069719862186e-05, "loss": 2.1818, "step": 88100 }, { "epoch": 3.343949044585987, "grad_norm": 10.842758178710938, "learning_rate": 1.6835415896596984e-05, "loss": 2.2025, "step": 88200 }, { "epoch": 3.347740370033364, "grad_norm": 10.455790519714355, "learning_rate": 1.6825750101468764e-05, "loss": 2.2024, "step": 88300 }, { "epoch": 3.35153169548074, "grad_norm": 11.140132904052734, "learning_rate": 1.6816072351406644e-05, "loss": 2.2257, "step": 88400 }, { "epoch": 3.3553230209281164, "grad_norm": 10.047802925109863, "learning_rate": 1.680638266336069e-05, "loss": 2.2156, "step": 88500 }, { "epoch": 3.3591143463754927, "grad_norm": 11.181809425354004, "learning_rate": 1.6796681054301865e-05, "loss": 2.1842, "step": 88600 }, { "epoch": 3.3629056718228694, "grad_norm": 10.280974388122559, "learning_rate": 1.6786967541222015e-05, "loss": 2.1736, "step": 88700 }, { "epoch": 3.3666969972702456, "grad_norm": 9.589885711669922, "learning_rate": 1.6777242141133837e-05, "loss": 2.1961, "step": 88800 }, { "epoch": 3.370488322717622, "grad_norm": 11.207756996154785, "learning_rate": 1.676750487107084e-05, "loss": 2.186, "step": 88900 }, { "epoch": 3.3742796481649986, "grad_norm": 10.144556045532227, "learning_rate": 1.6757755748087333e-05, "loss": 2.1856, "step": 89000 }, { "epoch": 3.378070973612375, "grad_norm": 10.009651184082031, "learning_rate": 1.6747994789258385e-05, "loss": 2.1572, "step": 89100 }, { "epoch": 3.381862299059751, "grad_norm": 10.207440376281738, "learning_rate": 1.673822201167978e-05, "loss": 2.181, "step": 89200 }, { "epoch": 3.385653624507128, "grad_norm": 11.25203800201416, "learning_rate": 1.6728437432468018e-05, "loss": 2.1784, "step": 89300 }, { "epoch": 3.389444949954504, "grad_norm": 9.874098777770996, "learning_rate": 1.6718641068760263e-05, "loss": 2.1654, "step": 89400 }, { "epoch": 3.3932362754018803, "grad_norm": 11.288037300109863, "learning_rate": 1.6708832937714312e-05, "loss": 2.1974, "step": 89500 }, { "epoch": 3.397027600849257, "grad_norm": 12.287949562072754, "learning_rate": 1.669901305650859e-05, "loss": 2.1966, "step": 89600 }, { "epoch": 3.4008189262966333, "grad_norm": 10.651175498962402, "learning_rate": 1.668918144234208e-05, "loss": 2.1664, "step": 89700 }, { "epoch": 3.4046102517440096, "grad_norm": 10.924527168273926, "learning_rate": 1.6679338112434332e-05, "loss": 2.1835, "step": 89800 }, { "epoch": 3.4084015771913863, "grad_norm": 10.804643630981445, "learning_rate": 1.66694830840254e-05, "loss": 2.1726, "step": 89900 }, { "epoch": 3.4121929026387625, "grad_norm": 10.51124382019043, "learning_rate": 1.665961637437585e-05, "loss": 2.1888, "step": 90000 }, { "epoch": 3.4159842280861388, "grad_norm": 10.605982780456543, "learning_rate": 1.6649738000766678e-05, "loss": 2.1946, "step": 90100 }, { "epoch": 3.4197755535335155, "grad_norm": 11.442093849182129, "learning_rate": 1.663984798049933e-05, "loss": 2.2558, "step": 90200 }, { "epoch": 3.4235668789808917, "grad_norm": 11.297812461853027, "learning_rate": 1.6629946330895647e-05, "loss": 2.1721, "step": 90300 }, { "epoch": 3.427358204428268, "grad_norm": 9.919473648071289, "learning_rate": 1.6620033069297833e-05, "loss": 2.1909, "step": 90400 }, { "epoch": 3.4311495298756447, "grad_norm": 9.822258949279785, "learning_rate": 1.6610108213068428e-05, "loss": 2.1826, "step": 90500 }, { "epoch": 3.434940855323021, "grad_norm": 11.015668869018555, "learning_rate": 1.6600171779590288e-05, "loss": 2.1642, "step": 90600 }, { "epoch": 3.438732180770397, "grad_norm": 11.665396690368652, "learning_rate": 1.6590223786266545e-05, "loss": 2.2078, "step": 90700 }, { "epoch": 3.442523506217774, "grad_norm": 11.159843444824219, "learning_rate": 1.6580264250520564e-05, "loss": 2.2057, "step": 90800 }, { "epoch": 3.44631483166515, "grad_norm": 10.427332878112793, "learning_rate": 1.657029318979594e-05, "loss": 2.1964, "step": 90900 }, { "epoch": 3.4501061571125264, "grad_norm": 10.83226490020752, "learning_rate": 1.656031062155645e-05, "loss": 2.1873, "step": 91000 }, { "epoch": 3.4538974825599027, "grad_norm": 10.850767135620117, "learning_rate": 1.6550316563286034e-05, "loss": 2.195, "step": 91100 }, { "epoch": 3.4576888080072794, "grad_norm": 11.248734474182129, "learning_rate": 1.654031103248873e-05, "loss": 2.1998, "step": 91200 }, { "epoch": 3.4614801334546557, "grad_norm": 11.751529693603516, "learning_rate": 1.6530294046688697e-05, "loss": 2.1617, "step": 91300 }, { "epoch": 3.4652714589020324, "grad_norm": 11.94430923461914, "learning_rate": 1.6520265623430143e-05, "loss": 2.2133, "step": 91400 }, { "epoch": 3.4690627843494086, "grad_norm": 10.22391414642334, "learning_rate": 1.6510225780277313e-05, "loss": 2.1933, "step": 91500 }, { "epoch": 3.472854109796785, "grad_norm": 10.774738311767578, "learning_rate": 1.6500174534814452e-05, "loss": 2.1916, "step": 91600 }, { "epoch": 3.476645435244161, "grad_norm": 11.335387229919434, "learning_rate": 1.6490111904645777e-05, "loss": 2.1996, "step": 91700 }, { "epoch": 3.480436760691538, "grad_norm": 12.09738826751709, "learning_rate": 1.648003790739544e-05, "loss": 2.1759, "step": 91800 }, { "epoch": 3.484228086138914, "grad_norm": 10.800116539001465, "learning_rate": 1.6469952560707505e-05, "loss": 2.1753, "step": 91900 }, { "epoch": 3.488019411586291, "grad_norm": 11.237314224243164, "learning_rate": 1.6459855882245914e-05, "loss": 2.1772, "step": 92000 }, { "epoch": 3.491810737033667, "grad_norm": 9.903791427612305, "learning_rate": 1.6449747889694458e-05, "loss": 2.2102, "step": 92100 }, { "epoch": 3.4956020624810433, "grad_norm": 11.344173431396484, "learning_rate": 1.6439628600756743e-05, "loss": 2.1989, "step": 92200 }, { "epoch": 3.4993933879284196, "grad_norm": 9.604011535644531, "learning_rate": 1.6429498033156153e-05, "loss": 2.1533, "step": 92300 }, { "epoch": 3.5031847133757963, "grad_norm": 11.808192253112793, "learning_rate": 1.641935620463584e-05, "loss": 2.1906, "step": 92400 }, { "epoch": 3.5069760388231725, "grad_norm": 11.220489501953125, "learning_rate": 1.6409203132958664e-05, "loss": 2.168, "step": 92500 }, { "epoch": 3.5107673642705493, "grad_norm": 11.173920631408691, "learning_rate": 1.6399038835907184e-05, "loss": 2.1728, "step": 92600 }, { "epoch": 3.5145586897179255, "grad_norm": 10.632450103759766, "learning_rate": 1.6388863331283622e-05, "loss": 2.1811, "step": 92700 }, { "epoch": 3.5183500151653018, "grad_norm": 11.022995948791504, "learning_rate": 1.6378676636909823e-05, "loss": 2.1647, "step": 92800 }, { "epoch": 3.522141340612678, "grad_norm": 11.686514854431152, "learning_rate": 1.6368478770627242e-05, "loss": 2.1764, "step": 92900 }, { "epoch": 3.5259326660600547, "grad_norm": 11.46933364868164, "learning_rate": 1.6358269750296886e-05, "loss": 2.1833, "step": 93000 }, { "epoch": 3.529723991507431, "grad_norm": 11.132427215576172, "learning_rate": 1.6348049593799302e-05, "loss": 2.1778, "step": 93100 }, { "epoch": 3.5335153169548073, "grad_norm": 10.660680770874023, "learning_rate": 1.633781831903455e-05, "loss": 2.1747, "step": 93200 }, { "epoch": 3.537306642402184, "grad_norm": 11.281957626342773, "learning_rate": 1.6327575943922157e-05, "loss": 2.1781, "step": 93300 }, { "epoch": 3.54109796784956, "grad_norm": 11.38676643371582, "learning_rate": 1.631732248640109e-05, "loss": 2.1745, "step": 93400 }, { "epoch": 3.5448892932969365, "grad_norm": 10.602873802185059, "learning_rate": 1.6307057964429726e-05, "loss": 2.1632, "step": 93500 }, { "epoch": 3.548680618744313, "grad_norm": 10.749499320983887, "learning_rate": 1.6296782395985823e-05, "loss": 2.146, "step": 93600 }, { "epoch": 3.5524719441916894, "grad_norm": 10.26092529296875, "learning_rate": 1.6286495799066488e-05, "loss": 2.1614, "step": 93700 }, { "epoch": 3.5562632696390657, "grad_norm": 12.2695894241333, "learning_rate": 1.627619819168814e-05, "loss": 2.1984, "step": 93800 }, { "epoch": 3.5600545950864424, "grad_norm": 11.573020935058594, "learning_rate": 1.6265889591886487e-05, "loss": 2.1801, "step": 93900 }, { "epoch": 3.5638459205338187, "grad_norm": 10.343268394470215, "learning_rate": 1.6255570017716486e-05, "loss": 2.1556, "step": 94000 }, { "epoch": 3.567637245981195, "grad_norm": 11.381085395812988, "learning_rate": 1.624523948725231e-05, "loss": 2.1939, "step": 94100 }, { "epoch": 3.571428571428571, "grad_norm": 10.95803451538086, "learning_rate": 1.6234898018587336e-05, "loss": 2.1862, "step": 94200 }, { "epoch": 3.575219896875948, "grad_norm": 10.743924140930176, "learning_rate": 1.6224545629834082e-05, "loss": 2.163, "step": 94300 }, { "epoch": 3.579011222323324, "grad_norm": 11.401995658874512, "learning_rate": 1.6214182339124208e-05, "loss": 2.1381, "step": 94400 }, { "epoch": 3.582802547770701, "grad_norm": 9.468779563903809, "learning_rate": 1.6203808164608454e-05, "loss": 2.1548, "step": 94500 }, { "epoch": 3.586593873218077, "grad_norm": 11.444217681884766, "learning_rate": 1.6193423124456624e-05, "loss": 2.1768, "step": 94600 }, { "epoch": 3.5903851986654534, "grad_norm": 11.838251113891602, "learning_rate": 1.6183027236857566e-05, "loss": 2.158, "step": 94700 }, { "epoch": 3.5941765241128296, "grad_norm": 11.16830062866211, "learning_rate": 1.617262052001911e-05, "loss": 2.1946, "step": 94800 }, { "epoch": 3.5979678495602063, "grad_norm": 9.855988502502441, "learning_rate": 1.6162202992168066e-05, "loss": 2.1434, "step": 94900 }, { "epoch": 3.6017591750075826, "grad_norm": 12.42287826538086, "learning_rate": 1.6151774671550176e-05, "loss": 2.1683, "step": 95000 }, { "epoch": 3.6055505004549593, "grad_norm": 10.651538848876953, "learning_rate": 1.6141335576430074e-05, "loss": 2.1623, "step": 95100 }, { "epoch": 3.6093418259023355, "grad_norm": 11.422640800476074, "learning_rate": 1.6130885725091276e-05, "loss": 2.1583, "step": 95200 }, { "epoch": 3.613133151349712, "grad_norm": 9.847782135009766, "learning_rate": 1.6120425135836133e-05, "loss": 2.1799, "step": 95300 }, { "epoch": 3.616924476797088, "grad_norm": 11.603657722473145, "learning_rate": 1.6109953826985807e-05, "loss": 2.1551, "step": 95400 }, { "epoch": 3.6207158022444648, "grad_norm": 10.780939102172852, "learning_rate": 1.6099471816880232e-05, "loss": 2.197, "step": 95500 }, { "epoch": 3.624507127691841, "grad_norm": 10.223684310913086, "learning_rate": 1.608897912387808e-05, "loss": 2.1867, "step": 95600 }, { "epoch": 3.6282984531392177, "grad_norm": 11.094612121582031, "learning_rate": 1.6078475766356746e-05, "loss": 2.1698, "step": 95700 }, { "epoch": 3.632089778586594, "grad_norm": 11.526836395263672, "learning_rate": 1.6067961762712284e-05, "loss": 2.1759, "step": 95800 }, { "epoch": 3.6358811040339702, "grad_norm": 9.965375900268555, "learning_rate": 1.605743713135942e-05, "loss": 2.1358, "step": 95900 }, { "epoch": 3.6396724294813465, "grad_norm": 10.4207181930542, "learning_rate": 1.6046901890731465e-05, "loss": 2.1791, "step": 96000 }, { "epoch": 3.643463754928723, "grad_norm": 11.209970474243164, "learning_rate": 1.6036356059280333e-05, "loss": 2.1669, "step": 96100 }, { "epoch": 3.6472550803760995, "grad_norm": 10.588861465454102, "learning_rate": 1.6025799655476482e-05, "loss": 2.147, "step": 96200 }, { "epoch": 3.6510464058234757, "grad_norm": 11.17502212524414, "learning_rate": 1.601523269780888e-05, "loss": 2.1672, "step": 96300 }, { "epoch": 3.6548377312708524, "grad_norm": 10.606661796569824, "learning_rate": 1.6004655204784987e-05, "loss": 2.1553, "step": 96400 }, { "epoch": 3.6586290567182287, "grad_norm": 11.1652193069458, "learning_rate": 1.599406719493071e-05, "loss": 2.1922, "step": 96500 }, { "epoch": 3.662420382165605, "grad_norm": 10.518769264221191, "learning_rate": 1.5983468686790386e-05, "loss": 2.1632, "step": 96600 }, { "epoch": 3.6662117076129817, "grad_norm": 10.307948112487793, "learning_rate": 1.5972859698926724e-05, "loss": 2.1652, "step": 96700 }, { "epoch": 3.670003033060358, "grad_norm": 10.779983520507812, "learning_rate": 1.59622402499208e-05, "loss": 2.145, "step": 96800 }, { "epoch": 3.673794358507734, "grad_norm": 11.095219612121582, "learning_rate": 1.5951610358372002e-05, "loss": 2.1514, "step": 96900 }, { "epoch": 3.677585683955111, "grad_norm": 10.915760040283203, "learning_rate": 1.5940970042898017e-05, "loss": 2.1767, "step": 97000 }, { "epoch": 3.681377009402487, "grad_norm": 11.00518798828125, "learning_rate": 1.5930319322134783e-05, "loss": 2.1993, "step": 97100 }, { "epoch": 3.6851683348498634, "grad_norm": 10.921394348144531, "learning_rate": 1.5919658214736463e-05, "loss": 2.1481, "step": 97200 }, { "epoch": 3.6889596602972397, "grad_norm": 10.580720901489258, "learning_rate": 1.5908986739375417e-05, "loss": 2.2099, "step": 97300 }, { "epoch": 3.6927509857446164, "grad_norm": 11.143759727478027, "learning_rate": 1.5898304914742156e-05, "loss": 2.149, "step": 97400 }, { "epoch": 3.6965423111919926, "grad_norm": 12.072694778442383, "learning_rate": 1.5887612759545326e-05, "loss": 2.1614, "step": 97500 }, { "epoch": 3.7003336366393693, "grad_norm": 11.177020072937012, "learning_rate": 1.5876910292511652e-05, "loss": 2.1321, "step": 97600 }, { "epoch": 3.7041249620867456, "grad_norm": 11.169164657592773, "learning_rate": 1.5866197532385936e-05, "loss": 2.1633, "step": 97700 }, { "epoch": 3.707916287534122, "grad_norm": 9.892111778259277, "learning_rate": 1.5855474497930998e-05, "loss": 2.1509, "step": 97800 }, { "epoch": 3.711707612981498, "grad_norm": 10.73902416229248, "learning_rate": 1.5844741207927662e-05, "loss": 2.1507, "step": 97900 }, { "epoch": 3.715498938428875, "grad_norm": 11.225214004516602, "learning_rate": 1.5833997681174697e-05, "loss": 2.1586, "step": 98000 }, { "epoch": 3.719290263876251, "grad_norm": 10.778451919555664, "learning_rate": 1.582324393648882e-05, "loss": 2.1564, "step": 98100 }, { "epoch": 3.7230815893236278, "grad_norm": 11.017683029174805, "learning_rate": 1.5812479992704634e-05, "loss": 2.1411, "step": 98200 }, { "epoch": 3.726872914771004, "grad_norm": 13.926115989685059, "learning_rate": 1.58017058686746e-05, "loss": 2.1258, "step": 98300 }, { "epoch": 3.7306642402183803, "grad_norm": 10.288016319274902, "learning_rate": 1.579092158326903e-05, "loss": 2.1743, "step": 98400 }, { "epoch": 3.7344555656657565, "grad_norm": 10.850835800170898, "learning_rate": 1.578012715537601e-05, "loss": 2.1308, "step": 98500 }, { "epoch": 3.7382468911131332, "grad_norm": 10.27934741973877, "learning_rate": 1.57693226039014e-05, "loss": 2.14, "step": 98600 }, { "epoch": 3.7420382165605095, "grad_norm": 10.327507972717285, "learning_rate": 1.5758507947768794e-05, "loss": 2.172, "step": 98700 }, { "epoch": 3.745829542007886, "grad_norm": 10.979050636291504, "learning_rate": 1.5747683205919475e-05, "loss": 2.159, "step": 98800 }, { "epoch": 3.7496208674552625, "grad_norm": 11.211260795593262, "learning_rate": 1.57368483973124e-05, "loss": 2.1966, "step": 98900 }, { "epoch": 3.7534121929026387, "grad_norm": 10.73819637298584, "learning_rate": 1.5726003540924154e-05, "loss": 2.159, "step": 99000 }, { "epoch": 3.757203518350015, "grad_norm": 12.128928184509277, "learning_rate": 1.5715148655748916e-05, "loss": 2.0941, "step": 99100 }, { "epoch": 3.7609948437973917, "grad_norm": 11.638015747070312, "learning_rate": 1.5704283760798432e-05, "loss": 2.141, "step": 99200 }, { "epoch": 3.764786169244768, "grad_norm": 10.704824447631836, "learning_rate": 1.5693408875101982e-05, "loss": 2.1455, "step": 99300 }, { "epoch": 3.7685774946921446, "grad_norm": 11.37407398223877, "learning_rate": 1.5682524017706346e-05, "loss": 2.1636, "step": 99400 }, { "epoch": 3.772368820139521, "grad_norm": 10.8186616897583, "learning_rate": 1.5671629207675762e-05, "loss": 2.1772, "step": 99500 }, { "epoch": 3.776160145586897, "grad_norm": 10.355612754821777, "learning_rate": 1.5660724464091906e-05, "loss": 2.1373, "step": 99600 }, { "epoch": 3.7799514710342734, "grad_norm": 9.7418794631958, "learning_rate": 1.5649809806053847e-05, "loss": 2.1932, "step": 99700 }, { "epoch": 3.78374279648165, "grad_norm": 11.162092208862305, "learning_rate": 1.5638885252678025e-05, "loss": 2.1381, "step": 99800 }, { "epoch": 3.7875341219290264, "grad_norm": 10.994059562683105, "learning_rate": 1.5627950823098205e-05, "loss": 2.1502, "step": 99900 }, { "epoch": 3.7913254473764026, "grad_norm": 10.995444297790527, "learning_rate": 1.561700653646545e-05, "loss": 2.1796, "step": 100000 }, { "epoch": 3.7951167728237793, "grad_norm": 10.901473045349121, "learning_rate": 1.560605241194809e-05, "loss": 2.1627, "step": 100100 }, { "epoch": 3.7989080982711556, "grad_norm": 11.722579002380371, "learning_rate": 1.559508846873168e-05, "loss": 2.1723, "step": 100200 }, { "epoch": 3.802699423718532, "grad_norm": 10.581812858581543, "learning_rate": 1.5584114726018987e-05, "loss": 2.0966, "step": 100300 }, { "epoch": 3.806490749165908, "grad_norm": 11.551488876342773, "learning_rate": 1.5573131203029925e-05, "loss": 2.1492, "step": 100400 }, { "epoch": 3.810282074613285, "grad_norm": 10.69703197479248, "learning_rate": 1.556213791900154e-05, "loss": 2.164, "step": 100500 }, { "epoch": 3.814073400060661, "grad_norm": 10.121752738952637, "learning_rate": 1.5551134893187983e-05, "loss": 2.1738, "step": 100600 }, { "epoch": 3.817864725508038, "grad_norm": 10.302600860595703, "learning_rate": 1.554012214486046e-05, "loss": 2.1549, "step": 100700 }, { "epoch": 3.821656050955414, "grad_norm": 10.538665771484375, "learning_rate": 1.5529099693307205e-05, "loss": 2.1067, "step": 100800 }, { "epoch": 3.8254473764027903, "grad_norm": 11.270750045776367, "learning_rate": 1.551806755783345e-05, "loss": 2.1641, "step": 100900 }, { "epoch": 3.8292387018501666, "grad_norm": 11.1707124710083, "learning_rate": 1.550702575776138e-05, "loss": 2.115, "step": 101000 }, { "epoch": 3.8330300272975433, "grad_norm": 11.29411506652832, "learning_rate": 1.5495974312430123e-05, "loss": 2.0937, "step": 101100 }, { "epoch": 3.8368213527449195, "grad_norm": 11.973257064819336, "learning_rate": 1.5484913241195688e-05, "loss": 2.1389, "step": 101200 }, { "epoch": 3.8406126781922962, "grad_norm": 12.167219161987305, "learning_rate": 1.5473842563430944e-05, "loss": 2.1197, "step": 101300 }, { "epoch": 3.8444040036396725, "grad_norm": 12.713446617126465, "learning_rate": 1.546276229852559e-05, "loss": 2.1852, "step": 101400 }, { "epoch": 3.8481953290870488, "grad_norm": 11.35719108581543, "learning_rate": 1.5451672465886112e-05, "loss": 2.136, "step": 101500 }, { "epoch": 3.851986654534425, "grad_norm": 11.35055923461914, "learning_rate": 1.544057308493575e-05, "loss": 2.1654, "step": 101600 }, { "epoch": 3.8557779799818017, "grad_norm": 10.609726905822754, "learning_rate": 1.542946417511448e-05, "loss": 2.1411, "step": 101700 }, { "epoch": 3.859569305429178, "grad_norm": 10.404659271240234, "learning_rate": 1.5418345755878955e-05, "loss": 2.1155, "step": 101800 }, { "epoch": 3.8633606308765547, "grad_norm": 11.297307014465332, "learning_rate": 1.5407217846702493e-05, "loss": 2.1438, "step": 101900 }, { "epoch": 3.867151956323931, "grad_norm": 10.898469924926758, "learning_rate": 1.539608046707502e-05, "loss": 2.1366, "step": 102000 }, { "epoch": 3.870943281771307, "grad_norm": 10.74442195892334, "learning_rate": 1.5384933636503063e-05, "loss": 2.1446, "step": 102100 }, { "epoch": 3.8747346072186835, "grad_norm": 11.316722869873047, "learning_rate": 1.5373777374509694e-05, "loss": 2.1345, "step": 102200 }, { "epoch": 3.87852593266606, "grad_norm": 10.431078910827637, "learning_rate": 1.5362611700634505e-05, "loss": 2.1335, "step": 102300 }, { "epoch": 3.8823172581134364, "grad_norm": 10.6659517288208, "learning_rate": 1.535143663443357e-05, "loss": 2.1387, "step": 102400 }, { "epoch": 3.886108583560813, "grad_norm": 10.353126525878906, "learning_rate": 1.5340252195479427e-05, "loss": 2.1284, "step": 102500 }, { "epoch": 3.8898999090081894, "grad_norm": 12.218612670898438, "learning_rate": 1.5329058403361003e-05, "loss": 2.1274, "step": 102600 }, { "epoch": 3.8936912344555656, "grad_norm": 10.593243598937988, "learning_rate": 1.5317855277683636e-05, "loss": 2.154, "step": 102700 }, { "epoch": 3.897482559902942, "grad_norm": 11.26771354675293, "learning_rate": 1.530664283806899e-05, "loss": 2.1293, "step": 102800 }, { "epoch": 3.9012738853503186, "grad_norm": 11.112632751464844, "learning_rate": 1.5295421104155047e-05, "loss": 2.1322, "step": 102900 }, { "epoch": 3.905065210797695, "grad_norm": 9.875391960144043, "learning_rate": 1.528419009559608e-05, "loss": 2.1394, "step": 103000 }, { "epoch": 3.908856536245071, "grad_norm": 11.363945007324219, "learning_rate": 1.5272949832062583e-05, "loss": 2.146, "step": 103100 }, { "epoch": 3.912647861692448, "grad_norm": 9.702890396118164, "learning_rate": 1.526170033324129e-05, "loss": 2.1104, "step": 103200 }, { "epoch": 3.916439187139824, "grad_norm": 11.023165702819824, "learning_rate": 1.5250441618835074e-05, "loss": 2.1393, "step": 103300 }, { "epoch": 3.9202305125872003, "grad_norm": 11.855907440185547, "learning_rate": 1.5239173708562978e-05, "loss": 2.1481, "step": 103400 }, { "epoch": 3.924021838034577, "grad_norm": 10.756991386413574, "learning_rate": 1.522789662216014e-05, "loss": 2.1497, "step": 103500 }, { "epoch": 3.9278131634819533, "grad_norm": 10.655333518981934, "learning_rate": 1.5216610379377765e-05, "loss": 2.1368, "step": 103600 }, { "epoch": 3.9316044889293296, "grad_norm": 11.413888931274414, "learning_rate": 1.5205314999983104e-05, "loss": 2.152, "step": 103700 }, { "epoch": 3.9353958143767063, "grad_norm": 9.739007949829102, "learning_rate": 1.5194010503759403e-05, "loss": 2.1261, "step": 103800 }, { "epoch": 3.9391871398240825, "grad_norm": 10.531883239746094, "learning_rate": 1.5182696910505883e-05, "loss": 2.1438, "step": 103900 }, { "epoch": 3.942978465271459, "grad_norm": 11.078007698059082, "learning_rate": 1.5171374240037688e-05, "loss": 2.1178, "step": 104000 }, { "epoch": 3.946769790718835, "grad_norm": 11.209328651428223, "learning_rate": 1.516004251218587e-05, "loss": 2.1271, "step": 104100 }, { "epoch": 3.9505611161662118, "grad_norm": 11.715791702270508, "learning_rate": 1.514870174679734e-05, "loss": 2.1155, "step": 104200 }, { "epoch": 3.954352441613588, "grad_norm": 11.203324317932129, "learning_rate": 1.5137351963734834e-05, "loss": 2.1301, "step": 104300 }, { "epoch": 3.9581437670609647, "grad_norm": 10.675749778747559, "learning_rate": 1.5125993182876895e-05, "loss": 2.1433, "step": 104400 }, { "epoch": 3.961935092508341, "grad_norm": 11.353755950927734, "learning_rate": 1.5114625424117805e-05, "loss": 2.1245, "step": 104500 }, { "epoch": 3.9657264179557172, "grad_norm": 11.362961769104004, "learning_rate": 1.510324870736759e-05, "loss": 2.1089, "step": 104600 }, { "epoch": 3.9695177434030935, "grad_norm": 10.52084732055664, "learning_rate": 1.5091863052551954e-05, "loss": 2.138, "step": 104700 }, { "epoch": 3.97330906885047, "grad_norm": 10.043548583984375, "learning_rate": 1.5080468479612259e-05, "loss": 2.1593, "step": 104800 }, { "epoch": 3.9771003942978465, "grad_norm": 10.941576957702637, "learning_rate": 1.5069065008505483e-05, "loss": 2.1357, "step": 104900 }, { "epoch": 3.980891719745223, "grad_norm": 12.165728569030762, "learning_rate": 1.5057652659204198e-05, "loss": 2.1666, "step": 105000 }, { "epoch": 3.9846830451925994, "grad_norm": 10.201214790344238, "learning_rate": 1.5046231451696516e-05, "loss": 2.1162, "step": 105100 }, { "epoch": 3.9884743706399757, "grad_norm": 11.426102638244629, "learning_rate": 1.5034801405986068e-05, "loss": 2.1996, "step": 105200 }, { "epoch": 3.992265696087352, "grad_norm": 11.485660552978516, "learning_rate": 1.5023362542091962e-05, "loss": 2.1204, "step": 105300 }, { "epoch": 3.9960570215347286, "grad_norm": 11.895570755004883, "learning_rate": 1.5011914880048753e-05, "loss": 2.1554, "step": 105400 }, { "epoch": 3.999848346982105, "grad_norm": 10.690635681152344, "learning_rate": 1.5000458439906407e-05, "loss": 2.1399, "step": 105500 }, { "epoch": 4.0, "eval_accuracy": 0.5015414253623119, "eval_loss": 2.2586095333099365, "eval_runtime": 932.5916, "eval_samples_per_second": 905.031, "eval_steps_per_second": 7.071, "step": 105504 }, { "epoch": 4.003639672429482, "grad_norm": 10.491771697998047, "learning_rate": 1.4988993241730256e-05, "loss": 1.9326, "step": 105600 }, { "epoch": 4.007430997876858, "grad_norm": 11.346717834472656, "learning_rate": 1.4977519305600989e-05, "loss": 1.9135, "step": 105700 }, { "epoch": 4.011222323324234, "grad_norm": 11.817505836486816, "learning_rate": 1.496603665161458e-05, "loss": 1.9212, "step": 105800 }, { "epoch": 4.01501364877161, "grad_norm": 10.135194778442383, "learning_rate": 1.4954545299882278e-05, "loss": 1.9222, "step": 105900 }, { "epoch": 4.018804974218987, "grad_norm": 10.500676155090332, "learning_rate": 1.4943045270530569e-05, "loss": 1.914, "step": 106000 }, { "epoch": 4.022596299666364, "grad_norm": 10.659652709960938, "learning_rate": 1.4931536583701134e-05, "loss": 1.9132, "step": 106100 }, { "epoch": 4.02638762511374, "grad_norm": 9.949440002441406, "learning_rate": 1.4920019259550824e-05, "loss": 1.9008, "step": 106200 }, { "epoch": 4.030178950561116, "grad_norm": 10.805390357971191, "learning_rate": 1.4908493318251604e-05, "loss": 1.8923, "step": 106300 }, { "epoch": 4.033970276008493, "grad_norm": 11.208863258361816, "learning_rate": 1.4896958779990549e-05, "loss": 1.9279, "step": 106400 }, { "epoch": 4.037761601455869, "grad_norm": 10.365504264831543, "learning_rate": 1.4885415664969776e-05, "loss": 1.9341, "step": 106500 }, { "epoch": 4.041552926903245, "grad_norm": 11.66167163848877, "learning_rate": 1.4873863993406428e-05, "loss": 1.9375, "step": 106600 }, { "epoch": 4.045344252350621, "grad_norm": 10.387350082397461, "learning_rate": 1.4862303785532643e-05, "loss": 1.9543, "step": 106700 }, { "epoch": 4.0491355777979985, "grad_norm": 11.573199272155762, "learning_rate": 1.4850735061595496e-05, "loss": 1.917, "step": 106800 }, { "epoch": 4.052926903245375, "grad_norm": 10.970754623413086, "learning_rate": 1.4839157841856986e-05, "loss": 1.9432, "step": 106900 }, { "epoch": 4.056718228692751, "grad_norm": 10.620475769042969, "learning_rate": 1.4827572146593992e-05, "loss": 1.9123, "step": 107000 }, { "epoch": 4.060509554140127, "grad_norm": 11.474949836730957, "learning_rate": 1.4815977996098234e-05, "loss": 1.924, "step": 107100 }, { "epoch": 4.0643008795875035, "grad_norm": 11.040342330932617, "learning_rate": 1.4804375410676244e-05, "loss": 1.9215, "step": 107200 }, { "epoch": 4.06809220503488, "grad_norm": 11.14341926574707, "learning_rate": 1.4792764410649328e-05, "loss": 1.9137, "step": 107300 }, { "epoch": 4.071883530482257, "grad_norm": 11.149221420288086, "learning_rate": 1.4781145016353519e-05, "loss": 1.9126, "step": 107400 }, { "epoch": 4.075674855929633, "grad_norm": 10.057478904724121, "learning_rate": 1.4769517248139569e-05, "loss": 1.9596, "step": 107500 }, { "epoch": 4.0794661813770094, "grad_norm": 12.40358829498291, "learning_rate": 1.4757881126372887e-05, "loss": 1.9266, "step": 107600 }, { "epoch": 4.083257506824386, "grad_norm": 11.125639915466309, "learning_rate": 1.474623667143351e-05, "loss": 1.9473, "step": 107700 }, { "epoch": 4.087048832271762, "grad_norm": 10.538691520690918, "learning_rate": 1.4734583903716078e-05, "loss": 1.9438, "step": 107800 }, { "epoch": 4.090840157719138, "grad_norm": 10.740836143493652, "learning_rate": 1.4722922843629782e-05, "loss": 1.9354, "step": 107900 }, { "epoch": 4.094631483166515, "grad_norm": 10.686046600341797, "learning_rate": 1.4711253511598346e-05, "loss": 1.9548, "step": 108000 }, { "epoch": 4.098422808613892, "grad_norm": 10.822304725646973, "learning_rate": 1.4699575928059976e-05, "loss": 1.932, "step": 108100 }, { "epoch": 4.102214134061268, "grad_norm": 9.834940910339355, "learning_rate": 1.4687890113467327e-05, "loss": 1.9135, "step": 108200 }, { "epoch": 4.106005459508644, "grad_norm": 10.90197467803955, "learning_rate": 1.4676196088287478e-05, "loss": 1.9447, "step": 108300 }, { "epoch": 4.10979678495602, "grad_norm": 10.136431694030762, "learning_rate": 1.4664493873001884e-05, "loss": 1.9288, "step": 108400 }, { "epoch": 4.113588110403397, "grad_norm": 10.868680953979492, "learning_rate": 1.4652783488106347e-05, "loss": 1.9388, "step": 108500 }, { "epoch": 4.117379435850774, "grad_norm": 11.148466110229492, "learning_rate": 1.4641064954110969e-05, "loss": 1.9368, "step": 108600 }, { "epoch": 4.12117076129815, "grad_norm": 11.8153715133667, "learning_rate": 1.4629338291540141e-05, "loss": 1.9395, "step": 108700 }, { "epoch": 4.124962086745526, "grad_norm": 10.775195121765137, "learning_rate": 1.4617603520932475e-05, "loss": 1.9037, "step": 108800 }, { "epoch": 4.128753412192903, "grad_norm": 10.796618461608887, "learning_rate": 1.4605860662840792e-05, "loss": 1.9428, "step": 108900 }, { "epoch": 4.132544737640279, "grad_norm": 11.219659805297852, "learning_rate": 1.4594109737832073e-05, "loss": 1.951, "step": 109000 }, { "epoch": 4.136336063087655, "grad_norm": 11.19848346710205, "learning_rate": 1.4582350766487437e-05, "loss": 1.9116, "step": 109100 }, { "epoch": 4.140127388535032, "grad_norm": 10.443684577941895, "learning_rate": 1.4570583769402085e-05, "loss": 1.9203, "step": 109200 }, { "epoch": 4.1439187139824085, "grad_norm": 10.545578956604004, "learning_rate": 1.4558808767185278e-05, "loss": 1.9355, "step": 109300 }, { "epoch": 4.147710039429785, "grad_norm": 11.244470596313477, "learning_rate": 1.45470257804603e-05, "loss": 1.9074, "step": 109400 }, { "epoch": 4.151501364877161, "grad_norm": 11.276346206665039, "learning_rate": 1.4535234829864417e-05, "loss": 1.9381, "step": 109500 }, { "epoch": 4.155292690324537, "grad_norm": 10.635172843933105, "learning_rate": 1.4523435936048847e-05, "loss": 1.8906, "step": 109600 }, { "epoch": 4.159084015771914, "grad_norm": 10.601938247680664, "learning_rate": 1.4511629119678716e-05, "loss": 1.9298, "step": 109700 }, { "epoch": 4.162875341219291, "grad_norm": 10.854449272155762, "learning_rate": 1.4499814401433028e-05, "loss": 1.9358, "step": 109800 }, { "epoch": 4.166666666666667, "grad_norm": 12.302973747253418, "learning_rate": 1.4487991802004625e-05, "loss": 1.9143, "step": 109900 }, { "epoch": 4.170457992114043, "grad_norm": 11.987966537475586, "learning_rate": 1.4476161342100152e-05, "loss": 1.9312, "step": 110000 }, { "epoch": 4.1742493175614195, "grad_norm": 12.077518463134766, "learning_rate": 1.4464323042440024e-05, "loss": 1.9388, "step": 110100 }, { "epoch": 4.178040643008796, "grad_norm": 10.604219436645508, "learning_rate": 1.4452476923758388e-05, "loss": 1.9321, "step": 110200 }, { "epoch": 4.181831968456172, "grad_norm": 11.18627643585205, "learning_rate": 1.4440623006803082e-05, "loss": 1.9554, "step": 110300 }, { "epoch": 4.185623293903548, "grad_norm": 10.951568603515625, "learning_rate": 1.4428761312335601e-05, "loss": 1.928, "step": 110400 }, { "epoch": 4.189414619350925, "grad_norm": 10.588134765625, "learning_rate": 1.4416891861131068e-05, "loss": 1.937, "step": 110500 }, { "epoch": 4.193205944798302, "grad_norm": 11.442967414855957, "learning_rate": 1.4405014673978185e-05, "loss": 1.9002, "step": 110600 }, { "epoch": 4.196997270245678, "grad_norm": 11.712267875671387, "learning_rate": 1.4393129771679208e-05, "loss": 1.9218, "step": 110700 }, { "epoch": 4.200788595693054, "grad_norm": 11.000720024108887, "learning_rate": 1.4381237175049904e-05, "loss": 1.9701, "step": 110800 }, { "epoch": 4.20457992114043, "grad_norm": 10.674052238464355, "learning_rate": 1.4369336904919511e-05, "loss": 1.8939, "step": 110900 }, { "epoch": 4.208371246587807, "grad_norm": 12.58296012878418, "learning_rate": 1.4357428982130719e-05, "loss": 1.931, "step": 111000 }, { "epoch": 4.212162572035184, "grad_norm": 9.671144485473633, "learning_rate": 1.4345513427539606e-05, "loss": 1.957, "step": 111100 }, { "epoch": 4.21595389748256, "grad_norm": 11.268157005310059, "learning_rate": 1.4333590262015633e-05, "loss": 1.9385, "step": 111200 }, { "epoch": 4.219745222929936, "grad_norm": 9.60888385772705, "learning_rate": 1.4321659506441577e-05, "loss": 1.9473, "step": 111300 }, { "epoch": 4.223536548377313, "grad_norm": 11.549823760986328, "learning_rate": 1.4309721181713511e-05, "loss": 1.8985, "step": 111400 }, { "epoch": 4.227327873824689, "grad_norm": 11.02795124053955, "learning_rate": 1.429777530874078e-05, "loss": 1.9284, "step": 111500 }, { "epoch": 4.231119199272065, "grad_norm": 12.101762771606445, "learning_rate": 1.4285821908445923e-05, "loss": 1.9047, "step": 111600 }, { "epoch": 4.234910524719442, "grad_norm": 9.515430450439453, "learning_rate": 1.4273861001764689e-05, "loss": 1.9439, "step": 111700 }, { "epoch": 4.2387018501668186, "grad_norm": 11.586247444152832, "learning_rate": 1.4261892609645958e-05, "loss": 1.9138, "step": 111800 }, { "epoch": 4.242493175614195, "grad_norm": 10.5563383102417, "learning_rate": 1.424991675305172e-05, "loss": 1.9331, "step": 111900 }, { "epoch": 4.246284501061571, "grad_norm": 12.3113374710083, "learning_rate": 1.4237933452957048e-05, "loss": 1.9739, "step": 112000 }, { "epoch": 4.250075826508947, "grad_norm": 10.923721313476562, "learning_rate": 1.422594273035005e-05, "loss": 1.9214, "step": 112100 }, { "epoch": 4.253867151956324, "grad_norm": 11.418472290039062, "learning_rate": 1.4213944606231823e-05, "loss": 1.933, "step": 112200 }, { "epoch": 4.257658477403701, "grad_norm": 11.344213485717773, "learning_rate": 1.4201939101616444e-05, "loss": 1.9203, "step": 112300 }, { "epoch": 4.261449802851077, "grad_norm": 11.372957229614258, "learning_rate": 1.4189926237530902e-05, "loss": 1.9246, "step": 112400 }, { "epoch": 4.265241128298453, "grad_norm": 11.557223320007324, "learning_rate": 1.4177906035015085e-05, "loss": 1.9346, "step": 112500 }, { "epoch": 4.2690324537458295, "grad_norm": 11.431739807128906, "learning_rate": 1.4165878515121729e-05, "loss": 1.936, "step": 112600 }, { "epoch": 4.272823779193206, "grad_norm": 10.40214729309082, "learning_rate": 1.415384369891639e-05, "loss": 1.911, "step": 112700 }, { "epoch": 4.276615104640582, "grad_norm": 9.965800285339355, "learning_rate": 1.4141801607477402e-05, "loss": 1.9651, "step": 112800 }, { "epoch": 4.280406430087959, "grad_norm": 10.631170272827148, "learning_rate": 1.412975226189583e-05, "loss": 1.9269, "step": 112900 }, { "epoch": 4.284197755535335, "grad_norm": 11.634657859802246, "learning_rate": 1.4117695683275464e-05, "loss": 1.9452, "step": 113000 }, { "epoch": 4.287989080982712, "grad_norm": 11.915509223937988, "learning_rate": 1.4105631892732754e-05, "loss": 1.9272, "step": 113100 }, { "epoch": 4.291780406430088, "grad_norm": 11.198233604431152, "learning_rate": 1.4093560911396772e-05, "loss": 1.9203, "step": 113200 }, { "epoch": 4.295571731877464, "grad_norm": 9.2674560546875, "learning_rate": 1.40814827604092e-05, "loss": 1.9311, "step": 113300 }, { "epoch": 4.2993630573248405, "grad_norm": 11.295232772827148, "learning_rate": 1.4069397460924259e-05, "loss": 1.9375, "step": 113400 }, { "epoch": 4.303154382772217, "grad_norm": 12.30738639831543, "learning_rate": 1.4057305034108713e-05, "loss": 1.9172, "step": 113500 }, { "epoch": 4.306945708219594, "grad_norm": 11.133030891418457, "learning_rate": 1.4045205501141782e-05, "loss": 1.9051, "step": 113600 }, { "epoch": 4.31073703366697, "grad_norm": 12.085975646972656, "learning_rate": 1.4033098883215163e-05, "loss": 1.9318, "step": 113700 }, { "epoch": 4.314528359114346, "grad_norm": 11.143294334411621, "learning_rate": 1.4020985201532935e-05, "loss": 1.9417, "step": 113800 }, { "epoch": 4.318319684561723, "grad_norm": 11.539852142333984, "learning_rate": 1.4008864477311564e-05, "loss": 1.9421, "step": 113900 }, { "epoch": 4.322111010009099, "grad_norm": 11.774057388305664, "learning_rate": 1.3996736731779843e-05, "loss": 1.9488, "step": 114000 }, { "epoch": 4.325902335456475, "grad_norm": 10.981790542602539, "learning_rate": 1.3984601986178862e-05, "loss": 1.9387, "step": 114100 }, { "epoch": 4.329693660903852, "grad_norm": 10.843935012817383, "learning_rate": 1.3972460261761986e-05, "loss": 1.9329, "step": 114200 }, { "epoch": 4.333484986351229, "grad_norm": 11.053011894226074, "learning_rate": 1.3960311579794783e-05, "loss": 1.9408, "step": 114300 }, { "epoch": 4.337276311798605, "grad_norm": 10.420573234558105, "learning_rate": 1.3948155961555019e-05, "loss": 1.9216, "step": 114400 }, { "epoch": 4.341067637245981, "grad_norm": 10.473284721374512, "learning_rate": 1.3935993428332602e-05, "loss": 1.9327, "step": 114500 }, { "epoch": 4.344858962693357, "grad_norm": 11.456507682800293, "learning_rate": 1.3923824001429557e-05, "loss": 1.9571, "step": 114600 }, { "epoch": 4.348650288140734, "grad_norm": 11.065102577209473, "learning_rate": 1.391164770215998e-05, "loss": 1.9444, "step": 114700 }, { "epoch": 4.352441613588111, "grad_norm": 12.202268600463867, "learning_rate": 1.3899464551850001e-05, "loss": 1.9449, "step": 114800 }, { "epoch": 4.356232939035487, "grad_norm": 11.401569366455078, "learning_rate": 1.3887274571837757e-05, "loss": 1.9797, "step": 114900 }, { "epoch": 4.360024264482863, "grad_norm": 9.881141662597656, "learning_rate": 1.3875077783473337e-05, "loss": 1.9197, "step": 115000 }, { "epoch": 4.3638155899302395, "grad_norm": 11.307880401611328, "learning_rate": 1.386287420811876e-05, "loss": 1.9086, "step": 115100 }, { "epoch": 4.367606915377616, "grad_norm": 11.50818157196045, "learning_rate": 1.3850663867147933e-05, "loss": 1.9004, "step": 115200 }, { "epoch": 4.371398240824992, "grad_norm": 11.159831047058105, "learning_rate": 1.3838446781946616e-05, "loss": 1.9307, "step": 115300 }, { "epoch": 4.375189566272369, "grad_norm": 11.2947998046875, "learning_rate": 1.3826222973912364e-05, "loss": 1.9604, "step": 115400 }, { "epoch": 4.3789808917197455, "grad_norm": 11.43346118927002, "learning_rate": 1.3813992464454526e-05, "loss": 1.9368, "step": 115500 }, { "epoch": 4.382772217167122, "grad_norm": 10.940098762512207, "learning_rate": 1.3801755274994184e-05, "loss": 1.9174, "step": 115600 }, { "epoch": 4.386563542614498, "grad_norm": 11.442471504211426, "learning_rate": 1.378951142696411e-05, "loss": 1.9412, "step": 115700 }, { "epoch": 4.390354868061874, "grad_norm": 10.503515243530273, "learning_rate": 1.3777260941808753e-05, "loss": 1.9191, "step": 115800 }, { "epoch": 4.3941461935092505, "grad_norm": 10.953397750854492, "learning_rate": 1.3765003840984166e-05, "loss": 1.9439, "step": 115900 }, { "epoch": 4.397937518956628, "grad_norm": 11.160265922546387, "learning_rate": 1.3752740145958013e-05, "loss": 1.9177, "step": 116000 }, { "epoch": 4.401728844404004, "grad_norm": 10.922441482543945, "learning_rate": 1.3740469878209491e-05, "loss": 1.9261, "step": 116100 }, { "epoch": 4.40552016985138, "grad_norm": 10.987194061279297, "learning_rate": 1.3728193059229313e-05, "loss": 1.9173, "step": 116200 }, { "epoch": 4.409311495298756, "grad_norm": 10.938337326049805, "learning_rate": 1.3715909710519668e-05, "loss": 1.9323, "step": 116300 }, { "epoch": 4.413102820746133, "grad_norm": 10.6255464553833, "learning_rate": 1.3703619853594177e-05, "loss": 1.9577, "step": 116400 }, { "epoch": 4.416894146193509, "grad_norm": 10.767485618591309, "learning_rate": 1.369132350997787e-05, "loss": 1.9219, "step": 116500 }, { "epoch": 4.420685471640885, "grad_norm": 10.336567878723145, "learning_rate": 1.3679020701207122e-05, "loss": 1.8901, "step": 116600 }, { "epoch": 4.424476797088262, "grad_norm": 11.28994083404541, "learning_rate": 1.3666711448829648e-05, "loss": 1.9689, "step": 116700 }, { "epoch": 4.428268122535639, "grad_norm": 10.334284782409668, "learning_rate": 1.3654395774404436e-05, "loss": 1.9256, "step": 116800 }, { "epoch": 4.432059447983015, "grad_norm": 11.982410430908203, "learning_rate": 1.3642073699501728e-05, "loss": 1.9215, "step": 116900 }, { "epoch": 4.435850773430391, "grad_norm": 14.58484172821045, "learning_rate": 1.3629745245702976e-05, "loss": 1.9328, "step": 117000 }, { "epoch": 4.439642098877767, "grad_norm": 10.952180862426758, "learning_rate": 1.3617410434600804e-05, "loss": 1.9367, "step": 117100 }, { "epoch": 4.443433424325144, "grad_norm": 11.426154136657715, "learning_rate": 1.360506928779897e-05, "loss": 1.9246, "step": 117200 }, { "epoch": 4.447224749772521, "grad_norm": 10.834053039550781, "learning_rate": 1.3592721826912324e-05, "loss": 1.9279, "step": 117300 }, { "epoch": 4.451016075219897, "grad_norm": 10.896519660949707, "learning_rate": 1.358036807356678e-05, "loss": 1.922, "step": 117400 }, { "epoch": 4.454807400667273, "grad_norm": 11.125563621520996, "learning_rate": 1.3568008049399278e-05, "loss": 1.9393, "step": 117500 }, { "epoch": 4.45859872611465, "grad_norm": 12.220664024353027, "learning_rate": 1.3555641776057729e-05, "loss": 1.9196, "step": 117600 }, { "epoch": 4.462390051562026, "grad_norm": 11.411626815795898, "learning_rate": 1.3543269275200996e-05, "loss": 1.8964, "step": 117700 }, { "epoch": 4.466181377009402, "grad_norm": 11.519455909729004, "learning_rate": 1.3530890568498853e-05, "loss": 1.9321, "step": 117800 }, { "epoch": 4.469972702456779, "grad_norm": 11.501522064208984, "learning_rate": 1.3518505677631933e-05, "loss": 1.9037, "step": 117900 }, { "epoch": 4.4737640279041555, "grad_norm": 10.664104461669922, "learning_rate": 1.3506114624291706e-05, "loss": 1.9346, "step": 118000 }, { "epoch": 4.477555353351532, "grad_norm": 12.200970649719238, "learning_rate": 1.3493717430180437e-05, "loss": 1.8909, "step": 118100 }, { "epoch": 4.481346678798908, "grad_norm": 11.85975456237793, "learning_rate": 1.3481314117011145e-05, "loss": 1.9261, "step": 118200 }, { "epoch": 4.485138004246284, "grad_norm": 11.915306091308594, "learning_rate": 1.3468904706507564e-05, "loss": 1.9092, "step": 118300 }, { "epoch": 4.4889293296936605, "grad_norm": 11.404444694519043, "learning_rate": 1.3456489220404106e-05, "loss": 1.9269, "step": 118400 }, { "epoch": 4.492720655141038, "grad_norm": 12.244658470153809, "learning_rate": 1.3444067680445834e-05, "loss": 1.9471, "step": 118500 }, { "epoch": 4.496511980588414, "grad_norm": 11.258451461791992, "learning_rate": 1.3431640108388397e-05, "loss": 1.9365, "step": 118600 }, { "epoch": 4.50030330603579, "grad_norm": 12.544364929199219, "learning_rate": 1.3419206525998026e-05, "loss": 1.94, "step": 118700 }, { "epoch": 4.5040946314831665, "grad_norm": 11.16454792022705, "learning_rate": 1.3406766955051471e-05, "loss": 1.9239, "step": 118800 }, { "epoch": 4.507885956930543, "grad_norm": 12.47703742980957, "learning_rate": 1.3394321417335963e-05, "loss": 1.9276, "step": 118900 }, { "epoch": 4.511677282377919, "grad_norm": 11.438806533813477, "learning_rate": 1.33818699346492e-05, "loss": 1.9008, "step": 119000 }, { "epoch": 4.515468607825296, "grad_norm": 11.39981460571289, "learning_rate": 1.336941252879928e-05, "loss": 1.9258, "step": 119100 }, { "epoch": 4.519259933272672, "grad_norm": 12.068925857543945, "learning_rate": 1.3356949221604683e-05, "loss": 1.8952, "step": 119200 }, { "epoch": 4.523051258720049, "grad_norm": 11.431039810180664, "learning_rate": 1.3344480034894214e-05, "loss": 1.8866, "step": 119300 }, { "epoch": 4.526842584167425, "grad_norm": 10.363943099975586, "learning_rate": 1.3332004990506986e-05, "loss": 1.9176, "step": 119400 }, { "epoch": 4.530633909614801, "grad_norm": 12.397361755371094, "learning_rate": 1.3319524110292368e-05, "loss": 1.8896, "step": 119500 }, { "epoch": 4.534425235062177, "grad_norm": 11.372133255004883, "learning_rate": 1.3307037416109947e-05, "loss": 1.9496, "step": 119600 }, { "epoch": 4.538216560509554, "grad_norm": 12.080363273620605, "learning_rate": 1.3294544929829503e-05, "loss": 1.9127, "step": 119700 }, { "epoch": 4.542007885956931, "grad_norm": 11.209160804748535, "learning_rate": 1.3282046673330944e-05, "loss": 1.8968, "step": 119800 }, { "epoch": 4.545799211404307, "grad_norm": 11.013001441955566, "learning_rate": 1.3269542668504301e-05, "loss": 1.9265, "step": 119900 }, { "epoch": 4.549590536851683, "grad_norm": 12.487321853637695, "learning_rate": 1.3257032937249659e-05, "loss": 1.9321, "step": 120000 }, { "epoch": 4.55338186229906, "grad_norm": 10.845413208007812, "learning_rate": 1.3244517501477145e-05, "loss": 1.9036, "step": 120100 }, { "epoch": 4.557173187746436, "grad_norm": 12.747235298156738, "learning_rate": 1.3231996383106865e-05, "loss": 1.9505, "step": 120200 }, { "epoch": 4.560964513193813, "grad_norm": 11.283121109008789, "learning_rate": 1.3219469604068888e-05, "loss": 1.9389, "step": 120300 }, { "epoch": 4.564755838641189, "grad_norm": 10.935291290283203, "learning_rate": 1.3206937186303194e-05, "loss": 1.9034, "step": 120400 }, { "epoch": 4.5685471640885655, "grad_norm": 13.923113822937012, "learning_rate": 1.3194399151759636e-05, "loss": 1.9109, "step": 120500 }, { "epoch": 4.572338489535942, "grad_norm": 11.842820167541504, "learning_rate": 1.3181855522397909e-05, "loss": 1.949, "step": 120600 }, { "epoch": 4.576129814983318, "grad_norm": 12.174302101135254, "learning_rate": 1.3169306320187498e-05, "loss": 1.9123, "step": 120700 }, { "epoch": 4.579921140430694, "grad_norm": 11.70753288269043, "learning_rate": 1.3156751567107664e-05, "loss": 1.9238, "step": 120800 }, { "epoch": 4.583712465878071, "grad_norm": 11.765509605407715, "learning_rate": 1.3144191285147374e-05, "loss": 1.9414, "step": 120900 }, { "epoch": 4.587503791325448, "grad_norm": 11.573448181152344, "learning_rate": 1.3131625496305291e-05, "loss": 1.9129, "step": 121000 }, { "epoch": 4.591295116772824, "grad_norm": 11.252988815307617, "learning_rate": 1.3119054222589715e-05, "loss": 1.9066, "step": 121100 }, { "epoch": 4.5950864422202, "grad_norm": 11.349669456481934, "learning_rate": 1.3106477486018556e-05, "loss": 1.9204, "step": 121200 }, { "epoch": 4.5988777676675765, "grad_norm": 11.811543464660645, "learning_rate": 1.3093895308619295e-05, "loss": 1.8861, "step": 121300 }, { "epoch": 4.602669093114953, "grad_norm": 10.95641040802002, "learning_rate": 1.3081307712428928e-05, "loss": 1.9102, "step": 121400 }, { "epoch": 4.606460418562329, "grad_norm": 10.530266761779785, "learning_rate": 1.3068714719493964e-05, "loss": 1.9454, "step": 121500 }, { "epoch": 4.610251744009706, "grad_norm": 12.134538650512695, "learning_rate": 1.3056116351870345e-05, "loss": 1.9016, "step": 121600 }, { "epoch": 4.614043069457082, "grad_norm": 11.605910301208496, "learning_rate": 1.3043512631623437e-05, "loss": 1.9135, "step": 121700 }, { "epoch": 4.617834394904459, "grad_norm": 11.290129661560059, "learning_rate": 1.3030903580827974e-05, "loss": 1.9471, "step": 121800 }, { "epoch": 4.621625720351835, "grad_norm": 10.559269905090332, "learning_rate": 1.3018289221568032e-05, "loss": 1.9358, "step": 121900 }, { "epoch": 4.625417045799211, "grad_norm": 10.500186920166016, "learning_rate": 1.3005669575936986e-05, "loss": 1.9425, "step": 122000 }, { "epoch": 4.6292083712465875, "grad_norm": 10.709505081176758, "learning_rate": 1.299304466603746e-05, "loss": 1.9218, "step": 122100 }, { "epoch": 4.632999696693965, "grad_norm": 10.840487480163574, "learning_rate": 1.2980414513981305e-05, "loss": 1.9066, "step": 122200 }, { "epoch": 4.636791022141341, "grad_norm": 10.977898597717285, "learning_rate": 1.2967779141889558e-05, "loss": 1.9148, "step": 122300 }, { "epoch": 4.640582347588717, "grad_norm": 12.445085525512695, "learning_rate": 1.2955138571892385e-05, "loss": 1.9475, "step": 122400 }, { "epoch": 4.644373673036093, "grad_norm": 11.903800964355469, "learning_rate": 1.2942492826129072e-05, "loss": 1.8928, "step": 122500 }, { "epoch": 4.64816499848347, "grad_norm": 10.243343353271484, "learning_rate": 1.292984192674796e-05, "loss": 1.9202, "step": 122600 }, { "epoch": 4.651956323930846, "grad_norm": 11.011580467224121, "learning_rate": 1.291718589590641e-05, "loss": 1.912, "step": 122700 }, { "epoch": 4.655747649378222, "grad_norm": 10.708888053894043, "learning_rate": 1.2904524755770789e-05, "loss": 1.8922, "step": 122800 }, { "epoch": 4.659538974825599, "grad_norm": 11.132603645324707, "learning_rate": 1.2891858528516398e-05, "loss": 1.9081, "step": 122900 }, { "epoch": 4.663330300272976, "grad_norm": 13.312323570251465, "learning_rate": 1.2879187236327452e-05, "loss": 1.9192, "step": 123000 }, { "epoch": 4.667121625720352, "grad_norm": 9.47733211517334, "learning_rate": 1.2866510901397042e-05, "loss": 1.8826, "step": 123100 }, { "epoch": 4.670912951167728, "grad_norm": 10.120606422424316, "learning_rate": 1.2853829545927076e-05, "loss": 1.9265, "step": 123200 }, { "epoch": 4.674704276615104, "grad_norm": 11.59416389465332, "learning_rate": 1.284114319212828e-05, "loss": 1.8659, "step": 123300 }, { "epoch": 4.6784956020624815, "grad_norm": 11.398164749145508, "learning_rate": 1.2828451862220108e-05, "loss": 1.9215, "step": 123400 }, { "epoch": 4.682286927509858, "grad_norm": 11.457133293151855, "learning_rate": 1.2815755578430745e-05, "loss": 1.9167, "step": 123500 }, { "epoch": 4.686078252957234, "grad_norm": 11.205010414123535, "learning_rate": 1.2803054362997047e-05, "loss": 1.9137, "step": 123600 }, { "epoch": 4.68986957840461, "grad_norm": 12.399345397949219, "learning_rate": 1.2790348238164512e-05, "loss": 1.934, "step": 123700 }, { "epoch": 4.6936609038519865, "grad_norm": 10.483298301696777, "learning_rate": 1.2777637226187234e-05, "loss": 1.9423, "step": 123800 }, { "epoch": 4.697452229299363, "grad_norm": 11.052417755126953, "learning_rate": 1.2764921349327864e-05, "loss": 1.9138, "step": 123900 }, { "epoch": 4.701243554746739, "grad_norm": 11.046278953552246, "learning_rate": 1.2752200629857577e-05, "loss": 1.9087, "step": 124000 }, { "epoch": 4.705034880194116, "grad_norm": 11.375521659851074, "learning_rate": 1.2739475090056028e-05, "loss": 1.9291, "step": 124100 }, { "epoch": 4.7088262056414925, "grad_norm": 11.460271835327148, "learning_rate": 1.2726744752211315e-05, "loss": 1.9194, "step": 124200 }, { "epoch": 4.712617531088869, "grad_norm": 12.008980751037598, "learning_rate": 1.271400963861994e-05, "loss": 1.9119, "step": 124300 }, { "epoch": 4.716408856536245, "grad_norm": 11.219669342041016, "learning_rate": 1.2701269771586769e-05, "loss": 1.9335, "step": 124400 }, { "epoch": 4.720200181983621, "grad_norm": 10.55120849609375, "learning_rate": 1.2688525173424992e-05, "loss": 1.9079, "step": 124500 }, { "epoch": 4.7239915074309975, "grad_norm": 11.582011222839355, "learning_rate": 1.2675775866456088e-05, "loss": 1.9287, "step": 124600 }, { "epoch": 4.727782832878375, "grad_norm": 12.234732627868652, "learning_rate": 1.2663021873009782e-05, "loss": 1.9226, "step": 124700 }, { "epoch": 4.731574158325751, "grad_norm": 10.824315071105957, "learning_rate": 1.2650263215424008e-05, "loss": 1.944, "step": 124800 }, { "epoch": 4.735365483773127, "grad_norm": 11.112320899963379, "learning_rate": 1.2637499916044865e-05, "loss": 1.9253, "step": 124900 }, { "epoch": 4.739156809220503, "grad_norm": 10.616378784179688, "learning_rate": 1.2624731997226584e-05, "loss": 1.8977, "step": 125000 }, { "epoch": 4.74294813466788, "grad_norm": 11.635412216186523, "learning_rate": 1.2611959481331497e-05, "loss": 1.9139, "step": 125100 }, { "epoch": 4.746739460115256, "grad_norm": 10.587671279907227, "learning_rate": 1.2599182390729965e-05, "loss": 1.9008, "step": 125200 }, { "epoch": 4.750530785562633, "grad_norm": 11.479141235351562, "learning_rate": 1.2586400747800383e-05, "loss": 1.927, "step": 125300 }, { "epoch": 4.754322111010009, "grad_norm": 11.282635688781738, "learning_rate": 1.2573614574929108e-05, "loss": 1.9345, "step": 125400 }, { "epoch": 4.758113436457386, "grad_norm": 10.284213066101074, "learning_rate": 1.2560823894510433e-05, "loss": 1.9327, "step": 125500 }, { "epoch": 4.761904761904762, "grad_norm": 12.295828819274902, "learning_rate": 1.2548028728946548e-05, "loss": 1.9214, "step": 125600 }, { "epoch": 4.765696087352138, "grad_norm": 9.698297500610352, "learning_rate": 1.2535229100647493e-05, "loss": 1.8864, "step": 125700 }, { "epoch": 4.769487412799514, "grad_norm": 10.334789276123047, "learning_rate": 1.2522425032031136e-05, "loss": 1.9155, "step": 125800 }, { "epoch": 4.773278738246891, "grad_norm": 10.851923942565918, "learning_rate": 1.2509616545523104e-05, "loss": 1.9291, "step": 125900 }, { "epoch": 4.777070063694268, "grad_norm": 11.330126762390137, "learning_rate": 1.249680366355678e-05, "loss": 1.934, "step": 126000 }, { "epoch": 4.780861389141644, "grad_norm": 11.336546897888184, "learning_rate": 1.2483986408573231e-05, "loss": 1.915, "step": 126100 }, { "epoch": 4.78465271458902, "grad_norm": 12.119205474853516, "learning_rate": 1.2471164803021192e-05, "loss": 1.9062, "step": 126200 }, { "epoch": 4.788444040036397, "grad_norm": 11.13424301147461, "learning_rate": 1.2458338869357017e-05, "loss": 1.9455, "step": 126300 }, { "epoch": 4.792235365483773, "grad_norm": 11.497756958007812, "learning_rate": 1.2445508630044631e-05, "loss": 1.8838, "step": 126400 }, { "epoch": 4.79602669093115, "grad_norm": 11.98989486694336, "learning_rate": 1.2432674107555518e-05, "loss": 1.9213, "step": 126500 }, { "epoch": 4.799818016378526, "grad_norm": 10.803661346435547, "learning_rate": 1.2419835324368648e-05, "loss": 1.8973, "step": 126600 }, { "epoch": 4.8036093418259025, "grad_norm": 10.611248970031738, "learning_rate": 1.2406992302970456e-05, "loss": 1.9025, "step": 126700 }, { "epoch": 4.807400667273279, "grad_norm": 11.543232917785645, "learning_rate": 1.2394145065854808e-05, "loss": 1.9381, "step": 126800 }, { "epoch": 4.811191992720655, "grad_norm": 11.23303508758545, "learning_rate": 1.2381293635522942e-05, "loss": 1.9221, "step": 126900 }, { "epoch": 4.814983318168031, "grad_norm": 10.824135780334473, "learning_rate": 1.2368438034483452e-05, "loss": 1.8926, "step": 127000 }, { "epoch": 4.8187746436154075, "grad_norm": 12.173221588134766, "learning_rate": 1.235557828525223e-05, "loss": 1.9128, "step": 127100 }, { "epoch": 4.822565969062785, "grad_norm": 11.311306953430176, "learning_rate": 1.2342714410352434e-05, "loss": 1.879, "step": 127200 }, { "epoch": 4.826357294510161, "grad_norm": 12.006366729736328, "learning_rate": 1.2329846432314447e-05, "loss": 1.8813, "step": 127300 }, { "epoch": 4.830148619957537, "grad_norm": 11.701835632324219, "learning_rate": 1.2316974373675843e-05, "loss": 1.883, "step": 127400 }, { "epoch": 4.8339399454049135, "grad_norm": 12.140498161315918, "learning_rate": 1.2304098256981339e-05, "loss": 1.9127, "step": 127500 }, { "epoch": 4.83773127085229, "grad_norm": 11.829805374145508, "learning_rate": 1.2291218104782756e-05, "loss": 1.8986, "step": 127600 }, { "epoch": 4.841522596299667, "grad_norm": 12.151151657104492, "learning_rate": 1.2278333939638993e-05, "loss": 1.9377, "step": 127700 }, { "epoch": 4.845313921747043, "grad_norm": 11.584785461425781, "learning_rate": 1.226544578411597e-05, "loss": 1.93, "step": 127800 }, { "epoch": 4.849105247194419, "grad_norm": 12.953131675720215, "learning_rate": 1.2252553660786594e-05, "loss": 1.9169, "step": 127900 }, { "epoch": 4.852896572641796, "grad_norm": 10.287446975708008, "learning_rate": 1.2239657592230732e-05, "loss": 1.939, "step": 128000 }, { "epoch": 4.856687898089172, "grad_norm": 10.798286437988281, "learning_rate": 1.222675760103515e-05, "loss": 1.9228, "step": 128100 }, { "epoch": 4.860479223536548, "grad_norm": 11.859781265258789, "learning_rate": 1.2213853709793486e-05, "loss": 1.9108, "step": 128200 }, { "epoch": 4.864270548983924, "grad_norm": 11.37838363647461, "learning_rate": 1.2200945941106217e-05, "loss": 1.9222, "step": 128300 }, { "epoch": 4.868061874431302, "grad_norm": 12.677313804626465, "learning_rate": 1.2188034317580598e-05, "loss": 1.9311, "step": 128400 }, { "epoch": 4.871853199878678, "grad_norm": 12.022448539733887, "learning_rate": 1.2175118861830646e-05, "loss": 1.9175, "step": 128500 }, { "epoch": 4.875644525326054, "grad_norm": 11.225902557373047, "learning_rate": 1.216219959647709e-05, "loss": 1.8932, "step": 128600 }, { "epoch": 4.87943585077343, "grad_norm": 11.791585922241211, "learning_rate": 1.214927654414732e-05, "loss": 1.907, "step": 128700 }, { "epoch": 4.883227176220807, "grad_norm": 10.470597267150879, "learning_rate": 1.2136349727475373e-05, "loss": 1.9004, "step": 128800 }, { "epoch": 4.887018501668183, "grad_norm": 11.478292465209961, "learning_rate": 1.2123419169101871e-05, "loss": 1.8945, "step": 128900 }, { "epoch": 4.890809827115559, "grad_norm": 10.571637153625488, "learning_rate": 1.211048489167399e-05, "loss": 1.9163, "step": 129000 }, { "epoch": 4.894601152562936, "grad_norm": 13.062298774719238, "learning_rate": 1.2097546917845422e-05, "loss": 1.911, "step": 129100 }, { "epoch": 4.8983924780103125, "grad_norm": 13.227325439453125, "learning_rate": 1.2084605270276331e-05, "loss": 1.9229, "step": 129200 }, { "epoch": 4.902183803457689, "grad_norm": 11.808479309082031, "learning_rate": 1.2071659971633318e-05, "loss": 1.9195, "step": 129300 }, { "epoch": 4.905975128905065, "grad_norm": 12.768503189086914, "learning_rate": 1.205871104458937e-05, "loss": 1.9119, "step": 129400 }, { "epoch": 4.909766454352441, "grad_norm": 11.603421211242676, "learning_rate": 1.2045758511823847e-05, "loss": 1.9364, "step": 129500 }, { "epoch": 4.9135577797998184, "grad_norm": 11.669112205505371, "learning_rate": 1.2032802396022402e-05, "loss": 1.8876, "step": 129600 }, { "epoch": 4.917349105247195, "grad_norm": 10.420406341552734, "learning_rate": 1.2019842719876979e-05, "loss": 1.9327, "step": 129700 }, { "epoch": 4.921140430694571, "grad_norm": 11.812887191772461, "learning_rate": 1.2006879506085756e-05, "loss": 1.927, "step": 129800 }, { "epoch": 4.924931756141947, "grad_norm": 10.948456764221191, "learning_rate": 1.1993912777353098e-05, "loss": 1.898, "step": 129900 }, { "epoch": 4.9287230815893235, "grad_norm": 11.777837753295898, "learning_rate": 1.1980942556389542e-05, "loss": 1.9197, "step": 130000 }, { "epoch": 4.9325144070367, "grad_norm": 10.601753234863281, "learning_rate": 1.196796886591172e-05, "loss": 1.906, "step": 130100 }, { "epoch": 4.936305732484076, "grad_norm": 11.480425834655762, "learning_rate": 1.1954991728642358e-05, "loss": 1.9066, "step": 130200 }, { "epoch": 4.940097057931453, "grad_norm": 10.86575984954834, "learning_rate": 1.1942011167310217e-05, "loss": 1.9233, "step": 130300 }, { "epoch": 4.943888383378829, "grad_norm": 11.31550121307373, "learning_rate": 1.1929027204650046e-05, "loss": 1.9018, "step": 130400 }, { "epoch": 4.947679708826206, "grad_norm": 11.072230339050293, "learning_rate": 1.191603986340256e-05, "loss": 1.9246, "step": 130500 }, { "epoch": 4.951471034273582, "grad_norm": 11.473894119262695, "learning_rate": 1.1903049166314387e-05, "loss": 1.9194, "step": 130600 }, { "epoch": 4.955262359720958, "grad_norm": 10.987375259399414, "learning_rate": 1.1890055136138032e-05, "loss": 1.9342, "step": 130700 }, { "epoch": 4.959053685168335, "grad_norm": 11.523063659667969, "learning_rate": 1.1877057795631843e-05, "loss": 1.9137, "step": 130800 }, { "epoch": 4.962845010615712, "grad_norm": 12.07620620727539, "learning_rate": 1.1864057167559959e-05, "loss": 1.8968, "step": 130900 }, { "epoch": 4.966636336063088, "grad_norm": 11.937640190124512, "learning_rate": 1.1851053274692284e-05, "loss": 1.9313, "step": 131000 }, { "epoch": 4.970427661510464, "grad_norm": 10.577836990356445, "learning_rate": 1.1838046139804433e-05, "loss": 1.9318, "step": 131100 }, { "epoch": 4.97421898695784, "grad_norm": 12.628889083862305, "learning_rate": 1.1825035785677701e-05, "loss": 1.8826, "step": 131200 }, { "epoch": 4.978010312405217, "grad_norm": 11.190135955810547, "learning_rate": 1.181202223509903e-05, "loss": 1.9338, "step": 131300 }, { "epoch": 4.981801637852593, "grad_norm": 11.075469017028809, "learning_rate": 1.1799005510860948e-05, "loss": 1.9025, "step": 131400 }, { "epoch": 4.98559296329997, "grad_norm": 11.290766716003418, "learning_rate": 1.1785985635761547e-05, "loss": 1.9118, "step": 131500 }, { "epoch": 4.989384288747346, "grad_norm": 11.823698997497559, "learning_rate": 1.1772962632604439e-05, "loss": 1.906, "step": 131600 }, { "epoch": 4.9931756141947226, "grad_norm": 10.980758666992188, "learning_rate": 1.1759936524198712e-05, "loss": 1.9129, "step": 131700 }, { "epoch": 4.996966939642099, "grad_norm": 11.824015617370605, "learning_rate": 1.1746907333358895e-05, "loss": 1.9042, "step": 131800 }, { "epoch": 5.0, "eval_accuracy": 0.5143645204401771, "eval_loss": 2.180030345916748, "eval_runtime": 929.277, "eval_samples_per_second": 908.259, "eval_steps_per_second": 7.096, "step": 131880 }, { "epoch": 5.000758265089475, "grad_norm": 11.009821891784668, "learning_rate": 1.1733875082904911e-05, "loss": 1.8948, "step": 131900 }, { "epoch": 5.004549590536851, "grad_norm": 11.478715896606445, "learning_rate": 1.172083979566205e-05, "loss": 1.7078, "step": 132000 }, { "epoch": 5.0083409159842285, "grad_norm": 10.75846004486084, "learning_rate": 1.1707801494460913e-05, "loss": 1.6883, "step": 132100 }, { "epoch": 5.012132241431605, "grad_norm": 9.625381469726562, "learning_rate": 1.1694760202137385e-05, "loss": 1.6932, "step": 132200 }, { "epoch": 5.015923566878981, "grad_norm": 9.67786693572998, "learning_rate": 1.1681715941532583e-05, "loss": 1.6882, "step": 132300 }, { "epoch": 5.019714892326357, "grad_norm": 11.051650047302246, "learning_rate": 1.1668668735492831e-05, "loss": 1.7151, "step": 132400 }, { "epoch": 5.0235062177737335, "grad_norm": 10.82264518737793, "learning_rate": 1.1655618606869611e-05, "loss": 1.7174, "step": 132500 }, { "epoch": 5.02729754322111, "grad_norm": 10.503660202026367, "learning_rate": 1.1642565578519515e-05, "loss": 1.7093, "step": 132600 }, { "epoch": 5.031088868668487, "grad_norm": 11.620842933654785, "learning_rate": 1.1629509673304222e-05, "loss": 1.6757, "step": 132700 }, { "epoch": 5.034880194115863, "grad_norm": 10.410591125488281, "learning_rate": 1.1616450914090446e-05, "loss": 1.7182, "step": 132800 }, { "epoch": 5.038671519563239, "grad_norm": 10.560738563537598, "learning_rate": 1.1603389323749902e-05, "loss": 1.713, "step": 132900 }, { "epoch": 5.042462845010616, "grad_norm": 12.081815719604492, "learning_rate": 1.1590324925159262e-05, "loss": 1.7056, "step": 133000 }, { "epoch": 5.046254170457992, "grad_norm": 11.551833152770996, "learning_rate": 1.157725774120012e-05, "loss": 1.7216, "step": 133100 }, { "epoch": 5.050045495905368, "grad_norm": 11.211849212646484, "learning_rate": 1.156418779475894e-05, "loss": 1.7468, "step": 133200 }, { "epoch": 5.0538368213527445, "grad_norm": 11.250187873840332, "learning_rate": 1.1551115108727036e-05, "loss": 1.7289, "step": 133300 }, { "epoch": 5.057628146800122, "grad_norm": 10.917745590209961, "learning_rate": 1.1538039706000505e-05, "loss": 1.6951, "step": 133400 }, { "epoch": 5.061419472247498, "grad_norm": 12.086384773254395, "learning_rate": 1.1524961609480218e-05, "loss": 1.717, "step": 133500 }, { "epoch": 5.065210797694874, "grad_norm": 10.743468284606934, "learning_rate": 1.151188084207176e-05, "loss": 1.7235, "step": 133600 }, { "epoch": 5.06900212314225, "grad_norm": 12.799276351928711, "learning_rate": 1.1498797426685377e-05, "loss": 1.7356, "step": 133700 }, { "epoch": 5.072793448589627, "grad_norm": 12.468548774719238, "learning_rate": 1.1485711386235985e-05, "loss": 1.7437, "step": 133800 }, { "epoch": 5.076584774037003, "grad_norm": 12.131823539733887, "learning_rate": 1.1472622743643067e-05, "loss": 1.7162, "step": 133900 }, { "epoch": 5.08037609948438, "grad_norm": 12.57866096496582, "learning_rate": 1.1459531521830677e-05, "loss": 1.7464, "step": 134000 }, { "epoch": 5.084167424931756, "grad_norm": 11.059057235717773, "learning_rate": 1.1446437743727395e-05, "loss": 1.72, "step": 134100 }, { "epoch": 5.087958750379133, "grad_norm": 11.010120391845703, "learning_rate": 1.1433341432266254e-05, "loss": 1.7243, "step": 134200 }, { "epoch": 5.091750075826509, "grad_norm": 10.828229904174805, "learning_rate": 1.1420242610384753e-05, "loss": 1.7417, "step": 134300 }, { "epoch": 5.095541401273885, "grad_norm": 11.291287422180176, "learning_rate": 1.1407141301024762e-05, "loss": 1.7221, "step": 134400 }, { "epoch": 5.099332726721261, "grad_norm": 11.891501426696777, "learning_rate": 1.1394037527132524e-05, "loss": 1.7263, "step": 134500 }, { "epoch": 5.1031240521686385, "grad_norm": 10.322664260864258, "learning_rate": 1.1380931311658596e-05, "loss": 1.7249, "step": 134600 }, { "epoch": 5.106915377616015, "grad_norm": 10.805962562561035, "learning_rate": 1.1367822677557804e-05, "loss": 1.7435, "step": 134700 }, { "epoch": 5.110706703063391, "grad_norm": 10.88699722290039, "learning_rate": 1.135471164778922e-05, "loss": 1.7494, "step": 134800 }, { "epoch": 5.114498028510767, "grad_norm": 10.290661811828613, "learning_rate": 1.1341598245316101e-05, "loss": 1.7069, "step": 134900 }, { "epoch": 5.1182893539581436, "grad_norm": 10.941231727600098, "learning_rate": 1.132848249310587e-05, "loss": 1.7405, "step": 135000 }, { "epoch": 5.12208067940552, "grad_norm": 12.437928199768066, "learning_rate": 1.1315364414130061e-05, "loss": 1.6995, "step": 135100 }, { "epoch": 5.125872004852897, "grad_norm": 12.19390869140625, "learning_rate": 1.1302244031364282e-05, "loss": 1.7561, "step": 135200 }, { "epoch": 5.129663330300273, "grad_norm": 11.167451858520508, "learning_rate": 1.1289121367788176e-05, "loss": 1.7152, "step": 135300 }, { "epoch": 5.1334546557476495, "grad_norm": 12.008397102355957, "learning_rate": 1.1275996446385387e-05, "loss": 1.7386, "step": 135400 }, { "epoch": 5.137245981195026, "grad_norm": 10.736686706542969, "learning_rate": 1.1262869290143501e-05, "loss": 1.7546, "step": 135500 }, { "epoch": 5.141037306642402, "grad_norm": 10.752262115478516, "learning_rate": 1.1249739922054029e-05, "loss": 1.7133, "step": 135600 }, { "epoch": 5.144828632089778, "grad_norm": 11.149368286132812, "learning_rate": 1.1236608365112353e-05, "loss": 1.7393, "step": 135700 }, { "epoch": 5.148619957537155, "grad_norm": 12.972151756286621, "learning_rate": 1.1223474642317689e-05, "loss": 1.7122, "step": 135800 }, { "epoch": 5.152411282984532, "grad_norm": 10.753881454467773, "learning_rate": 1.1210338776673045e-05, "loss": 1.7225, "step": 135900 }, { "epoch": 5.156202608431908, "grad_norm": 10.623926162719727, "learning_rate": 1.1197200791185178e-05, "loss": 1.698, "step": 136000 }, { "epoch": 5.159993933879284, "grad_norm": 11.493051528930664, "learning_rate": 1.1184060708864573e-05, "loss": 1.7062, "step": 136100 }, { "epoch": 5.16378525932666, "grad_norm": 12.206787109375, "learning_rate": 1.1170918552725362e-05, "loss": 1.6961, "step": 136200 }, { "epoch": 5.167576584774037, "grad_norm": 10.889944076538086, "learning_rate": 1.115777434578534e-05, "loss": 1.6967, "step": 136300 }, { "epoch": 5.171367910221413, "grad_norm": 12.191668510437012, "learning_rate": 1.1144628111065867e-05, "loss": 1.7321, "step": 136400 }, { "epoch": 5.17515923566879, "grad_norm": 12.997697830200195, "learning_rate": 1.1131479871591869e-05, "loss": 1.7342, "step": 136500 }, { "epoch": 5.178950561116166, "grad_norm": 11.49769401550293, "learning_rate": 1.1118329650391781e-05, "loss": 1.7226, "step": 136600 }, { "epoch": 5.182741886563543, "grad_norm": 11.810810089111328, "learning_rate": 1.1105177470497502e-05, "loss": 1.7454, "step": 136700 }, { "epoch": 5.186533212010919, "grad_norm": 11.316378593444824, "learning_rate": 1.1092023354944375e-05, "loss": 1.7066, "step": 136800 }, { "epoch": 5.190324537458295, "grad_norm": 10.928295135498047, "learning_rate": 1.1078867326771121e-05, "loss": 1.7164, "step": 136900 }, { "epoch": 5.194115862905671, "grad_norm": 11.134885787963867, "learning_rate": 1.1065709409019816e-05, "loss": 1.7381, "step": 137000 }, { "epoch": 5.1979071883530485, "grad_norm": 10.09237289428711, "learning_rate": 1.1052549624735842e-05, "loss": 1.7022, "step": 137100 }, { "epoch": 5.201698513800425, "grad_norm": 12.09769058227539, "learning_rate": 1.1039387996967858e-05, "loss": 1.7465, "step": 137200 }, { "epoch": 5.205489839247801, "grad_norm": 12.016862869262695, "learning_rate": 1.1026224548767746e-05, "loss": 1.7221, "step": 137300 }, { "epoch": 5.209281164695177, "grad_norm": 11.410117149353027, "learning_rate": 1.1013059303190572e-05, "loss": 1.7255, "step": 137400 }, { "epoch": 5.213072490142554, "grad_norm": 11.960737228393555, "learning_rate": 1.0999892283294558e-05, "loss": 1.7219, "step": 137500 }, { "epoch": 5.21686381558993, "grad_norm": 10.090522766113281, "learning_rate": 1.0986723512141028e-05, "loss": 1.7077, "step": 137600 }, { "epoch": 5.220655141037307, "grad_norm": 12.375018119812012, "learning_rate": 1.0973553012794383e-05, "loss": 1.7452, "step": 137700 }, { "epoch": 5.224446466484683, "grad_norm": 11.757769584655762, "learning_rate": 1.0960380808322035e-05, "loss": 1.7492, "step": 137800 }, { "epoch": 5.2282377919320595, "grad_norm": 12.6821870803833, "learning_rate": 1.09472069217944e-05, "loss": 1.7242, "step": 137900 }, { "epoch": 5.232029117379436, "grad_norm": 11.043816566467285, "learning_rate": 1.0934031376284824e-05, "loss": 1.7144, "step": 138000 }, { "epoch": 5.235820442826812, "grad_norm": 11.677092552185059, "learning_rate": 1.0920854194869569e-05, "loss": 1.7067, "step": 138100 }, { "epoch": 5.239611768274188, "grad_norm": 12.579203605651855, "learning_rate": 1.0907675400627755e-05, "loss": 1.7221, "step": 138200 }, { "epoch": 5.243403093721565, "grad_norm": 12.220281600952148, "learning_rate": 1.0894495016641336e-05, "loss": 1.731, "step": 138300 }, { "epoch": 5.247194419168942, "grad_norm": 10.821172714233398, "learning_rate": 1.0881313065995043e-05, "loss": 1.7447, "step": 138400 }, { "epoch": 5.250985744616318, "grad_norm": 13.684066772460938, "learning_rate": 1.0868129571776351e-05, "loss": 1.718, "step": 138500 }, { "epoch": 5.254777070063694, "grad_norm": 11.24811840057373, "learning_rate": 1.0854944557075447e-05, "loss": 1.7079, "step": 138600 }, { "epoch": 5.2585683955110705, "grad_norm": 12.326638221740723, "learning_rate": 1.0841758044985164e-05, "loss": 1.7094, "step": 138700 }, { "epoch": 5.262359720958447, "grad_norm": 10.52296257019043, "learning_rate": 1.0828570058600978e-05, "loss": 1.742, "step": 138800 }, { "epoch": 5.266151046405824, "grad_norm": 11.652645111083984, "learning_rate": 1.0815380621020935e-05, "loss": 1.7313, "step": 138900 }, { "epoch": 5.2699423718532, "grad_norm": 11.875689506530762, "learning_rate": 1.0802189755345618e-05, "loss": 1.7583, "step": 139000 }, { "epoch": 5.273733697300576, "grad_norm": 11.106082916259766, "learning_rate": 1.0788997484678127e-05, "loss": 1.7306, "step": 139100 }, { "epoch": 5.277525022747953, "grad_norm": 12.288179397583008, "learning_rate": 1.0775803832124008e-05, "loss": 1.7116, "step": 139200 }, { "epoch": 5.281316348195329, "grad_norm": 12.356056213378906, "learning_rate": 1.0762608820791241e-05, "loss": 1.721, "step": 139300 }, { "epoch": 5.285107673642705, "grad_norm": 11.474201202392578, "learning_rate": 1.0749412473790168e-05, "loss": 1.7277, "step": 139400 }, { "epoch": 5.288898999090081, "grad_norm": 11.470195770263672, "learning_rate": 1.0736214814233488e-05, "loss": 1.7679, "step": 139500 }, { "epoch": 5.292690324537459, "grad_norm": 11.034538269042969, "learning_rate": 1.0723015865236188e-05, "loss": 1.7359, "step": 139600 }, { "epoch": 5.296481649984835, "grad_norm": 11.579090118408203, "learning_rate": 1.0709815649915515e-05, "loss": 1.7178, "step": 139700 }, { "epoch": 5.300272975432211, "grad_norm": 11.800409317016602, "learning_rate": 1.0696614191390944e-05, "loss": 1.7629, "step": 139800 }, { "epoch": 5.304064300879587, "grad_norm": 12.960884094238281, "learning_rate": 1.068341151278411e-05, "loss": 1.7183, "step": 139900 }, { "epoch": 5.307855626326964, "grad_norm": 11.995640754699707, "learning_rate": 1.0670207637218794e-05, "loss": 1.7311, "step": 140000 }, { "epoch": 5.311646951774341, "grad_norm": 10.874401092529297, "learning_rate": 1.0657002587820879e-05, "loss": 1.7225, "step": 140100 }, { "epoch": 5.315438277221717, "grad_norm": 11.993330001831055, "learning_rate": 1.0643796387718297e-05, "loss": 1.7191, "step": 140200 }, { "epoch": 5.319229602669093, "grad_norm": 11.458551406860352, "learning_rate": 1.0630589060040995e-05, "loss": 1.7348, "step": 140300 }, { "epoch": 5.3230209281164695, "grad_norm": 11.8816499710083, "learning_rate": 1.0617380627920897e-05, "loss": 1.7137, "step": 140400 }, { "epoch": 5.326812253563846, "grad_norm": 11.976946830749512, "learning_rate": 1.060417111449186e-05, "loss": 1.7316, "step": 140500 }, { "epoch": 5.330603579011222, "grad_norm": 10.636822700500488, "learning_rate": 1.0590960542889635e-05, "loss": 1.7151, "step": 140600 }, { "epoch": 5.334394904458598, "grad_norm": 10.775686264038086, "learning_rate": 1.0577748936251829e-05, "loss": 1.7411, "step": 140700 }, { "epoch": 5.3381862299059755, "grad_norm": 9.877192497253418, "learning_rate": 1.0564536317717858e-05, "loss": 1.757, "step": 140800 }, { "epoch": 5.341977555353352, "grad_norm": 11.85936164855957, "learning_rate": 1.0551322710428917e-05, "loss": 1.7597, "step": 140900 }, { "epoch": 5.345768880800728, "grad_norm": 10.8248872756958, "learning_rate": 1.053810813752792e-05, "loss": 1.7442, "step": 141000 }, { "epoch": 5.349560206248104, "grad_norm": 10.658221244812012, "learning_rate": 1.0524892622159489e-05, "loss": 1.7253, "step": 141100 }, { "epoch": 5.3533515316954805, "grad_norm": 11.263945579528809, "learning_rate": 1.0511676187469881e-05, "loss": 1.7394, "step": 141200 }, { "epoch": 5.357142857142857, "grad_norm": 10.778286933898926, "learning_rate": 1.0498458856606972e-05, "loss": 1.7485, "step": 141300 }, { "epoch": 5.360934182590234, "grad_norm": 12.57288646697998, "learning_rate": 1.048524065272021e-05, "loss": 1.7268, "step": 141400 }, { "epoch": 5.36472550803761, "grad_norm": 11.176697731018066, "learning_rate": 1.0472021598960558e-05, "loss": 1.7543, "step": 141500 }, { "epoch": 5.368516833484986, "grad_norm": 12.716745376586914, "learning_rate": 1.0458801718480486e-05, "loss": 1.7603, "step": 141600 }, { "epoch": 5.372308158932363, "grad_norm": 11.353650093078613, "learning_rate": 1.0445581034433899e-05, "loss": 1.7385, "step": 141700 }, { "epoch": 5.376099484379739, "grad_norm": 12.236551284790039, "learning_rate": 1.0432359569976114e-05, "loss": 1.7694, "step": 141800 }, { "epoch": 5.379890809827115, "grad_norm": 11.589540481567383, "learning_rate": 1.0419137348263815e-05, "loss": 1.7535, "step": 141900 }, { "epoch": 5.383682135274492, "grad_norm": 10.82952880859375, "learning_rate": 1.0405914392455013e-05, "loss": 1.772, "step": 142000 }, { "epoch": 5.387473460721869, "grad_norm": 11.829825401306152, "learning_rate": 1.0392690725709e-05, "loss": 1.7509, "step": 142100 }, { "epoch": 5.391264786169245, "grad_norm": 11.04835319519043, "learning_rate": 1.0379466371186315e-05, "loss": 1.7441, "step": 142200 }, { "epoch": 5.395056111616621, "grad_norm": 13.134116172790527, "learning_rate": 1.0366241352048712e-05, "loss": 1.7344, "step": 142300 }, { "epoch": 5.398847437063997, "grad_norm": 10.773009300231934, "learning_rate": 1.035301569145909e-05, "loss": 1.7113, "step": 142400 }, { "epoch": 5.402638762511374, "grad_norm": 11.159862518310547, "learning_rate": 1.0339789412581487e-05, "loss": 1.7437, "step": 142500 }, { "epoch": 5.40643008795875, "grad_norm": 11.78554630279541, "learning_rate": 1.0326562538581017e-05, "loss": 1.72, "step": 142600 }, { "epoch": 5.410221413406127, "grad_norm": 11.379344940185547, "learning_rate": 1.031333509262384e-05, "loss": 1.7482, "step": 142700 }, { "epoch": 5.414012738853503, "grad_norm": 11.918542861938477, "learning_rate": 1.0300107097877114e-05, "loss": 1.7237, "step": 142800 }, { "epoch": 5.41780406430088, "grad_norm": 10.664207458496094, "learning_rate": 1.028687857750896e-05, "loss": 1.7581, "step": 142900 }, { "epoch": 5.421595389748256, "grad_norm": 11.843918800354004, "learning_rate": 1.0273649554688418e-05, "loss": 1.7399, "step": 143000 }, { "epoch": 5.425386715195632, "grad_norm": 10.620604515075684, "learning_rate": 1.0260420052585412e-05, "loss": 1.7213, "step": 143100 }, { "epoch": 5.429178040643009, "grad_norm": 11.557217597961426, "learning_rate": 1.0247190094370699e-05, "loss": 1.7088, "step": 143200 }, { "epoch": 5.4329693660903855, "grad_norm": 12.7789945602417, "learning_rate": 1.0233959703215842e-05, "loss": 1.731, "step": 143300 }, { "epoch": 5.436760691537762, "grad_norm": 13.252955436706543, "learning_rate": 1.022072890229316e-05, "loss": 1.7091, "step": 143400 }, { "epoch": 5.440552016985138, "grad_norm": 12.209487915039062, "learning_rate": 1.0207497714775685e-05, "loss": 1.7143, "step": 143500 }, { "epoch": 5.444343342432514, "grad_norm": 15.49000358581543, "learning_rate": 1.0194266163837131e-05, "loss": 1.7484, "step": 143600 }, { "epoch": 5.4481346678798905, "grad_norm": 12.577410697937012, "learning_rate": 1.0181034272651848e-05, "loss": 1.7194, "step": 143700 }, { "epoch": 5.451925993327267, "grad_norm": 11.720094680786133, "learning_rate": 1.0167802064394784e-05, "loss": 1.6871, "step": 143800 }, { "epoch": 5.455717318774644, "grad_norm": 12.822196006774902, "learning_rate": 1.0154569562241434e-05, "loss": 1.724, "step": 143900 }, { "epoch": 5.45950864422202, "grad_norm": 11.771242141723633, "learning_rate": 1.0141336789367812e-05, "loss": 1.7053, "step": 144000 }, { "epoch": 5.4632999696693965, "grad_norm": 11.13105297088623, "learning_rate": 1.0128103768950416e-05, "loss": 1.7378, "step": 144100 }, { "epoch": 5.467091295116773, "grad_norm": 12.045822143554688, "learning_rate": 1.011487052416616e-05, "loss": 1.7291, "step": 144200 }, { "epoch": 5.470882620564149, "grad_norm": 12.303630828857422, "learning_rate": 1.0101637078192367e-05, "loss": 1.7595, "step": 144300 }, { "epoch": 5.474673946011525, "grad_norm": 12.733563423156738, "learning_rate": 1.0088403454206703e-05, "loss": 1.7331, "step": 144400 }, { "epoch": 5.478465271458902, "grad_norm": 11.700331687927246, "learning_rate": 1.0075169675387146e-05, "loss": 1.7207, "step": 144500 }, { "epoch": 5.482256596906279, "grad_norm": 11.098690032958984, "learning_rate": 1.0061935764911953e-05, "loss": 1.715, "step": 144600 }, { "epoch": 5.486047922353655, "grad_norm": 11.21896743774414, "learning_rate": 1.0048701745959593e-05, "loss": 1.7139, "step": 144700 }, { "epoch": 5.489839247801031, "grad_norm": 11.806435585021973, "learning_rate": 1.0035467641708755e-05, "loss": 1.725, "step": 144800 }, { "epoch": 5.493630573248407, "grad_norm": 10.83428955078125, "learning_rate": 1.0022233475338245e-05, "loss": 1.7341, "step": 144900 }, { "epoch": 5.497421898695784, "grad_norm": 11.496482849121094, "learning_rate": 1.0008999270027003e-05, "loss": 1.7174, "step": 145000 }, { "epoch": 5.501213224143161, "grad_norm": 12.381556510925293, "learning_rate": 9.995765048954021e-06, "loss": 1.6973, "step": 145100 }, { "epoch": 5.505004549590537, "grad_norm": 11.458586692810059, "learning_rate": 9.98253083529833e-06, "loss": 1.7409, "step": 145200 }, { "epoch": 5.508795875037913, "grad_norm": 12.05925464630127, "learning_rate": 9.969296652238943e-06, "loss": 1.7007, "step": 145300 }, { "epoch": 5.51258720048529, "grad_norm": 12.180753707885742, "learning_rate": 9.956062522954815e-06, "loss": 1.7315, "step": 145400 }, { "epoch": 5.516378525932666, "grad_norm": 12.770673751831055, "learning_rate": 9.942828470624811e-06, "loss": 1.719, "step": 145500 }, { "epoch": 5.520169851380042, "grad_norm": 11.70596981048584, "learning_rate": 9.929594518427663e-06, "loss": 1.6927, "step": 145600 }, { "epoch": 5.523961176827418, "grad_norm": 12.685752868652344, "learning_rate": 9.916360689541932e-06, "loss": 1.7373, "step": 145700 }, { "epoch": 5.5277525022747955, "grad_norm": 10.262012481689453, "learning_rate": 9.903127007145945e-06, "loss": 1.7242, "step": 145800 }, { "epoch": 5.531543827722172, "grad_norm": 12.036771774291992, "learning_rate": 9.889893494417793e-06, "loss": 1.7417, "step": 145900 }, { "epoch": 5.535335153169548, "grad_norm": 12.625959396362305, "learning_rate": 9.876660174535261e-06, "loss": 1.7385, "step": 146000 }, { "epoch": 5.539126478616924, "grad_norm": 12.047428131103516, "learning_rate": 9.863427070675792e-06, "loss": 1.7245, "step": 146100 }, { "epoch": 5.542917804064301, "grad_norm": 12.193134307861328, "learning_rate": 9.850194206016458e-06, "loss": 1.7507, "step": 146200 }, { "epoch": 5.546709129511678, "grad_norm": 12.812241554260254, "learning_rate": 9.83696160373391e-06, "loss": 1.7394, "step": 146300 }, { "epoch": 5.550500454959054, "grad_norm": 12.259443283081055, "learning_rate": 9.823729287004336e-06, "loss": 1.7548, "step": 146400 }, { "epoch": 5.55429178040643, "grad_norm": 11.989737510681152, "learning_rate": 9.810497279003424e-06, "loss": 1.6988, "step": 146500 }, { "epoch": 5.5580831058538065, "grad_norm": 11.403079986572266, "learning_rate": 9.797265602906332e-06, "loss": 1.7186, "step": 146600 }, { "epoch": 5.561874431301183, "grad_norm": 12.56582260131836, "learning_rate": 9.784034281887624e-06, "loss": 1.6952, "step": 146700 }, { "epoch": 5.565665756748559, "grad_norm": 12.625117301940918, "learning_rate": 9.770803339121246e-06, "loss": 1.7517, "step": 146800 }, { "epoch": 5.569457082195935, "grad_norm": 10.6593017578125, "learning_rate": 9.757572797780481e-06, "loss": 1.7313, "step": 146900 }, { "epoch": 5.573248407643312, "grad_norm": 12.82185173034668, "learning_rate": 9.744342681037916e-06, "loss": 1.7066, "step": 147000 }, { "epoch": 5.577039733090689, "grad_norm": 11.832967758178711, "learning_rate": 9.731113012065381e-06, "loss": 1.7287, "step": 147100 }, { "epoch": 5.580831058538065, "grad_norm": 10.941871643066406, "learning_rate": 9.71788381403393e-06, "loss": 1.7126, "step": 147200 }, { "epoch": 5.584622383985441, "grad_norm": 11.337785720825195, "learning_rate": 9.7046551101138e-06, "loss": 1.7152, "step": 147300 }, { "epoch": 5.5884137094328175, "grad_norm": 13.193537712097168, "learning_rate": 9.691426923474338e-06, "loss": 1.7631, "step": 147400 }, { "epoch": 5.592205034880195, "grad_norm": 12.104741096496582, "learning_rate": 9.678199277284015e-06, "loss": 1.7283, "step": 147500 }, { "epoch": 5.595996360327571, "grad_norm": 11.748642921447754, "learning_rate": 9.664972194710337e-06, "loss": 1.7382, "step": 147600 }, { "epoch": 5.599787685774947, "grad_norm": 12.315329551696777, "learning_rate": 9.651745698919828e-06, "loss": 1.7367, "step": 147700 }, { "epoch": 5.603579011222323, "grad_norm": 11.372101783752441, "learning_rate": 9.63851981307798e-06, "loss": 1.714, "step": 147800 }, { "epoch": 5.6073703366697, "grad_norm": 12.25269889831543, "learning_rate": 9.62529456034922e-06, "loss": 1.7483, "step": 147900 }, { "epoch": 5.611161662117076, "grad_norm": 11.96284008026123, "learning_rate": 9.612069963896873e-06, "loss": 1.7394, "step": 148000 }, { "epoch": 5.614952987564452, "grad_norm": 11.167996406555176, "learning_rate": 9.598846046883099e-06, "loss": 1.7218, "step": 148100 }, { "epoch": 5.618744313011829, "grad_norm": 11.83029556274414, "learning_rate": 9.585622832468882e-06, "loss": 1.7387, "step": 148200 }, { "epoch": 5.622535638459206, "grad_norm": 12.02774429321289, "learning_rate": 9.572400343813972e-06, "loss": 1.7374, "step": 148300 }, { "epoch": 5.626326963906582, "grad_norm": 10.942395210266113, "learning_rate": 9.559178604076842e-06, "loss": 1.7497, "step": 148400 }, { "epoch": 5.630118289353958, "grad_norm": 11.498610496520996, "learning_rate": 9.54595763641466e-06, "loss": 1.7327, "step": 148500 }, { "epoch": 5.633909614801334, "grad_norm": 10.533560752868652, "learning_rate": 9.532737463983234e-06, "loss": 1.7445, "step": 148600 }, { "epoch": 5.637700940248711, "grad_norm": 11.137718200683594, "learning_rate": 9.519518109936998e-06, "loss": 1.7401, "step": 148700 }, { "epoch": 5.641492265696087, "grad_norm": 11.746006965637207, "learning_rate": 9.506299597428919e-06, "loss": 1.7249, "step": 148800 }, { "epoch": 5.645283591143464, "grad_norm": 12.00092601776123, "learning_rate": 9.493081949610526e-06, "loss": 1.7209, "step": 148900 }, { "epoch": 5.64907491659084, "grad_norm": 11.97046184539795, "learning_rate": 9.479865189631815e-06, "loss": 1.7417, "step": 149000 }, { "epoch": 5.6528662420382165, "grad_norm": 11.254858016967773, "learning_rate": 9.466649340641224e-06, "loss": 1.7214, "step": 149100 }, { "epoch": 5.656657567485593, "grad_norm": 13.568397521972656, "learning_rate": 9.453434425785605e-06, "loss": 1.7143, "step": 149200 }, { "epoch": 5.660448892932969, "grad_norm": 12.557656288146973, "learning_rate": 9.440220468210172e-06, "loss": 1.7332, "step": 149300 }, { "epoch": 5.664240218380346, "grad_norm": 13.336578369140625, "learning_rate": 9.427007491058454e-06, "loss": 1.7341, "step": 149400 }, { "epoch": 5.6680315438277225, "grad_norm": 12.94596004486084, "learning_rate": 9.413795517472274e-06, "loss": 1.7178, "step": 149500 }, { "epoch": 5.671822869275099, "grad_norm": 11.668149948120117, "learning_rate": 9.400584570591692e-06, "loss": 1.7434, "step": 149600 }, { "epoch": 5.675614194722475, "grad_norm": 11.29442310333252, "learning_rate": 9.387374673554973e-06, "loss": 1.7077, "step": 149700 }, { "epoch": 5.679405520169851, "grad_norm": 12.328103065490723, "learning_rate": 9.374165849498535e-06, "loss": 1.703, "step": 149800 }, { "epoch": 5.6831968456172275, "grad_norm": 10.545650482177734, "learning_rate": 9.360958121556925e-06, "loss": 1.6898, "step": 149900 }, { "epoch": 5.686988171064604, "grad_norm": 10.089820861816406, "learning_rate": 9.34775151286277e-06, "loss": 1.7395, "step": 150000 }, { "epoch": 5.690779496511981, "grad_norm": 10.74809455871582, "learning_rate": 9.334546046546732e-06, "loss": 1.7485, "step": 150100 }, { "epoch": 5.694570821959357, "grad_norm": 10.95058536529541, "learning_rate": 9.32134174573747e-06, "loss": 1.7736, "step": 150200 }, { "epoch": 5.698362147406733, "grad_norm": 11.350272178649902, "learning_rate": 9.308138633561617e-06, "loss": 1.7129, "step": 150300 }, { "epoch": 5.70215347285411, "grad_norm": 11.029206275939941, "learning_rate": 9.2949367331437e-06, "loss": 1.7036, "step": 150400 }, { "epoch": 5.705944798301486, "grad_norm": 12.58275318145752, "learning_rate": 9.281736067606148e-06, "loss": 1.7031, "step": 150500 }, { "epoch": 5.709736123748863, "grad_norm": 11.721226692199707, "learning_rate": 9.26853666006921e-06, "loss": 1.7245, "step": 150600 }, { "epoch": 5.713527449196239, "grad_norm": 11.414534568786621, "learning_rate": 9.255338533650943e-06, "loss": 1.7375, "step": 150700 }, { "epoch": 5.717318774643616, "grad_norm": 12.276820182800293, "learning_rate": 9.242141711467148e-06, "loss": 1.7127, "step": 150800 }, { "epoch": 5.721110100090992, "grad_norm": 10.436358451843262, "learning_rate": 9.228946216631353e-06, "loss": 1.7288, "step": 150900 }, { "epoch": 5.724901425538368, "grad_norm": 12.101046562194824, "learning_rate": 9.215752072254758e-06, "loss": 1.7163, "step": 151000 }, { "epoch": 5.728692750985744, "grad_norm": 11.574102401733398, "learning_rate": 9.202559301446195e-06, "loss": 1.7383, "step": 151100 }, { "epoch": 5.732484076433121, "grad_norm": 12.919204711914062, "learning_rate": 9.18936792731209e-06, "loss": 1.707, "step": 151200 }, { "epoch": 5.736275401880498, "grad_norm": 11.479150772094727, "learning_rate": 9.17617797295643e-06, "loss": 1.7321, "step": 151300 }, { "epoch": 5.740066727327874, "grad_norm": 12.437186241149902, "learning_rate": 9.162989461480706e-06, "loss": 1.7565, "step": 151400 }, { "epoch": 5.74385805277525, "grad_norm": 12.013269424438477, "learning_rate": 9.149802415983886e-06, "loss": 1.7243, "step": 151500 }, { "epoch": 5.747649378222627, "grad_norm": 12.407118797302246, "learning_rate": 9.136616859562369e-06, "loss": 1.7585, "step": 151600 }, { "epoch": 5.751440703670003, "grad_norm": 11.591772079467773, "learning_rate": 9.123432815309953e-06, "loss": 1.7619, "step": 151700 }, { "epoch": 5.755232029117379, "grad_norm": 10.900681495666504, "learning_rate": 9.110250306317778e-06, "loss": 1.7395, "step": 151800 }, { "epoch": 5.759023354564756, "grad_norm": 11.835617065429688, "learning_rate": 9.097069355674298e-06, "loss": 1.7132, "step": 151900 }, { "epoch": 5.7628146800121325, "grad_norm": 12.367192268371582, "learning_rate": 9.083889986465242e-06, "loss": 1.7152, "step": 152000 }, { "epoch": 5.766606005459509, "grad_norm": 12.47367000579834, "learning_rate": 9.070712221773562e-06, "loss": 1.7139, "step": 152100 }, { "epoch": 5.770397330906885, "grad_norm": 12.715473175048828, "learning_rate": 9.057536084679404e-06, "loss": 1.7213, "step": 152200 }, { "epoch": 5.774188656354261, "grad_norm": 11.480833053588867, "learning_rate": 9.044361598260068e-06, "loss": 1.73, "step": 152300 }, { "epoch": 5.7779799818016375, "grad_norm": 11.188892364501953, "learning_rate": 9.03118878558995e-06, "loss": 1.7509, "step": 152400 }, { "epoch": 5.781771307249015, "grad_norm": 12.6340913772583, "learning_rate": 9.018017669740526e-06, "loss": 1.7206, "step": 152500 }, { "epoch": 5.785562632696391, "grad_norm": 12.988966941833496, "learning_rate": 9.004848273780298e-06, "loss": 1.7451, "step": 152600 }, { "epoch": 5.789353958143767, "grad_norm": 12.041772842407227, "learning_rate": 8.991680620774756e-06, "loss": 1.7116, "step": 152700 }, { "epoch": 5.7931452835911434, "grad_norm": 10.997750282287598, "learning_rate": 8.97851473378633e-06, "loss": 1.7192, "step": 152800 }, { "epoch": 5.79693660903852, "grad_norm": 11.281858444213867, "learning_rate": 8.965350635874361e-06, "loss": 1.7116, "step": 152900 }, { "epoch": 5.800727934485896, "grad_norm": 11.791625022888184, "learning_rate": 8.952188350095071e-06, "loss": 1.7361, "step": 153000 }, { "epoch": 5.804519259933272, "grad_norm": 11.20655632019043, "learning_rate": 8.939027899501477e-06, "loss": 1.7052, "step": 153100 }, { "epoch": 5.808310585380649, "grad_norm": 11.727370262145996, "learning_rate": 8.92586930714341e-06, "loss": 1.7417, "step": 153200 }, { "epoch": 5.812101910828026, "grad_norm": 10.649822235107422, "learning_rate": 8.912712596067439e-06, "loss": 1.7161, "step": 153300 }, { "epoch": 5.815893236275402, "grad_norm": 11.541646003723145, "learning_rate": 8.899557789316828e-06, "loss": 1.7357, "step": 153400 }, { "epoch": 5.819684561722778, "grad_norm": 11.971867561340332, "learning_rate": 8.886404909931516e-06, "loss": 1.7199, "step": 153500 }, { "epoch": 5.823475887170154, "grad_norm": 10.35626220703125, "learning_rate": 8.873253980948062e-06, "loss": 1.7331, "step": 153600 }, { "epoch": 5.8272672126175316, "grad_norm": 11.167160987854004, "learning_rate": 8.860105025399618e-06, "loss": 1.7218, "step": 153700 }, { "epoch": 5.831058538064908, "grad_norm": 12.583477020263672, "learning_rate": 8.846958066315858e-06, "loss": 1.7438, "step": 153800 }, { "epoch": 5.834849863512284, "grad_norm": 11.173572540283203, "learning_rate": 8.833813126722984e-06, "loss": 1.7071, "step": 153900 }, { "epoch": 5.83864118895966, "grad_norm": 11.98514175415039, "learning_rate": 8.820670229643647e-06, "loss": 1.7032, "step": 154000 }, { "epoch": 5.842432514407037, "grad_norm": 11.578864097595215, "learning_rate": 8.807529398096923e-06, "loss": 1.7042, "step": 154100 }, { "epoch": 5.846223839854413, "grad_norm": 11.643933296203613, "learning_rate": 8.79439065509827e-06, "loss": 1.7242, "step": 154200 }, { "epoch": 5.850015165301789, "grad_norm": 10.408468246459961, "learning_rate": 8.781254023659495e-06, "loss": 1.7377, "step": 154300 }, { "epoch": 5.853806490749166, "grad_norm": 10.830367088317871, "learning_rate": 8.768119526788693e-06, "loss": 1.7261, "step": 154400 }, { "epoch": 5.8575978161965425, "grad_norm": 10.611695289611816, "learning_rate": 8.75498718749023e-06, "loss": 1.7072, "step": 154500 }, { "epoch": 5.861389141643919, "grad_norm": 12.249737739562988, "learning_rate": 8.741857028764703e-06, "loss": 1.7474, "step": 154600 }, { "epoch": 5.865180467091295, "grad_norm": 13.015705108642578, "learning_rate": 8.728729073608858e-06, "loss": 1.7373, "step": 154700 }, { "epoch": 5.868971792538671, "grad_norm": 13.534759521484375, "learning_rate": 8.715603345015617e-06, "loss": 1.7234, "step": 154800 }, { "epoch": 5.8727631179860476, "grad_norm": 12.472187995910645, "learning_rate": 8.70247986597398e-06, "loss": 1.7279, "step": 154900 }, { "epoch": 5.876554443433425, "grad_norm": 12.15674877166748, "learning_rate": 8.689358659469021e-06, "loss": 1.7092, "step": 155000 }, { "epoch": 5.880345768880801, "grad_norm": 12.67679214477539, "learning_rate": 8.676239748481821e-06, "loss": 1.7513, "step": 155100 }, { "epoch": 5.884137094328177, "grad_norm": 11.683769226074219, "learning_rate": 8.663123155989445e-06, "loss": 1.7405, "step": 155200 }, { "epoch": 5.8879284197755535, "grad_norm": 12.486827850341797, "learning_rate": 8.650008904964909e-06, "loss": 1.7474, "step": 155300 }, { "epoch": 5.89171974522293, "grad_norm": 12.198080062866211, "learning_rate": 8.636897018377104e-06, "loss": 1.7358, "step": 155400 }, { "epoch": 5.895511070670306, "grad_norm": 11.509108543395996, "learning_rate": 8.6237875191908e-06, "loss": 1.7371, "step": 155500 }, { "epoch": 5.899302396117683, "grad_norm": 13.466974258422852, "learning_rate": 8.610680430366584e-06, "loss": 1.7392, "step": 155600 }, { "epoch": 5.903093721565059, "grad_norm": 10.47100830078125, "learning_rate": 8.597575774860811e-06, "loss": 1.707, "step": 155700 }, { "epoch": 5.906885047012436, "grad_norm": 12.88302230834961, "learning_rate": 8.584473575625578e-06, "loss": 1.6918, "step": 155800 }, { "epoch": 5.910676372459812, "grad_norm": 12.256343841552734, "learning_rate": 8.571373855608683e-06, "loss": 1.7437, "step": 155900 }, { "epoch": 5.914467697907188, "grad_norm": 11.425036430358887, "learning_rate": 8.558276637753588e-06, "loss": 1.7376, "step": 156000 }, { "epoch": 5.918259023354564, "grad_norm": 12.564769744873047, "learning_rate": 8.545181944999354e-06, "loss": 1.7163, "step": 156100 }, { "epoch": 5.922050348801941, "grad_norm": 11.846734046936035, "learning_rate": 8.532089800280638e-06, "loss": 1.7254, "step": 156200 }, { "epoch": 5.925841674249318, "grad_norm": 11.989322662353516, "learning_rate": 8.519000226527624e-06, "loss": 1.7254, "step": 156300 }, { "epoch": 5.929632999696694, "grad_norm": 12.433265686035156, "learning_rate": 8.505913246665996e-06, "loss": 1.7139, "step": 156400 }, { "epoch": 5.93342432514407, "grad_norm": 11.543211936950684, "learning_rate": 8.492828883616894e-06, "loss": 1.7539, "step": 156500 }, { "epoch": 5.937215650591447, "grad_norm": 12.185396194458008, "learning_rate": 8.47974716029688e-06, "loss": 1.6988, "step": 156600 }, { "epoch": 5.941006976038823, "grad_norm": 12.510396003723145, "learning_rate": 8.46666809961788e-06, "loss": 1.6917, "step": 156700 }, { "epoch": 5.9447983014862, "grad_norm": 12.396333694458008, "learning_rate": 8.45359172448717e-06, "loss": 1.7082, "step": 156800 }, { "epoch": 5.948589626933576, "grad_norm": 12.276369094848633, "learning_rate": 8.44051805780732e-06, "loss": 1.7293, "step": 156900 }, { "epoch": 5.9523809523809526, "grad_norm": 11.859585762023926, "learning_rate": 8.427447122476148e-06, "loss": 1.7184, "step": 157000 }, { "epoch": 5.956172277828329, "grad_norm": 11.67721939086914, "learning_rate": 8.414378941386697e-06, "loss": 1.7299, "step": 157100 }, { "epoch": 5.959963603275705, "grad_norm": 12.177579879760742, "learning_rate": 8.401313537427181e-06, "loss": 1.7174, "step": 157200 }, { "epoch": 5.963754928723081, "grad_norm": 11.747023582458496, "learning_rate": 8.388250933480954e-06, "loss": 1.7392, "step": 157300 }, { "epoch": 5.967546254170458, "grad_norm": 11.993696212768555, "learning_rate": 8.375191152426462e-06, "loss": 1.6995, "step": 157400 }, { "epoch": 5.971337579617835, "grad_norm": 10.05133056640625, "learning_rate": 8.362134217137205e-06, "loss": 1.7339, "step": 157500 }, { "epoch": 5.975128905065211, "grad_norm": 12.713106155395508, "learning_rate": 8.349080150481711e-06, "loss": 1.7157, "step": 157600 }, { "epoch": 5.978920230512587, "grad_norm": 11.697371482849121, "learning_rate": 8.336028975323468e-06, "loss": 1.7081, "step": 157700 }, { "epoch": 5.9827115559599635, "grad_norm": 13.024626731872559, "learning_rate": 8.322980714520907e-06, "loss": 1.7125, "step": 157800 }, { "epoch": 5.98650288140734, "grad_norm": 11.704550743103027, "learning_rate": 8.309935390927358e-06, "loss": 1.7207, "step": 157900 }, { "epoch": 5.990294206854716, "grad_norm": 11.219043731689453, "learning_rate": 8.296893027391003e-06, "loss": 1.7183, "step": 158000 }, { "epoch": 5.994085532302093, "grad_norm": 12.878378868103027, "learning_rate": 8.283853646754836e-06, "loss": 1.7518, "step": 158100 }, { "epoch": 5.997876857749469, "grad_norm": 11.511736869812012, "learning_rate": 8.270817271856633e-06, "loss": 1.7293, "step": 158200 }, { "epoch": 6.0, "eval_accuracy": 0.5227161786868619, "eval_loss": 2.1371841430664062, "eval_runtime": 935.7562, "eval_samples_per_second": 901.97, "eval_steps_per_second": 7.047, "step": 158256 }, { "epoch": 6.001668183196846, "grad_norm": 11.979788780212402, "learning_rate": 8.257783925528906e-06, "loss": 1.6489, "step": 158300 }, { "epoch": 6.005459508644222, "grad_norm": 11.79994010925293, "learning_rate": 8.244753630598849e-06, "loss": 1.5763, "step": 158400 }, { "epoch": 6.009250834091598, "grad_norm": 10.888725280761719, "learning_rate": 8.231726409888334e-06, "loss": 1.5846, "step": 158500 }, { "epoch": 6.0130421595389745, "grad_norm": 12.399412155151367, "learning_rate": 8.218702286213832e-06, "loss": 1.5208, "step": 158600 }, { "epoch": 6.016833484986352, "grad_norm": 11.128008842468262, "learning_rate": 8.205681282386393e-06, "loss": 1.5881, "step": 158700 }, { "epoch": 6.020624810433728, "grad_norm": 11.243239402770996, "learning_rate": 8.192663421211608e-06, "loss": 1.5806, "step": 158800 }, { "epoch": 6.024416135881104, "grad_norm": 10.675949096679688, "learning_rate": 8.179648725489554e-06, "loss": 1.5452, "step": 158900 }, { "epoch": 6.02820746132848, "grad_norm": 10.91494083404541, "learning_rate": 8.16663721801478e-06, "loss": 1.5791, "step": 159000 }, { "epoch": 6.031998786775857, "grad_norm": 11.521324157714844, "learning_rate": 8.15362892157623e-06, "loss": 1.5492, "step": 159100 }, { "epoch": 6.035790112223233, "grad_norm": 10.850177764892578, "learning_rate": 8.140623858957242e-06, "loss": 1.5172, "step": 159200 }, { "epoch": 6.03958143767061, "grad_norm": 10.737998962402344, "learning_rate": 8.127622052935483e-06, "loss": 1.5836, "step": 159300 }, { "epoch": 6.043372763117986, "grad_norm": 9.78996467590332, "learning_rate": 8.114623526282913e-06, "loss": 1.5683, "step": 159400 }, { "epoch": 6.047164088565363, "grad_norm": 12.65347671508789, "learning_rate": 8.101628301765752e-06, "loss": 1.6089, "step": 159500 }, { "epoch": 6.050955414012739, "grad_norm": 10.2801513671875, "learning_rate": 8.088636402144442e-06, "loss": 1.5719, "step": 159600 }, { "epoch": 6.054746739460115, "grad_norm": 11.827587127685547, "learning_rate": 8.075647850173588e-06, "loss": 1.5632, "step": 159700 }, { "epoch": 6.058538064907491, "grad_norm": 12.672281265258789, "learning_rate": 8.062662668601941e-06, "loss": 1.5426, "step": 159800 }, { "epoch": 6.0623293903548685, "grad_norm": 11.21347427368164, "learning_rate": 8.049680880172353e-06, "loss": 1.5792, "step": 159900 }, { "epoch": 6.066120715802245, "grad_norm": 11.369627952575684, "learning_rate": 8.036702507621727e-06, "loss": 1.5582, "step": 160000 }, { "epoch": 6.069912041249621, "grad_norm": 12.218276977539062, "learning_rate": 8.023727573680978e-06, "loss": 1.5495, "step": 160100 }, { "epoch": 6.073703366696997, "grad_norm": 10.674248695373535, "learning_rate": 8.010756101075008e-06, "loss": 1.5591, "step": 160200 }, { "epoch": 6.0774946921443735, "grad_norm": 12.03752613067627, "learning_rate": 7.997788112522656e-06, "loss": 1.5789, "step": 160300 }, { "epoch": 6.08128601759175, "grad_norm": 12.508307456970215, "learning_rate": 7.984823630736646e-06, "loss": 1.5787, "step": 160400 }, { "epoch": 6.085077343039126, "grad_norm": 12.167118072509766, "learning_rate": 7.971862678423582e-06, "loss": 1.559, "step": 160500 }, { "epoch": 6.088868668486503, "grad_norm": 11.51626205444336, "learning_rate": 7.95890527828387e-06, "loss": 1.5384, "step": 160600 }, { "epoch": 6.0926599939338795, "grad_norm": 11.625091552734375, "learning_rate": 7.945951453011695e-06, "loss": 1.5691, "step": 160700 }, { "epoch": 6.096451319381256, "grad_norm": 11.168635368347168, "learning_rate": 7.93300122529499e-06, "loss": 1.572, "step": 160800 }, { "epoch": 6.100242644828632, "grad_norm": 11.287224769592285, "learning_rate": 7.920054617815376e-06, "loss": 1.582, "step": 160900 }, { "epoch": 6.104033970276008, "grad_norm": 11.98828411102295, "learning_rate": 7.90711165324815e-06, "loss": 1.5894, "step": 161000 }, { "epoch": 6.1078252957233845, "grad_norm": 11.063945770263672, "learning_rate": 7.894172354262202e-06, "loss": 1.5572, "step": 161100 }, { "epoch": 6.111616621170762, "grad_norm": 11.431550025939941, "learning_rate": 7.881236743520029e-06, "loss": 1.5518, "step": 161200 }, { "epoch": 6.115407946618138, "grad_norm": 12.146788597106934, "learning_rate": 7.868304843677656e-06, "loss": 1.5632, "step": 161300 }, { "epoch": 6.119199272065514, "grad_norm": 11.888874053955078, "learning_rate": 7.855376677384608e-06, "loss": 1.5617, "step": 161400 }, { "epoch": 6.12299059751289, "grad_norm": 10.939279556274414, "learning_rate": 7.84245226728387e-06, "loss": 1.5732, "step": 161500 }, { "epoch": 6.126781922960267, "grad_norm": 11.886305809020996, "learning_rate": 7.829531636011853e-06, "loss": 1.5619, "step": 161600 }, { "epoch": 6.130573248407643, "grad_norm": 11.215821266174316, "learning_rate": 7.816614806198347e-06, "loss": 1.5622, "step": 161700 }, { "epoch": 6.13436457385502, "grad_norm": 10.780815124511719, "learning_rate": 7.803701800466481e-06, "loss": 1.5931, "step": 161800 }, { "epoch": 6.138155899302396, "grad_norm": 11.858797073364258, "learning_rate": 7.790792641432694e-06, "loss": 1.539, "step": 161900 }, { "epoch": 6.141947224749773, "grad_norm": 10.948421478271484, "learning_rate": 7.777887351706684e-06, "loss": 1.5625, "step": 162000 }, { "epoch": 6.145738550197149, "grad_norm": 12.686944007873535, "learning_rate": 7.764985953891368e-06, "loss": 1.5427, "step": 162100 }, { "epoch": 6.149529875644525, "grad_norm": 11.992308616638184, "learning_rate": 7.752088470582849e-06, "loss": 1.5362, "step": 162200 }, { "epoch": 6.153321201091901, "grad_norm": 10.531576156616211, "learning_rate": 7.73919492437038e-06, "loss": 1.5784, "step": 162300 }, { "epoch": 6.1571125265392785, "grad_norm": 11.363094329833984, "learning_rate": 7.726305337836306e-06, "loss": 1.5801, "step": 162400 }, { "epoch": 6.160903851986655, "grad_norm": 12.400845527648926, "learning_rate": 7.713419733556047e-06, "loss": 1.5529, "step": 162500 }, { "epoch": 6.164695177434031, "grad_norm": 11.559149742126465, "learning_rate": 7.700538134098052e-06, "loss": 1.5961, "step": 162600 }, { "epoch": 6.168486502881407, "grad_norm": 10.350358963012695, "learning_rate": 7.687660562023736e-06, "loss": 1.5576, "step": 162700 }, { "epoch": 6.172277828328784, "grad_norm": 11.613056182861328, "learning_rate": 7.674787039887484e-06, "loss": 1.5861, "step": 162800 }, { "epoch": 6.17606915377616, "grad_norm": 10.37353515625, "learning_rate": 7.661917590236573e-06, "loss": 1.5745, "step": 162900 }, { "epoch": 6.179860479223537, "grad_norm": 12.00024127960205, "learning_rate": 7.649052235611154e-06, "loss": 1.6068, "step": 163000 }, { "epoch": 6.183651804670913, "grad_norm": 11.836668014526367, "learning_rate": 7.636190998544198e-06, "loss": 1.5775, "step": 163100 }, { "epoch": 6.1874431301182895, "grad_norm": 11.959616661071777, "learning_rate": 7.623333901561471e-06, "loss": 1.5416, "step": 163200 }, { "epoch": 6.191234455565666, "grad_norm": 11.583044052124023, "learning_rate": 7.610480967181492e-06, "loss": 1.5821, "step": 163300 }, { "epoch": 6.195025781013042, "grad_norm": 11.979923248291016, "learning_rate": 7.597632217915476e-06, "loss": 1.5704, "step": 163400 }, { "epoch": 6.198817106460418, "grad_norm": 11.3414888381958, "learning_rate": 7.58478767626732e-06, "loss": 1.5787, "step": 163500 }, { "epoch": 6.2026084319077945, "grad_norm": 12.808886528015137, "learning_rate": 7.57194736473355e-06, "loss": 1.5708, "step": 163600 }, { "epoch": 6.206399757355172, "grad_norm": 11.763092041015625, "learning_rate": 7.559111305803273e-06, "loss": 1.5501, "step": 163700 }, { "epoch": 6.210191082802548, "grad_norm": 10.817452430725098, "learning_rate": 7.54627952195816e-06, "loss": 1.5608, "step": 163800 }, { "epoch": 6.213982408249924, "grad_norm": 13.040711402893066, "learning_rate": 7.533452035672387e-06, "loss": 1.5817, "step": 163900 }, { "epoch": 6.2177737336973005, "grad_norm": 10.778230667114258, "learning_rate": 7.520628869412615e-06, "loss": 1.5636, "step": 164000 }, { "epoch": 6.221565059144677, "grad_norm": 12.305252075195312, "learning_rate": 7.507810045637915e-06, "loss": 1.5718, "step": 164100 }, { "epoch": 6.225356384592053, "grad_norm": 11.195343971252441, "learning_rate": 7.494995586799776e-06, "loss": 1.5824, "step": 164200 }, { "epoch": 6.22914771003943, "grad_norm": 11.273686408996582, "learning_rate": 7.482185515342034e-06, "loss": 1.5676, "step": 164300 }, { "epoch": 6.232939035486806, "grad_norm": 11.690625190734863, "learning_rate": 7.469379853700835e-06, "loss": 1.5515, "step": 164400 }, { "epoch": 6.236730360934183, "grad_norm": 12.36613655090332, "learning_rate": 7.456578624304607e-06, "loss": 1.5263, "step": 164500 }, { "epoch": 6.240521686381559, "grad_norm": 11.007948875427246, "learning_rate": 7.443781849574017e-06, "loss": 1.5566, "step": 164600 }, { "epoch": 6.244313011828935, "grad_norm": 11.56943130493164, "learning_rate": 7.430989551921922e-06, "loss": 1.5945, "step": 164700 }, { "epoch": 6.248104337276311, "grad_norm": 11.851801872253418, "learning_rate": 7.418201753753345e-06, "loss": 1.5706, "step": 164800 }, { "epoch": 6.251895662723689, "grad_norm": 10.846909523010254, "learning_rate": 7.405418477465429e-06, "loss": 1.5695, "step": 164900 }, { "epoch": 6.255686988171065, "grad_norm": 13.116886138916016, "learning_rate": 7.392639745447394e-06, "loss": 1.5858, "step": 165000 }, { "epoch": 6.259478313618441, "grad_norm": 12.648842811584473, "learning_rate": 7.379865580080499e-06, "loss": 1.5628, "step": 165100 }, { "epoch": 6.263269639065817, "grad_norm": 11.038084983825684, "learning_rate": 7.367096003738006e-06, "loss": 1.5602, "step": 165200 }, { "epoch": 6.267060964513194, "grad_norm": 12.708508491516113, "learning_rate": 7.3543310387851455e-06, "loss": 1.577, "step": 165300 }, { "epoch": 6.27085228996057, "grad_norm": 11.94873332977295, "learning_rate": 7.341570707579062e-06, "loss": 1.5558, "step": 165400 }, { "epoch": 6.274643615407947, "grad_norm": 11.363472938537598, "learning_rate": 7.3288150324687865e-06, "loss": 1.5633, "step": 165500 }, { "epoch": 6.278434940855323, "grad_norm": 12.280117988586426, "learning_rate": 7.316064035795208e-06, "loss": 1.5688, "step": 165600 }, { "epoch": 6.2822262663026995, "grad_norm": 12.41207218170166, "learning_rate": 7.303317739890998e-06, "loss": 1.586, "step": 165700 }, { "epoch": 6.286017591750076, "grad_norm": 12.464340209960938, "learning_rate": 7.290576167080615e-06, "loss": 1.5495, "step": 165800 }, { "epoch": 6.289808917197452, "grad_norm": 12.295889854431152, "learning_rate": 7.2778393396802374e-06, "loss": 1.5508, "step": 165900 }, { "epoch": 6.293600242644828, "grad_norm": 11.185117721557617, "learning_rate": 7.265107279997733e-06, "loss": 1.5742, "step": 166000 }, { "epoch": 6.2973915680922055, "grad_norm": 12.16705322265625, "learning_rate": 7.252380010332616e-06, "loss": 1.5888, "step": 166100 }, { "epoch": 6.301182893539582, "grad_norm": 12.094047546386719, "learning_rate": 7.239657552976014e-06, "loss": 1.6054, "step": 166200 }, { "epoch": 6.304974218986958, "grad_norm": 11.859914779663086, "learning_rate": 7.226939930210635e-06, "loss": 1.5423, "step": 166300 }, { "epoch": 6.308765544434334, "grad_norm": 11.034880638122559, "learning_rate": 7.214227164310698e-06, "loss": 1.5956, "step": 166400 }, { "epoch": 6.3125568698817105, "grad_norm": 11.484114646911621, "learning_rate": 7.201519277541935e-06, "loss": 1.576, "step": 166500 }, { "epoch": 6.316348195329087, "grad_norm": 12.680161476135254, "learning_rate": 7.188816292161528e-06, "loss": 1.5744, "step": 166600 }, { "epoch": 6.320139520776463, "grad_norm": 12.29842758178711, "learning_rate": 7.176118230418067e-06, "loss": 1.5851, "step": 166700 }, { "epoch": 6.32393084622384, "grad_norm": 12.066814422607422, "learning_rate": 7.163425114551525e-06, "loss": 1.5989, "step": 166800 }, { "epoch": 6.327722171671216, "grad_norm": 11.265061378479004, "learning_rate": 7.15073696679321e-06, "loss": 1.5882, "step": 166900 }, { "epoch": 6.331513497118593, "grad_norm": 11.37795639038086, "learning_rate": 7.1380538093657395e-06, "loss": 1.5837, "step": 167000 }, { "epoch": 6.335304822565969, "grad_norm": 14.099513053894043, "learning_rate": 7.125375664482965e-06, "loss": 1.5572, "step": 167100 }, { "epoch": 6.339096148013345, "grad_norm": 11.486617088317871, "learning_rate": 7.112702554349986e-06, "loss": 1.573, "step": 167200 }, { "epoch": 6.3428874734607215, "grad_norm": 10.815934181213379, "learning_rate": 7.100034501163071e-06, "loss": 1.5901, "step": 167300 }, { "epoch": 6.346678798908099, "grad_norm": 12.056611061096191, "learning_rate": 7.087371527109632e-06, "loss": 1.5883, "step": 167400 }, { "epoch": 6.350470124355475, "grad_norm": 11.79443645477295, "learning_rate": 7.074713654368185e-06, "loss": 1.5759, "step": 167500 }, { "epoch": 6.354261449802851, "grad_norm": 12.65126895904541, "learning_rate": 7.0620609051083165e-06, "loss": 1.5921, "step": 167600 }, { "epoch": 6.358052775250227, "grad_norm": 11.01827335357666, "learning_rate": 7.049413301490631e-06, "loss": 1.5757, "step": 167700 }, { "epoch": 6.361844100697604, "grad_norm": 11.086094856262207, "learning_rate": 7.036770865666729e-06, "loss": 1.5575, "step": 167800 }, { "epoch": 6.36563542614498, "grad_norm": 12.343743324279785, "learning_rate": 7.024133619779156e-06, "loss": 1.5999, "step": 167900 }, { "epoch": 6.369426751592357, "grad_norm": 12.470636367797852, "learning_rate": 7.011501585961369e-06, "loss": 1.5791, "step": 168000 }, { "epoch": 6.373218077039733, "grad_norm": 11.98880672454834, "learning_rate": 6.998874786337691e-06, "loss": 1.5708, "step": 168100 }, { "epoch": 6.37700940248711, "grad_norm": 13.099966049194336, "learning_rate": 6.986253243023285e-06, "loss": 1.587, "step": 168200 }, { "epoch": 6.380800727934486, "grad_norm": 12.526756286621094, "learning_rate": 6.973636978124108e-06, "loss": 1.5785, "step": 168300 }, { "epoch": 6.384592053381862, "grad_norm": 12.60748291015625, "learning_rate": 6.961026013736859e-06, "loss": 1.5489, "step": 168400 }, { "epoch": 6.388383378829238, "grad_norm": 12.510016441345215, "learning_rate": 6.948420371948971e-06, "loss": 1.5812, "step": 168500 }, { "epoch": 6.3921747042766155, "grad_norm": 13.390847206115723, "learning_rate": 6.935820074838551e-06, "loss": 1.5981, "step": 168600 }, { "epoch": 6.395966029723992, "grad_norm": 12.104558944702148, "learning_rate": 6.923225144474331e-06, "loss": 1.5989, "step": 168700 }, { "epoch": 6.399757355171368, "grad_norm": 11.727679252624512, "learning_rate": 6.9106356029156584e-06, "loss": 1.5905, "step": 168800 }, { "epoch": 6.403548680618744, "grad_norm": 11.340494155883789, "learning_rate": 6.898051472212441e-06, "loss": 1.5851, "step": 168900 }, { "epoch": 6.4073400060661205, "grad_norm": 12.350801467895508, "learning_rate": 6.8854727744050995e-06, "loss": 1.5943, "step": 169000 }, { "epoch": 6.411131331513497, "grad_norm": 11.936524391174316, "learning_rate": 6.872899531524548e-06, "loss": 1.5812, "step": 169100 }, { "epoch": 6.414922656960874, "grad_norm": 12.08755111694336, "learning_rate": 6.860331765592149e-06, "loss": 1.5651, "step": 169200 }, { "epoch": 6.41871398240825, "grad_norm": 12.988269805908203, "learning_rate": 6.8477694986196664e-06, "loss": 1.6106, "step": 169300 }, { "epoch": 6.4225053078556265, "grad_norm": 12.905387878417969, "learning_rate": 6.83521275260923e-06, "loss": 1.5987, "step": 169400 }, { "epoch": 6.426296633303003, "grad_norm": 10.5891695022583, "learning_rate": 6.82266154955331e-06, "loss": 1.5772, "step": 169500 }, { "epoch": 6.430087958750379, "grad_norm": 12.226225852966309, "learning_rate": 6.810115911434663e-06, "loss": 1.5558, "step": 169600 }, { "epoch": 6.433879284197755, "grad_norm": 12.79734992980957, "learning_rate": 6.797575860226293e-06, "loss": 1.585, "step": 169700 }, { "epoch": 6.4376706096451315, "grad_norm": 11.220429420471191, "learning_rate": 6.785041417891427e-06, "loss": 1.5584, "step": 169800 }, { "epoch": 6.441461935092509, "grad_norm": 11.418993949890137, "learning_rate": 6.772512606383475e-06, "loss": 1.5768, "step": 169900 }, { "epoch": 6.445253260539885, "grad_norm": 11.215234756469727, "learning_rate": 6.75998944764596e-06, "loss": 1.5747, "step": 170000 }, { "epoch": 6.449044585987261, "grad_norm": 11.339964866638184, "learning_rate": 6.747471963612533e-06, "loss": 1.5909, "step": 170100 }, { "epoch": 6.452835911434637, "grad_norm": 12.44511604309082, "learning_rate": 6.734960176206888e-06, "loss": 1.5998, "step": 170200 }, { "epoch": 6.456627236882014, "grad_norm": 13.004100799560547, "learning_rate": 6.722454107342751e-06, "loss": 1.5944, "step": 170300 }, { "epoch": 6.460418562329391, "grad_norm": 11.460034370422363, "learning_rate": 6.709953778923824e-06, "loss": 1.5833, "step": 170400 }, { "epoch": 6.464209887776767, "grad_norm": 11.30329704284668, "learning_rate": 6.69745921284376e-06, "loss": 1.5897, "step": 170500 }, { "epoch": 6.468001213224143, "grad_norm": 11.543671607971191, "learning_rate": 6.684970430986126e-06, "loss": 1.5245, "step": 170600 }, { "epoch": 6.47179253867152, "grad_norm": 12.401044845581055, "learning_rate": 6.67248745522434e-06, "loss": 1.6057, "step": 170700 }, { "epoch": 6.475583864118896, "grad_norm": 11.761506080627441, "learning_rate": 6.660010307421669e-06, "loss": 1.5872, "step": 170800 }, { "epoch": 6.479375189566272, "grad_norm": 10.88347053527832, "learning_rate": 6.6475390094311675e-06, "loss": 1.5836, "step": 170900 }, { "epoch": 6.483166515013648, "grad_norm": 11.388224601745605, "learning_rate": 6.635073583095637e-06, "loss": 1.5726, "step": 171000 }, { "epoch": 6.4869578404610255, "grad_norm": 12.162073135375977, "learning_rate": 6.6226140502476045e-06, "loss": 1.5582, "step": 171100 }, { "epoch": 6.490749165908402, "grad_norm": 11.910102844238281, "learning_rate": 6.610160432709266e-06, "loss": 1.5748, "step": 171200 }, { "epoch": 6.494540491355778, "grad_norm": 11.985915184020996, "learning_rate": 6.597712752292473e-06, "loss": 1.5928, "step": 171300 }, { "epoch": 6.498331816803154, "grad_norm": 10.915767669677734, "learning_rate": 6.585271030798657e-06, "loss": 1.5502, "step": 171400 }, { "epoch": 6.502123142250531, "grad_norm": 13.120352745056152, "learning_rate": 6.572835290018828e-06, "loss": 1.5839, "step": 171500 }, { "epoch": 6.505914467697907, "grad_norm": 11.82824993133545, "learning_rate": 6.560405551733517e-06, "loss": 1.5529, "step": 171600 }, { "epoch": 6.509705793145284, "grad_norm": 11.547855377197266, "learning_rate": 6.54798183771274e-06, "loss": 1.5517, "step": 171700 }, { "epoch": 6.51349711859266, "grad_norm": 12.268568992614746, "learning_rate": 6.5355641697159644e-06, "loss": 1.5782, "step": 171800 }, { "epoch": 6.5172884440400365, "grad_norm": 10.838194847106934, "learning_rate": 6.523152569492068e-06, "loss": 1.5863, "step": 171900 }, { "epoch": 6.521079769487413, "grad_norm": 12.038179397583008, "learning_rate": 6.510747058779299e-06, "loss": 1.6017, "step": 172000 }, { "epoch": 6.524871094934789, "grad_norm": 13.132615089416504, "learning_rate": 6.49834765930524e-06, "loss": 1.5878, "step": 172100 }, { "epoch": 6.528662420382165, "grad_norm": 12.512954711914062, "learning_rate": 6.485954392786775e-06, "loss": 1.5735, "step": 172200 }, { "epoch": 6.532453745829542, "grad_norm": 13.542767524719238, "learning_rate": 6.473567280930044e-06, "loss": 1.6051, "step": 172300 }, { "epoch": 6.536245071276919, "grad_norm": 11.998785972595215, "learning_rate": 6.461186345430404e-06, "loss": 1.5806, "step": 172400 }, { "epoch": 6.540036396724295, "grad_norm": 11.366271018981934, "learning_rate": 6.448811607972396e-06, "loss": 1.5944, "step": 172500 }, { "epoch": 6.543827722171671, "grad_norm": 12.306340217590332, "learning_rate": 6.436443090229711e-06, "loss": 1.5845, "step": 172600 }, { "epoch": 6.5476190476190474, "grad_norm": 10.787001609802246, "learning_rate": 6.424080813865139e-06, "loss": 1.5897, "step": 172700 }, { "epoch": 6.551410373066424, "grad_norm": 12.918703079223633, "learning_rate": 6.41172480053054e-06, "loss": 1.5717, "step": 172800 }, { "epoch": 6.5552016985138, "grad_norm": 13.117171287536621, "learning_rate": 6.399375071866814e-06, "loss": 1.586, "step": 172900 }, { "epoch": 6.558993023961177, "grad_norm": 12.480173110961914, "learning_rate": 6.387031649503835e-06, "loss": 1.6053, "step": 173000 }, { "epoch": 6.562784349408553, "grad_norm": 11.882287979125977, "learning_rate": 6.37469455506045e-06, "loss": 1.6014, "step": 173100 }, { "epoch": 6.56657567485593, "grad_norm": 12.698135375976562, "learning_rate": 6.362363810144414e-06, "loss": 1.544, "step": 173200 }, { "epoch": 6.570367000303306, "grad_norm": 11.684245109558105, "learning_rate": 6.3500394363523645e-06, "loss": 1.5935, "step": 173300 }, { "epoch": 6.574158325750682, "grad_norm": 11.454160690307617, "learning_rate": 6.3377214552697765e-06, "loss": 1.5745, "step": 173400 }, { "epoch": 6.577949651198059, "grad_norm": 12.443729400634766, "learning_rate": 6.325409888470929e-06, "loss": 1.5815, "step": 173500 }, { "epoch": 6.581740976645436, "grad_norm": 12.456474304199219, "learning_rate": 6.313104757518879e-06, "loss": 1.5662, "step": 173600 }, { "epoch": 6.585532302092812, "grad_norm": 11.548056602478027, "learning_rate": 6.300806083965386e-06, "loss": 1.5749, "step": 173700 }, { "epoch": 6.589323627540188, "grad_norm": 12.651735305786133, "learning_rate": 6.288513889350925e-06, "loss": 1.566, "step": 173800 }, { "epoch": 6.593114952987564, "grad_norm": 12.43497085571289, "learning_rate": 6.276228195204612e-06, "loss": 1.5869, "step": 173900 }, { "epoch": 6.596906278434941, "grad_norm": 12.086784362792969, "learning_rate": 6.263949023044176e-06, "loss": 1.5659, "step": 174000 }, { "epoch": 6.600697603882317, "grad_norm": 13.311132431030273, "learning_rate": 6.251676394375926e-06, "loss": 1.5668, "step": 174100 }, { "epoch": 6.604488929329694, "grad_norm": 14.332047462463379, "learning_rate": 6.239410330694711e-06, "loss": 1.5831, "step": 174200 }, { "epoch": 6.60828025477707, "grad_norm": 11.911043167114258, "learning_rate": 6.2271508534838885e-06, "loss": 1.6021, "step": 174300 }, { "epoch": 6.6120715802244465, "grad_norm": 12.088520050048828, "learning_rate": 6.21489798421526e-06, "loss": 1.5389, "step": 174400 }, { "epoch": 6.615862905671823, "grad_norm": 11.501230239868164, "learning_rate": 6.202651744349076e-06, "loss": 1.5478, "step": 174500 }, { "epoch": 6.619654231119199, "grad_norm": 12.913456916809082, "learning_rate": 6.190412155333967e-06, "loss": 1.5579, "step": 174600 }, { "epoch": 6.623445556566575, "grad_norm": 12.485562324523926, "learning_rate": 6.1781792386069074e-06, "loss": 1.6149, "step": 174700 }, { "epoch": 6.6272368820139524, "grad_norm": 10.915718078613281, "learning_rate": 6.165953015593196e-06, "loss": 1.601, "step": 174800 }, { "epoch": 6.631028207461329, "grad_norm": 12.131805419921875, "learning_rate": 6.1537335077064066e-06, "loss": 1.5862, "step": 174900 }, { "epoch": 6.634819532908705, "grad_norm": 11.747113227844238, "learning_rate": 6.141520736348345e-06, "loss": 1.5758, "step": 175000 }, { "epoch": 6.638610858356081, "grad_norm": 11.681252479553223, "learning_rate": 6.129314722909024e-06, "loss": 1.5744, "step": 175100 }, { "epoch": 6.6424021838034575, "grad_norm": 12.15557861328125, "learning_rate": 6.11711548876662e-06, "loss": 1.5741, "step": 175200 }, { "epoch": 6.646193509250834, "grad_norm": 12.310876846313477, "learning_rate": 6.10492305528744e-06, "loss": 1.5618, "step": 175300 }, { "epoch": 6.649984834698211, "grad_norm": 12.02529239654541, "learning_rate": 6.092737443825867e-06, "loss": 1.5673, "step": 175400 }, { "epoch": 6.653776160145587, "grad_norm": 11.48465633392334, "learning_rate": 6.080558675724347e-06, "loss": 1.5505, "step": 175500 }, { "epoch": 6.657567485592963, "grad_norm": 11.90403938293457, "learning_rate": 6.06838677231334e-06, "loss": 1.6107, "step": 175600 }, { "epoch": 6.66135881104034, "grad_norm": 10.626382827758789, "learning_rate": 6.056221754911269e-06, "loss": 1.5812, "step": 175700 }, { "epoch": 6.665150136487716, "grad_norm": 11.586104393005371, "learning_rate": 6.044063644824518e-06, "loss": 1.5465, "step": 175800 }, { "epoch": 6.668941461935092, "grad_norm": 12.849113464355469, "learning_rate": 6.031912463347361e-06, "loss": 1.5788, "step": 175900 }, { "epoch": 6.6727327873824684, "grad_norm": 11.026603698730469, "learning_rate": 6.019768231761934e-06, "loss": 1.5702, "step": 176000 }, { "epoch": 6.676524112829846, "grad_norm": 12.312780380249023, "learning_rate": 6.007630971338207e-06, "loss": 1.5644, "step": 176100 }, { "epoch": 6.680315438277222, "grad_norm": 11.3616304397583, "learning_rate": 5.995500703333939e-06, "loss": 1.5835, "step": 176200 }, { "epoch": 6.684106763724598, "grad_norm": 12.169947624206543, "learning_rate": 5.983377448994642e-06, "loss": 1.5717, "step": 176300 }, { "epoch": 6.687898089171974, "grad_norm": 12.28679370880127, "learning_rate": 5.9712612295535375e-06, "loss": 1.5207, "step": 176400 }, { "epoch": 6.691689414619351, "grad_norm": 12.259623527526855, "learning_rate": 5.959152066231539e-06, "loss": 1.576, "step": 176500 }, { "epoch": 6.695480740066728, "grad_norm": 11.888885498046875, "learning_rate": 5.947049980237195e-06, "loss": 1.5844, "step": 176600 }, { "epoch": 6.699272065514104, "grad_norm": 11.779487609863281, "learning_rate": 5.9349549927666505e-06, "loss": 1.5835, "step": 176700 }, { "epoch": 6.70306339096148, "grad_norm": 12.409217834472656, "learning_rate": 5.922867125003631e-06, "loss": 1.6009, "step": 176800 }, { "epoch": 6.7068547164088566, "grad_norm": 11.563682556152344, "learning_rate": 5.9107863981193844e-06, "loss": 1.5739, "step": 176900 }, { "epoch": 6.710646041856233, "grad_norm": 12.317644119262695, "learning_rate": 5.8987128332726525e-06, "loss": 1.5433, "step": 177000 }, { "epoch": 6.714437367303609, "grad_norm": 12.137526512145996, "learning_rate": 5.886646451609632e-06, "loss": 1.5699, "step": 177100 }, { "epoch": 6.718228692750985, "grad_norm": 12.462454795837402, "learning_rate": 5.874587274263946e-06, "loss": 1.5534, "step": 177200 }, { "epoch": 6.7220200181983625, "grad_norm": 11.364400863647461, "learning_rate": 5.862535322356595e-06, "loss": 1.5932, "step": 177300 }, { "epoch": 6.725811343645739, "grad_norm": 12.443015098571777, "learning_rate": 5.850490616995921e-06, "loss": 1.5646, "step": 177400 }, { "epoch": 6.729602669093115, "grad_norm": 11.145082473754883, "learning_rate": 5.838453179277574e-06, "loss": 1.5489, "step": 177500 }, { "epoch": 6.733393994540491, "grad_norm": 12.626555442810059, "learning_rate": 5.826423030284489e-06, "loss": 1.5611, "step": 177600 }, { "epoch": 6.7371853199878675, "grad_norm": 13.616358757019043, "learning_rate": 5.814400191086808e-06, "loss": 1.568, "step": 177700 }, { "epoch": 6.740976645435244, "grad_norm": 11.458916664123535, "learning_rate": 5.802384682741902e-06, "loss": 1.588, "step": 177800 }, { "epoch": 6.744767970882621, "grad_norm": 12.305092811584473, "learning_rate": 5.790376526294282e-06, "loss": 1.6002, "step": 177900 }, { "epoch": 6.748559296329997, "grad_norm": 12.611316680908203, "learning_rate": 5.778375742775585e-06, "loss": 1.5845, "step": 178000 }, { "epoch": 6.752350621777373, "grad_norm": 11.844655990600586, "learning_rate": 5.766382353204538e-06, "loss": 1.5986, "step": 178100 }, { "epoch": 6.75614194722475, "grad_norm": 12.948076248168945, "learning_rate": 5.754396378586915e-06, "loss": 1.6091, "step": 178200 }, { "epoch": 6.759933272672126, "grad_norm": 10.998056411743164, "learning_rate": 5.7424178399155176e-06, "loss": 1.5775, "step": 178300 }, { "epoch": 6.763724598119502, "grad_norm": 12.610383987426758, "learning_rate": 5.7304467581700976e-06, "loss": 1.5543, "step": 178400 }, { "epoch": 6.767515923566879, "grad_norm": 12.120309829711914, "learning_rate": 5.718483154317365e-06, "loss": 1.5721, "step": 178500 }, { "epoch": 6.771307249014256, "grad_norm": 12.690911293029785, "learning_rate": 5.706527049310929e-06, "loss": 1.5817, "step": 178600 }, { "epoch": 6.775098574461632, "grad_norm": 12.65075397491455, "learning_rate": 5.694578464091264e-06, "loss": 1.592, "step": 178700 }, { "epoch": 6.778889899909008, "grad_norm": 12.895487785339355, "learning_rate": 5.682637419585672e-06, "loss": 1.5767, "step": 178800 }, { "epoch": 6.782681225356384, "grad_norm": 12.082797050476074, "learning_rate": 5.670703936708256e-06, "loss": 1.5685, "step": 178900 }, { "epoch": 6.786472550803761, "grad_norm": 10.953842163085938, "learning_rate": 5.65877803635986e-06, "loss": 1.5933, "step": 179000 }, { "epoch": 6.790263876251137, "grad_norm": 10.629667282104492, "learning_rate": 5.646859739428056e-06, "loss": 1.5804, "step": 179100 }, { "epoch": 6.794055201698514, "grad_norm": 12.160419464111328, "learning_rate": 5.634949066787108e-06, "loss": 1.566, "step": 179200 }, { "epoch": 6.79784652714589, "grad_norm": 12.70361614227295, "learning_rate": 5.6230460392979185e-06, "loss": 1.5657, "step": 179300 }, { "epoch": 6.801637852593267, "grad_norm": 11.730588912963867, "learning_rate": 5.61115067780799e-06, "loss": 1.5922, "step": 179400 }, { "epoch": 6.805429178040643, "grad_norm": 14.135165214538574, "learning_rate": 5.5992630031514136e-06, "loss": 1.5691, "step": 179500 }, { "epoch": 6.809220503488019, "grad_norm": 12.373421669006348, "learning_rate": 5.58738303614881e-06, "loss": 1.5675, "step": 179600 }, { "epoch": 6.813011828935396, "grad_norm": 11.99539852142334, "learning_rate": 5.575510797607304e-06, "loss": 1.5583, "step": 179700 }, { "epoch": 6.8168031543827725, "grad_norm": 11.734624862670898, "learning_rate": 5.563646308320479e-06, "loss": 1.5767, "step": 179800 }, { "epoch": 6.820594479830149, "grad_norm": 11.702163696289062, "learning_rate": 5.551789589068354e-06, "loss": 1.5778, "step": 179900 }, { "epoch": 6.824385805277525, "grad_norm": 11.904010772705078, "learning_rate": 5.539940660617331e-06, "loss": 1.575, "step": 180000 }, { "epoch": 6.828177130724901, "grad_norm": 13.515007019042969, "learning_rate": 5.528099543720171e-06, "loss": 1.5754, "step": 180100 }, { "epoch": 6.8319684561722775, "grad_norm": 12.8431396484375, "learning_rate": 5.5162662591159534e-06, "loss": 1.5479, "step": 180200 }, { "epoch": 6.835759781619654, "grad_norm": 11.991058349609375, "learning_rate": 5.504440827530043e-06, "loss": 1.5728, "step": 180300 }, { "epoch": 6.839551107067031, "grad_norm": 11.53125286102295, "learning_rate": 5.492623269674036e-06, "loss": 1.6055, "step": 180400 }, { "epoch": 6.843342432514407, "grad_norm": 10.548478126525879, "learning_rate": 5.480813606245759e-06, "loss": 1.5737, "step": 180500 }, { "epoch": 6.8471337579617835, "grad_norm": 12.238279342651367, "learning_rate": 5.469011857929202e-06, "loss": 1.5739, "step": 180600 }, { "epoch": 6.85092508340916, "grad_norm": 11.689667701721191, "learning_rate": 5.457218045394485e-06, "loss": 1.5935, "step": 180700 }, { "epoch": 6.854716408856536, "grad_norm": 12.386988639831543, "learning_rate": 5.445432189297839e-06, "loss": 1.5666, "step": 180800 }, { "epoch": 6.858507734303913, "grad_norm": 10.942350387573242, "learning_rate": 5.433654310281558e-06, "loss": 1.5775, "step": 180900 }, { "epoch": 6.862299059751289, "grad_norm": 11.2795991897583, "learning_rate": 5.42188442897396e-06, "loss": 1.5687, "step": 181000 }, { "epoch": 6.866090385198666, "grad_norm": 11.240723609924316, "learning_rate": 5.410122565989362e-06, "loss": 1.5666, "step": 181100 }, { "epoch": 6.869881710646042, "grad_norm": 12.121834754943848, "learning_rate": 5.398368741928036e-06, "loss": 1.5914, "step": 181200 }, { "epoch": 6.873673036093418, "grad_norm": 12.583788871765137, "learning_rate": 5.3866229773761614e-06, "loss": 1.5651, "step": 181300 }, { "epoch": 6.877464361540794, "grad_norm": 11.747200965881348, "learning_rate": 5.3748852929058225e-06, "loss": 1.5775, "step": 181400 }, { "epoch": 6.881255686988171, "grad_norm": 11.591804504394531, "learning_rate": 5.363155709074941e-06, "loss": 1.5491, "step": 181500 }, { "epoch": 6.885047012435548, "grad_norm": 11.792959213256836, "learning_rate": 5.351434246427253e-06, "loss": 1.5563, "step": 181600 }, { "epoch": 6.888838337882924, "grad_norm": 13.207117080688477, "learning_rate": 5.339720925492262e-06, "loss": 1.5729, "step": 181700 }, { "epoch": 6.8926296633303, "grad_norm": 13.433188438415527, "learning_rate": 5.328015766785219e-06, "loss": 1.5556, "step": 181800 }, { "epoch": 6.896420988777677, "grad_norm": 12.62912654876709, "learning_rate": 5.316318790807094e-06, "loss": 1.548, "step": 181900 }, { "epoch": 6.900212314225053, "grad_norm": 11.166362762451172, "learning_rate": 5.304630018044494e-06, "loss": 1.5718, "step": 182000 }, { "epoch": 6.904003639672429, "grad_norm": 12.012219429016113, "learning_rate": 5.2929494689696825e-06, "loss": 1.578, "step": 182100 }, { "epoch": 6.907794965119805, "grad_norm": 13.170934677124023, "learning_rate": 5.281277164040512e-06, "loss": 1.5717, "step": 182200 }, { "epoch": 6.9115862905671825, "grad_norm": 11.134900093078613, "learning_rate": 5.269613123700397e-06, "loss": 1.5713, "step": 182300 }, { "epoch": 6.915377616014559, "grad_norm": 12.227039337158203, "learning_rate": 5.2579573683782746e-06, "loss": 1.5759, "step": 182400 }, { "epoch": 6.919168941461935, "grad_norm": 11.756880760192871, "learning_rate": 5.246309918488574e-06, "loss": 1.5914, "step": 182500 }, { "epoch": 6.922960266909311, "grad_norm": 11.864381790161133, "learning_rate": 5.234670794431183e-06, "loss": 1.5727, "step": 182600 }, { "epoch": 6.926751592356688, "grad_norm": 11.939608573913574, "learning_rate": 5.223040016591383e-06, "loss": 1.5749, "step": 182700 }, { "epoch": 6.930542917804065, "grad_norm": 13.33112907409668, "learning_rate": 5.211417605339873e-06, "loss": 1.5577, "step": 182800 }, { "epoch": 6.934334243251441, "grad_norm": 12.545299530029297, "learning_rate": 5.199803581032677e-06, "loss": 1.5518, "step": 182900 }, { "epoch": 6.938125568698817, "grad_norm": 12.574894905090332, "learning_rate": 5.188197964011128e-06, "loss": 1.5731, "step": 183000 }, { "epoch": 6.9419168941461935, "grad_norm": 12.258808135986328, "learning_rate": 5.176600774601841e-06, "loss": 1.5656, "step": 183100 }, { "epoch": 6.94570821959357, "grad_norm": 11.367091178894043, "learning_rate": 5.165012033116672e-06, "loss": 1.5449, "step": 183200 }, { "epoch": 6.949499545040946, "grad_norm": 12.18317985534668, "learning_rate": 5.153431759852676e-06, "loss": 1.599, "step": 183300 }, { "epoch": 6.953290870488322, "grad_norm": 12.359045028686523, "learning_rate": 5.1418599750920784e-06, "loss": 1.6015, "step": 183400 }, { "epoch": 6.957082195935699, "grad_norm": 12.138041496276855, "learning_rate": 5.1302966991022375e-06, "loss": 1.5875, "step": 183500 }, { "epoch": 6.960873521383076, "grad_norm": 12.202312469482422, "learning_rate": 5.118741952135609e-06, "loss": 1.5638, "step": 183600 }, { "epoch": 6.964664846830452, "grad_norm": 13.207098007202148, "learning_rate": 5.1071957544297105e-06, "loss": 1.5507, "step": 183700 }, { "epoch": 6.968456172277828, "grad_norm": 11.600495338439941, "learning_rate": 5.095658126207086e-06, "loss": 1.5755, "step": 183800 }, { "epoch": 6.9722474977252045, "grad_norm": 12.143369674682617, "learning_rate": 5.084129087675273e-06, "loss": 1.5732, "step": 183900 }, { "epoch": 6.976038823172582, "grad_norm": 12.664071083068848, "learning_rate": 5.072608659026754e-06, "loss": 1.5384, "step": 184000 }, { "epoch": 6.979830148619958, "grad_norm": 11.89238166809082, "learning_rate": 5.061096860438941e-06, "loss": 1.5962, "step": 184100 }, { "epoch": 6.983621474067334, "grad_norm": 11.586669921875, "learning_rate": 5.049593712074141e-06, "loss": 1.5616, "step": 184200 }, { "epoch": 6.98741279951471, "grad_norm": 10.929930686950684, "learning_rate": 5.038099234079488e-06, "loss": 1.5946, "step": 184300 }, { "epoch": 6.991204124962087, "grad_norm": 12.143250465393066, "learning_rate": 5.026613446586943e-06, "loss": 1.559, "step": 184400 }, { "epoch": 6.994995450409463, "grad_norm": 11.404658317565918, "learning_rate": 5.015136369713247e-06, "loss": 1.5524, "step": 184500 }, { "epoch": 6.998786775856839, "grad_norm": 11.464829444885254, "learning_rate": 5.003668023559883e-06, "loss": 1.5672, "step": 184600 }, { "epoch": 7.0, "eval_accuracy": 0.5259731950750216, "eval_loss": 2.1297895908355713, "eval_runtime": 932.1345, "eval_samples_per_second": 905.474, "eval_steps_per_second": 7.074, "step": 184632 }, { "epoch": 7.002578101304216, "grad_norm": 10.56268310546875, "learning_rate": 4.992208428213038e-06, "loss": 1.5139, "step": 184700 }, { "epoch": 7.006369426751593, "grad_norm": 12.568649291992188, "learning_rate": 4.98075760374358e-06, "loss": 1.4407, "step": 184800 }, { "epoch": 7.010160752198969, "grad_norm": 12.517056465148926, "learning_rate": 4.9693155702070135e-06, "loss": 1.4531, "step": 184900 }, { "epoch": 7.013952077646345, "grad_norm": 10.836019515991211, "learning_rate": 4.957882347643436e-06, "loss": 1.4536, "step": 185000 }, { "epoch": 7.017743403093721, "grad_norm": 11.469989776611328, "learning_rate": 4.946457956077529e-06, "loss": 1.4617, "step": 185100 }, { "epoch": 7.021534728541098, "grad_norm": 11.820192337036133, "learning_rate": 4.9350424155185015e-06, "loss": 1.4582, "step": 185200 }, { "epoch": 7.025326053988475, "grad_norm": 11.849312782287598, "learning_rate": 4.923635745960052e-06, "loss": 1.4487, "step": 185300 }, { "epoch": 7.029117379435851, "grad_norm": 12.268925666809082, "learning_rate": 4.912237967380348e-06, "loss": 1.4849, "step": 185400 }, { "epoch": 7.032908704883227, "grad_norm": 11.798918724060059, "learning_rate": 4.900849099741985e-06, "loss": 1.4791, "step": 185500 }, { "epoch": 7.0367000303306035, "grad_norm": 11.124809265136719, "learning_rate": 4.889469162991961e-06, "loss": 1.4198, "step": 185600 }, { "epoch": 7.04049135577798, "grad_norm": 11.798725128173828, "learning_rate": 4.878098177061613e-06, "loss": 1.4775, "step": 185700 }, { "epoch": 7.044282681225356, "grad_norm": 10.969219207763672, "learning_rate": 4.8667361618666135e-06, "loss": 1.4431, "step": 185800 }, { "epoch": 7.048074006672733, "grad_norm": 11.691911697387695, "learning_rate": 4.85538313730692e-06, "loss": 1.4736, "step": 185900 }, { "epoch": 7.0518653321201095, "grad_norm": 11.743919372558594, "learning_rate": 4.844039123266746e-06, "loss": 1.459, "step": 186000 }, { "epoch": 7.055656657567486, "grad_norm": 13.230798721313477, "learning_rate": 4.832704139614519e-06, "loss": 1.4441, "step": 186100 }, { "epoch": 7.059447983014862, "grad_norm": 10.857861518859863, "learning_rate": 4.821378206202859e-06, "loss": 1.4656, "step": 186200 }, { "epoch": 7.063239308462238, "grad_norm": 12.192200660705566, "learning_rate": 4.8100613428685184e-06, "loss": 1.4455, "step": 186300 }, { "epoch": 7.0670306339096145, "grad_norm": 11.992803573608398, "learning_rate": 4.798753569432376e-06, "loss": 1.4465, "step": 186400 }, { "epoch": 7.070821959356991, "grad_norm": 12.342023849487305, "learning_rate": 4.787454905699396e-06, "loss": 1.4196, "step": 186500 }, { "epoch": 7.074613284804368, "grad_norm": 11.946333885192871, "learning_rate": 4.7761653714585785e-06, "loss": 1.4914, "step": 186600 }, { "epoch": 7.078404610251744, "grad_norm": 13.01424789428711, "learning_rate": 4.764884986482929e-06, "loss": 1.4582, "step": 186700 }, { "epoch": 7.08219593569912, "grad_norm": 12.088098526000977, "learning_rate": 4.753613770529437e-06, "loss": 1.4442, "step": 186800 }, { "epoch": 7.085987261146497, "grad_norm": 11.93554973602295, "learning_rate": 4.742351743339032e-06, "loss": 1.4626, "step": 186900 }, { "epoch": 7.089778586593873, "grad_norm": 12.095431327819824, "learning_rate": 4.731098924636547e-06, "loss": 1.455, "step": 187000 }, { "epoch": 7.093569912041249, "grad_norm": 12.00534439086914, "learning_rate": 4.719855334130688e-06, "loss": 1.4729, "step": 187100 }, { "epoch": 7.097361237488626, "grad_norm": 12.239509582519531, "learning_rate": 4.708620991514001e-06, "loss": 1.4728, "step": 187200 }, { "epoch": 7.101152562936003, "grad_norm": 11.604708671569824, "learning_rate": 4.69739591646283e-06, "loss": 1.4517, "step": 187300 }, { "epoch": 7.104943888383379, "grad_norm": 13.60051441192627, "learning_rate": 4.686180128637291e-06, "loss": 1.4774, "step": 187400 }, { "epoch": 7.108735213830755, "grad_norm": 12.121148109436035, "learning_rate": 4.674973647681231e-06, "loss": 1.4767, "step": 187500 }, { "epoch": 7.112526539278131, "grad_norm": 12.02375316619873, "learning_rate": 4.663776493222205e-06, "loss": 1.4553, "step": 187600 }, { "epoch": 7.116317864725508, "grad_norm": 12.208605766296387, "learning_rate": 4.652588684871412e-06, "loss": 1.4706, "step": 187700 }, { "epoch": 7.120109190172885, "grad_norm": 13.647366523742676, "learning_rate": 4.64141024222371e-06, "loss": 1.4405, "step": 187800 }, { "epoch": 7.123900515620261, "grad_norm": 11.577104568481445, "learning_rate": 4.630241184857536e-06, "loss": 1.4865, "step": 187900 }, { "epoch": 7.127691841067637, "grad_norm": 13.239177703857422, "learning_rate": 4.61908153233489e-06, "loss": 1.4475, "step": 188000 }, { "epoch": 7.131483166515014, "grad_norm": 11.769329071044922, "learning_rate": 4.607931304201302e-06, "loss": 1.4612, "step": 188100 }, { "epoch": 7.13527449196239, "grad_norm": 11.046523094177246, "learning_rate": 4.596790519985799e-06, "loss": 1.4802, "step": 188200 }, { "epoch": 7.139065817409766, "grad_norm": 11.291935920715332, "learning_rate": 4.585659199200864e-06, "loss": 1.4748, "step": 188300 }, { "epoch": 7.142857142857143, "grad_norm": 12.358168601989746, "learning_rate": 4.5745373613424075e-06, "loss": 1.4607, "step": 188400 }, { "epoch": 7.1466484683045195, "grad_norm": 13.51307201385498, "learning_rate": 4.5634250258897275e-06, "loss": 1.4676, "step": 188500 }, { "epoch": 7.150439793751896, "grad_norm": 10.760235786437988, "learning_rate": 4.552322212305483e-06, "loss": 1.4575, "step": 188600 }, { "epoch": 7.154231119199272, "grad_norm": 13.084484100341797, "learning_rate": 4.541228940035654e-06, "loss": 1.4659, "step": 188700 }, { "epoch": 7.158022444646648, "grad_norm": 11.566179275512695, "learning_rate": 4.530145228509511e-06, "loss": 1.465, "step": 188800 }, { "epoch": 7.1618137700940245, "grad_norm": 12.14118480682373, "learning_rate": 4.519071097139579e-06, "loss": 1.4528, "step": 188900 }, { "epoch": 7.165605095541402, "grad_norm": 12.82026481628418, "learning_rate": 4.508006565321599e-06, "loss": 1.4584, "step": 189000 }, { "epoch": 7.169396420988778, "grad_norm": 11.690791130065918, "learning_rate": 4.496951652434501e-06, "loss": 1.4495, "step": 189100 }, { "epoch": 7.173187746436154, "grad_norm": 12.744691848754883, "learning_rate": 4.485906377840379e-06, "loss": 1.459, "step": 189200 }, { "epoch": 7.1769790718835305, "grad_norm": 10.657965660095215, "learning_rate": 4.47487076088443e-06, "loss": 1.4746, "step": 189300 }, { "epoch": 7.180770397330907, "grad_norm": 11.708250045776367, "learning_rate": 4.463844820894944e-06, "loss": 1.4773, "step": 189400 }, { "epoch": 7.184561722778283, "grad_norm": 12.226489067077637, "learning_rate": 4.452828577183262e-06, "loss": 1.4488, "step": 189500 }, { "epoch": 7.188353048225659, "grad_norm": 11.205961227416992, "learning_rate": 4.4418220490437436e-06, "loss": 1.4745, "step": 189600 }, { "epoch": 7.192144373673036, "grad_norm": 12.943376541137695, "learning_rate": 4.430825255753728e-06, "loss": 1.4681, "step": 189700 }, { "epoch": 7.195935699120413, "grad_norm": 12.167372703552246, "learning_rate": 4.4198382165735075e-06, "loss": 1.4652, "step": 189800 }, { "epoch": 7.199727024567789, "grad_norm": 13.172160148620605, "learning_rate": 4.408860950746293e-06, "loss": 1.4137, "step": 189900 }, { "epoch": 7.203518350015165, "grad_norm": 10.23653507232666, "learning_rate": 4.397893477498165e-06, "loss": 1.4588, "step": 190000 }, { "epoch": 7.207309675462541, "grad_norm": 12.459962844848633, "learning_rate": 4.386935816038074e-06, "loss": 1.4434, "step": 190100 }, { "epoch": 7.211101000909918, "grad_norm": 12.446187019348145, "learning_rate": 4.375987985557771e-06, "loss": 1.4627, "step": 190200 }, { "epoch": 7.214892326357295, "grad_norm": 11.183820724487305, "learning_rate": 4.365050005231788e-06, "loss": 1.4519, "step": 190300 }, { "epoch": 7.218683651804671, "grad_norm": 12.674784660339355, "learning_rate": 4.354121894217411e-06, "loss": 1.4486, "step": 190400 }, { "epoch": 7.222474977252047, "grad_norm": 11.734650611877441, "learning_rate": 4.343203671654634e-06, "loss": 1.4764, "step": 190500 }, { "epoch": 7.226266302699424, "grad_norm": 11.69714641571045, "learning_rate": 4.332295356666149e-06, "loss": 1.4254, "step": 190600 }, { "epoch": 7.2300576281468, "grad_norm": 12.96596622467041, "learning_rate": 4.321396968357269e-06, "loss": 1.4635, "step": 190700 }, { "epoch": 7.233848953594176, "grad_norm": 12.446158409118652, "learning_rate": 4.310508525815939e-06, "loss": 1.4586, "step": 190800 }, { "epoch": 7.237640279041553, "grad_norm": 11.66193962097168, "learning_rate": 4.299630048112682e-06, "loss": 1.4356, "step": 190900 }, { "epoch": 7.2414316044889295, "grad_norm": 11.215117454528809, "learning_rate": 4.288761554300563e-06, "loss": 1.4571, "step": 191000 }, { "epoch": 7.245222929936306, "grad_norm": 11.736091613769531, "learning_rate": 4.277903063415167e-06, "loss": 1.4424, "step": 191100 }, { "epoch": 7.249014255383682, "grad_norm": 13.667712211608887, "learning_rate": 4.2670545944745574e-06, "loss": 1.441, "step": 191200 }, { "epoch": 7.252805580831058, "grad_norm": 11.600788116455078, "learning_rate": 4.256216166479238e-06, "loss": 1.4991, "step": 191300 }, { "epoch": 7.256596906278435, "grad_norm": 13.099230766296387, "learning_rate": 4.2453877984121315e-06, "loss": 1.4905, "step": 191400 }, { "epoch": 7.260388231725812, "grad_norm": 12.109108924865723, "learning_rate": 4.2345695092385485e-06, "loss": 1.4454, "step": 191500 }, { "epoch": 7.264179557173188, "grad_norm": 11.286263465881348, "learning_rate": 4.223761317906141e-06, "loss": 1.463, "step": 191600 }, { "epoch": 7.267970882620564, "grad_norm": 13.759344100952148, "learning_rate": 4.212963243344866e-06, "loss": 1.4454, "step": 191700 }, { "epoch": 7.2717622080679405, "grad_norm": 10.820724487304688, "learning_rate": 4.202175304466976e-06, "loss": 1.44, "step": 191800 }, { "epoch": 7.275553533515317, "grad_norm": 12.238749504089355, "learning_rate": 4.191397520166962e-06, "loss": 1.4828, "step": 191900 }, { "epoch": 7.279344858962693, "grad_norm": 12.918777465820312, "learning_rate": 4.180629909321534e-06, "loss": 1.4457, "step": 192000 }, { "epoch": 7.28313618441007, "grad_norm": 12.288901329040527, "learning_rate": 4.169872490789585e-06, "loss": 1.4696, "step": 192100 }, { "epoch": 7.286927509857446, "grad_norm": 11.18660831451416, "learning_rate": 4.159125283412155e-06, "loss": 1.4598, "step": 192200 }, { "epoch": 7.290718835304823, "grad_norm": 12.26436710357666, "learning_rate": 4.148388306012391e-06, "loss": 1.4715, "step": 192300 }, { "epoch": 7.294510160752199, "grad_norm": 11.755010604858398, "learning_rate": 4.13766157739554e-06, "loss": 1.4625, "step": 192400 }, { "epoch": 7.298301486199575, "grad_norm": 12.798155784606934, "learning_rate": 4.126945116348887e-06, "loss": 1.4657, "step": 192500 }, { "epoch": 7.3020928116469515, "grad_norm": 11.293261528015137, "learning_rate": 4.11623894164174e-06, "loss": 1.4354, "step": 192600 }, { "epoch": 7.305884137094328, "grad_norm": 11.86343765258789, "learning_rate": 4.105543072025381e-06, "loss": 1.4813, "step": 192700 }, { "epoch": 7.309675462541705, "grad_norm": 12.764716148376465, "learning_rate": 4.094857526233049e-06, "loss": 1.5038, "step": 192800 }, { "epoch": 7.313466787989081, "grad_norm": 11.530652046203613, "learning_rate": 4.084182322979915e-06, "loss": 1.4768, "step": 192900 }, { "epoch": 7.317258113436457, "grad_norm": 10.532978057861328, "learning_rate": 4.0735174809630095e-06, "loss": 1.4388, "step": 193000 }, { "epoch": 7.321049438883834, "grad_norm": 12.39688491821289, "learning_rate": 4.062863018861235e-06, "loss": 1.4617, "step": 193100 }, { "epoch": 7.32484076433121, "grad_norm": 13.243266105651855, "learning_rate": 4.0522189553353075e-06, "loss": 1.4585, "step": 193200 }, { "epoch": 7.328632089778587, "grad_norm": 13.385005950927734, "learning_rate": 4.041585309027731e-06, "loss": 1.4648, "step": 193300 }, { "epoch": 7.332423415225963, "grad_norm": 11.608242988586426, "learning_rate": 4.030962098562765e-06, "loss": 1.4676, "step": 193400 }, { "epoch": 7.33621474067334, "grad_norm": 11.500589370727539, "learning_rate": 4.02034934254639e-06, "loss": 1.4166, "step": 193500 }, { "epoch": 7.340006066120716, "grad_norm": 11.58963394165039, "learning_rate": 4.009747059566283e-06, "loss": 1.4559, "step": 193600 }, { "epoch": 7.343797391568092, "grad_norm": 12.849601745605469, "learning_rate": 3.999155268191758e-06, "loss": 1.4778, "step": 193700 }, { "epoch": 7.347588717015468, "grad_norm": 11.223885536193848, "learning_rate": 3.98857398697378e-06, "loss": 1.4729, "step": 193800 }, { "epoch": 7.351380042462845, "grad_norm": 11.647134780883789, "learning_rate": 3.978003234444893e-06, "loss": 1.4651, "step": 193900 }, { "epoch": 7.355171367910222, "grad_norm": 12.231060981750488, "learning_rate": 3.967443029119196e-06, "loss": 1.4433, "step": 194000 }, { "epoch": 7.358962693357598, "grad_norm": 11.437355041503906, "learning_rate": 3.95689338949232e-06, "loss": 1.4636, "step": 194100 }, { "epoch": 7.362754018804974, "grad_norm": 12.315284729003906, "learning_rate": 3.946354334041393e-06, "loss": 1.4552, "step": 194200 }, { "epoch": 7.3665453442523505, "grad_norm": 12.190777778625488, "learning_rate": 3.935825881225004e-06, "loss": 1.476, "step": 194300 }, { "epoch": 7.370336669699727, "grad_norm": 12.552740097045898, "learning_rate": 3.925308049483171e-06, "loss": 1.4708, "step": 194400 }, { "epoch": 7.374127995147103, "grad_norm": 11.692163467407227, "learning_rate": 3.914800857237308e-06, "loss": 1.4244, "step": 194500 }, { "epoch": 7.37791932059448, "grad_norm": 10.79149341583252, "learning_rate": 3.9043043228902e-06, "loss": 1.4457, "step": 194600 }, { "epoch": 7.3817106460418564, "grad_norm": 11.736242294311523, "learning_rate": 3.8938184648259596e-06, "loss": 1.4554, "step": 194700 }, { "epoch": 7.385501971489233, "grad_norm": 12.321134567260742, "learning_rate": 3.883343301410004e-06, "loss": 1.4522, "step": 194800 }, { "epoch": 7.389293296936609, "grad_norm": 12.858579635620117, "learning_rate": 3.87287885098902e-06, "loss": 1.4995, "step": 194900 }, { "epoch": 7.393084622383985, "grad_norm": 12.626704216003418, "learning_rate": 3.862425131890925e-06, "loss": 1.4321, "step": 195000 }, { "epoch": 7.3968759478313615, "grad_norm": 11.76616096496582, "learning_rate": 3.851982162424842e-06, "loss": 1.4646, "step": 195100 }, { "epoch": 7.400667273278739, "grad_norm": 13.024399757385254, "learning_rate": 3.841549960881083e-06, "loss": 1.4658, "step": 195200 }, { "epoch": 7.404458598726115, "grad_norm": 12.576539039611816, "learning_rate": 3.831128545531076e-06, "loss": 1.445, "step": 195300 }, { "epoch": 7.408249924173491, "grad_norm": 11.101597785949707, "learning_rate": 3.8207179346273725e-06, "loss": 1.4702, "step": 195400 }, { "epoch": 7.412041249620867, "grad_norm": 11.0336332321167, "learning_rate": 3.810318146403598e-06, "loss": 1.4689, "step": 195500 }, { "epoch": 7.415832575068244, "grad_norm": 12.612707138061523, "learning_rate": 3.799929199074419e-06, "loss": 1.4718, "step": 195600 }, { "epoch": 7.41962390051562, "grad_norm": 13.329168319702148, "learning_rate": 3.789551110835521e-06, "loss": 1.4483, "step": 195700 }, { "epoch": 7.423415225962997, "grad_norm": 11.64931869506836, "learning_rate": 3.7791838998635657e-06, "loss": 1.4415, "step": 195800 }, { "epoch": 7.427206551410373, "grad_norm": 11.911974906921387, "learning_rate": 3.768827584316164e-06, "loss": 1.4607, "step": 195900 }, { "epoch": 7.43099787685775, "grad_norm": 12.335108757019043, "learning_rate": 3.7584821823318464e-06, "loss": 1.4571, "step": 196000 }, { "epoch": 7.434789202305126, "grad_norm": 10.682324409484863, "learning_rate": 3.748147712030027e-06, "loss": 1.4691, "step": 196100 }, { "epoch": 7.438580527752502, "grad_norm": 11.649224281311035, "learning_rate": 3.737824191510979e-06, "loss": 1.4834, "step": 196200 }, { "epoch": 7.442371853199878, "grad_norm": 12.008169174194336, "learning_rate": 3.727511638855784e-06, "loss": 1.4485, "step": 196300 }, { "epoch": 7.4461631786472555, "grad_norm": 12.858367919921875, "learning_rate": 3.717210072126324e-06, "loss": 1.4901, "step": 196400 }, { "epoch": 7.449954504094632, "grad_norm": 14.653989791870117, "learning_rate": 3.7069195093652486e-06, "loss": 1.4502, "step": 196500 }, { "epoch": 7.453745829542008, "grad_norm": 12.428114891052246, "learning_rate": 3.6966399685959144e-06, "loss": 1.4792, "step": 196600 }, { "epoch": 7.457537154989384, "grad_norm": 12.44163990020752, "learning_rate": 3.686371467822387e-06, "loss": 1.471, "step": 196700 }, { "epoch": 7.461328480436761, "grad_norm": 11.379399299621582, "learning_rate": 3.6761140250293935e-06, "loss": 1.4728, "step": 196800 }, { "epoch": 7.465119805884137, "grad_norm": 11.300498008728027, "learning_rate": 3.6658676581822926e-06, "loss": 1.4531, "step": 196900 }, { "epoch": 7.468911131331513, "grad_norm": 10.958044052124023, "learning_rate": 3.6556323852270436e-06, "loss": 1.4383, "step": 197000 }, { "epoch": 7.47270245677889, "grad_norm": 11.524394989013672, "learning_rate": 3.6454082240901777e-06, "loss": 1.4782, "step": 197100 }, { "epoch": 7.4764937822262665, "grad_norm": 11.287458419799805, "learning_rate": 3.635195192678764e-06, "loss": 1.4512, "step": 197200 }, { "epoch": 7.480285107673643, "grad_norm": 11.350493431091309, "learning_rate": 3.624993308880368e-06, "loss": 1.4658, "step": 197300 }, { "epoch": 7.484076433121019, "grad_norm": 11.665024757385254, "learning_rate": 3.614802590563051e-06, "loss": 1.45, "step": 197400 }, { "epoch": 7.487867758568395, "grad_norm": 11.237317085266113, "learning_rate": 3.604623055575307e-06, "loss": 1.4519, "step": 197500 }, { "epoch": 7.4916590840157715, "grad_norm": 11.03515625, "learning_rate": 3.594454721746037e-06, "loss": 1.4468, "step": 197600 }, { "epoch": 7.495450409463149, "grad_norm": 11.224727630615234, "learning_rate": 3.5842976068845325e-06, "loss": 1.4514, "step": 197700 }, { "epoch": 7.499241734910525, "grad_norm": 11.740256309509277, "learning_rate": 3.5741517287804327e-06, "loss": 1.4677, "step": 197800 }, { "epoch": 7.503033060357901, "grad_norm": 10.91279125213623, "learning_rate": 3.5640171052037044e-06, "loss": 1.4453, "step": 197900 }, { "epoch": 7.5068243858052774, "grad_norm": 11.92087173461914, "learning_rate": 3.5538937539045872e-06, "loss": 1.4741, "step": 198000 }, { "epoch": 7.510615711252654, "grad_norm": 12.371010780334473, "learning_rate": 3.543781692613588e-06, "loss": 1.4721, "step": 198100 }, { "epoch": 7.51440703670003, "grad_norm": 13.100205421447754, "learning_rate": 3.5336809390414383e-06, "loss": 1.469, "step": 198200 }, { "epoch": 7.518198362147407, "grad_norm": 9.989629745483398, "learning_rate": 3.523591510879065e-06, "loss": 1.4507, "step": 198300 }, { "epoch": 7.521989687594783, "grad_norm": 12.098480224609375, "learning_rate": 3.513513425797558e-06, "loss": 1.4812, "step": 198400 }, { "epoch": 7.52578101304216, "grad_norm": 12.726118087768555, "learning_rate": 3.503446701448143e-06, "loss": 1.4388, "step": 198500 }, { "epoch": 7.529572338489536, "grad_norm": 11.969866752624512, "learning_rate": 3.493391355462141e-06, "loss": 1.4715, "step": 198600 }, { "epoch": 7.533363663936912, "grad_norm": 14.494378089904785, "learning_rate": 3.4833474054509477e-06, "loss": 1.4163, "step": 198700 }, { "epoch": 7.537154989384288, "grad_norm": 13.008855819702148, "learning_rate": 3.473314869006009e-06, "loss": 1.4666, "step": 198800 }, { "epoch": 7.540946314831665, "grad_norm": 12.787714004516602, "learning_rate": 3.463293763698772e-06, "loss": 1.4654, "step": 198900 }, { "epoch": 7.544737640279042, "grad_norm": 11.005511283874512, "learning_rate": 3.4532841070806554e-06, "loss": 1.4634, "step": 199000 }, { "epoch": 7.548528965726418, "grad_norm": 11.839588165283203, "learning_rate": 3.443285916683039e-06, "loss": 1.445, "step": 199100 }, { "epoch": 7.552320291173794, "grad_norm": 11.886452674865723, "learning_rate": 3.4332992100172135e-06, "loss": 1.436, "step": 199200 }, { "epoch": 7.556111616621171, "grad_norm": 11.938136100769043, "learning_rate": 3.42332400457436e-06, "loss": 1.4673, "step": 199300 }, { "epoch": 7.559902942068547, "grad_norm": 13.715043067932129, "learning_rate": 3.4133603178255125e-06, "loss": 1.4812, "step": 199400 }, { "epoch": 7.563694267515924, "grad_norm": 12.895392417907715, "learning_rate": 3.403408167221536e-06, "loss": 1.4655, "step": 199500 }, { "epoch": 7.5674855929633, "grad_norm": 11.878474235534668, "learning_rate": 3.3934675701930753e-06, "loss": 1.4672, "step": 199600 }, { "epoch": 7.5712769184106765, "grad_norm": 13.690333366394043, "learning_rate": 3.383538544150563e-06, "loss": 1.4513, "step": 199700 }, { "epoch": 7.575068243858053, "grad_norm": 13.036225318908691, "learning_rate": 3.3736211064841495e-06, "loss": 1.4689, "step": 199800 }, { "epoch": 7.578859569305429, "grad_norm": 11.705101013183594, "learning_rate": 3.363715274563696e-06, "loss": 1.4679, "step": 199900 }, { "epoch": 7.582650894752805, "grad_norm": 11.910470008850098, "learning_rate": 3.3538210657387284e-06, "loss": 1.4638, "step": 200000 }, { "epoch": 7.5864422202001816, "grad_norm": 12.77523136138916, "learning_rate": 3.3439384973384194e-06, "loss": 1.4821, "step": 200100 }, { "epoch": 7.590233545647559, "grad_norm": 13.141159057617188, "learning_rate": 3.3340675866715653e-06, "loss": 1.4664, "step": 200200 }, { "epoch": 7.594024871094935, "grad_norm": 12.058465957641602, "learning_rate": 3.3242083510265276e-06, "loss": 1.4771, "step": 200300 }, { "epoch": 7.597816196542311, "grad_norm": 12.679981231689453, "learning_rate": 3.3143608076712277e-06, "loss": 1.4687, "step": 200400 }, { "epoch": 7.6016075219896875, "grad_norm": 11.648070335388184, "learning_rate": 3.304524973853108e-06, "loss": 1.4576, "step": 200500 }, { "epoch": 7.605398847437064, "grad_norm": 13.72588062286377, "learning_rate": 3.294700866799103e-06, "loss": 1.4896, "step": 200600 }, { "epoch": 7.609190172884441, "grad_norm": 10.427191734313965, "learning_rate": 3.2848885037156073e-06, "loss": 1.4659, "step": 200700 }, { "epoch": 7.612981498331817, "grad_norm": 13.418951034545898, "learning_rate": 3.275087901788445e-06, "loss": 1.4799, "step": 200800 }, { "epoch": 7.616772823779193, "grad_norm": 13.088275909423828, "learning_rate": 3.2652990781828487e-06, "loss": 1.4692, "step": 200900 }, { "epoch": 7.62056414922657, "grad_norm": 12.289769172668457, "learning_rate": 3.255522050043404e-06, "loss": 1.4576, "step": 201000 }, { "epoch": 7.624355474673946, "grad_norm": 13.83558464050293, "learning_rate": 3.2457568344940606e-06, "loss": 1.4658, "step": 201100 }, { "epoch": 7.628146800121322, "grad_norm": 13.160947799682617, "learning_rate": 3.2360034486380664e-06, "loss": 1.4614, "step": 201200 }, { "epoch": 7.631938125568698, "grad_norm": 11.460283279418945, "learning_rate": 3.226261909557947e-06, "loss": 1.4625, "step": 201300 }, { "epoch": 7.635729451016076, "grad_norm": 11.974932670593262, "learning_rate": 3.2165322343154858e-06, "loss": 1.4783, "step": 201400 }, { "epoch": 7.639520776463452, "grad_norm": 11.823965072631836, "learning_rate": 3.2068144399516863e-06, "loss": 1.461, "step": 201500 }, { "epoch": 7.643312101910828, "grad_norm": 12.76525592803955, "learning_rate": 3.197108543486741e-06, "loss": 1.4565, "step": 201600 }, { "epoch": 7.647103427358204, "grad_norm": 12.268322944641113, "learning_rate": 3.187414561920007e-06, "loss": 1.4605, "step": 201700 }, { "epoch": 7.650894752805581, "grad_norm": 12.149821281433105, "learning_rate": 3.1777325122299685e-06, "loss": 1.446, "step": 201800 }, { "epoch": 7.654686078252957, "grad_norm": 12.823575973510742, "learning_rate": 3.1680624113742165e-06, "loss": 1.4535, "step": 201900 }, { "epoch": 7.658477403700333, "grad_norm": 11.837114334106445, "learning_rate": 3.1584042762894116e-06, "loss": 1.4661, "step": 202000 }, { "epoch": 7.66226872914771, "grad_norm": 12.966864585876465, "learning_rate": 3.1487581238912566e-06, "loss": 1.4436, "step": 202100 }, { "epoch": 7.6660600545950865, "grad_norm": 13.026566505432129, "learning_rate": 3.1391239710744736e-06, "loss": 1.4455, "step": 202200 }, { "epoch": 7.669851380042463, "grad_norm": 11.392702102661133, "learning_rate": 3.129501834712755e-06, "loss": 1.4667, "step": 202300 }, { "epoch": 7.673642705489839, "grad_norm": 12.67297649383545, "learning_rate": 3.119891731658753e-06, "loss": 1.4713, "step": 202400 }, { "epoch": 7.677434030937215, "grad_norm": 12.5248441696167, "learning_rate": 3.1102936787440586e-06, "loss": 1.4512, "step": 202500 }, { "epoch": 7.6812253563845925, "grad_norm": 13.357857704162598, "learning_rate": 3.1007076927791324e-06, "loss": 1.5138, "step": 202600 }, { "epoch": 7.685016681831969, "grad_norm": 11.70677375793457, "learning_rate": 3.091133790553319e-06, "loss": 1.4579, "step": 202700 }, { "epoch": 7.688808007279345, "grad_norm": 11.800013542175293, "learning_rate": 3.0815719888347894e-06, "loss": 1.4365, "step": 202800 }, { "epoch": 7.692599332726721, "grad_norm": 11.036773681640625, "learning_rate": 3.0720223043705278e-06, "loss": 1.4994, "step": 202900 }, { "epoch": 7.6963906581740975, "grad_norm": 11.164999008178711, "learning_rate": 3.062484753886291e-06, "loss": 1.4884, "step": 203000 }, { "epoch": 7.700181983621474, "grad_norm": 12.471749305725098, "learning_rate": 3.0529593540865854e-06, "loss": 1.467, "step": 203100 }, { "epoch": 7.70397330906885, "grad_norm": 12.733320236206055, "learning_rate": 3.043446121654635e-06, "loss": 1.4958, "step": 203200 }, { "epoch": 7.707764634516227, "grad_norm": 12.760834693908691, "learning_rate": 3.0339450732523567e-06, "loss": 1.4359, "step": 203300 }, { "epoch": 7.711555959963603, "grad_norm": 11.818862915039062, "learning_rate": 3.0244562255203234e-06, "loss": 1.4446, "step": 203400 }, { "epoch": 7.71534728541098, "grad_norm": 12.18575382232666, "learning_rate": 3.014979595077745e-06, "loss": 1.4501, "step": 203500 }, { "epoch": 7.719138610858356, "grad_norm": 11.294827461242676, "learning_rate": 3.005515198522423e-06, "loss": 1.4595, "step": 203600 }, { "epoch": 7.722929936305732, "grad_norm": 12.619243621826172, "learning_rate": 2.996063052430739e-06, "loss": 1.4602, "step": 203700 }, { "epoch": 7.726721261753109, "grad_norm": 10.504117012023926, "learning_rate": 2.9866231733576245e-06, "loss": 1.4866, "step": 203800 }, { "epoch": 7.730512587200486, "grad_norm": 12.339829444885254, "learning_rate": 2.977195577836519e-06, "loss": 1.4606, "step": 203900 }, { "epoch": 7.734303912647862, "grad_norm": 12.702207565307617, "learning_rate": 2.9677802823793433e-06, "loss": 1.4787, "step": 204000 }, { "epoch": 7.738095238095238, "grad_norm": 12.178668022155762, "learning_rate": 2.958377303476483e-06, "loss": 1.4736, "step": 204100 }, { "epoch": 7.741886563542614, "grad_norm": 13.135882377624512, "learning_rate": 2.9489866575967485e-06, "loss": 1.456, "step": 204200 }, { "epoch": 7.745677888989991, "grad_norm": 13.023058891296387, "learning_rate": 2.9396083611873526e-06, "loss": 1.4411, "step": 204300 }, { "epoch": 7.749469214437367, "grad_norm": 12.61095142364502, "learning_rate": 2.930242430673875e-06, "loss": 1.4356, "step": 204400 }, { "epoch": 7.753260539884744, "grad_norm": 12.568473815917969, "learning_rate": 2.9208888824602434e-06, "loss": 1.4476, "step": 204500 }, { "epoch": 7.75705186533212, "grad_norm": 11.614837646484375, "learning_rate": 2.9115477329286835e-06, "loss": 1.4733, "step": 204600 }, { "epoch": 7.760843190779497, "grad_norm": 12.06226921081543, "learning_rate": 2.9022189984397266e-06, "loss": 1.4704, "step": 204700 }, { "epoch": 7.764634516226873, "grad_norm": 13.799254417419434, "learning_rate": 2.8929026953321436e-06, "loss": 1.4636, "step": 204800 }, { "epoch": 7.768425841674249, "grad_norm": 11.712929725646973, "learning_rate": 2.883598839922943e-06, "loss": 1.4631, "step": 204900 }, { "epoch": 7.772217167121625, "grad_norm": 10.375909805297852, "learning_rate": 2.8743074485073207e-06, "loss": 1.4484, "step": 205000 }, { "epoch": 7.7760084925690025, "grad_norm": 11.485528945922852, "learning_rate": 2.8650285373586475e-06, "loss": 1.4467, "step": 205100 }, { "epoch": 7.779799818016379, "grad_norm": 12.066153526306152, "learning_rate": 2.855762122728447e-06, "loss": 1.4949, "step": 205200 }, { "epoch": 7.783591143463755, "grad_norm": 12.087640762329102, "learning_rate": 2.8465082208463358e-06, "loss": 1.4288, "step": 205300 }, { "epoch": 7.787382468911131, "grad_norm": 13.033794403076172, "learning_rate": 2.8372668479200273e-06, "loss": 1.4575, "step": 205400 }, { "epoch": 7.7911737943585075, "grad_norm": 11.785131454467773, "learning_rate": 2.8280380201352908e-06, "loss": 1.4455, "step": 205500 }, { "epoch": 7.794965119805884, "grad_norm": 13.766409873962402, "learning_rate": 2.818821753655919e-06, "loss": 1.4894, "step": 205600 }, { "epoch": 7.798756445253261, "grad_norm": 11.996438980102539, "learning_rate": 2.8096180646237094e-06, "loss": 1.4709, "step": 205700 }, { "epoch": 7.802547770700637, "grad_norm": 10.94782543182373, "learning_rate": 2.800426969158425e-06, "loss": 1.472, "step": 205800 }, { "epoch": 7.8063390961480135, "grad_norm": 11.958955764770508, "learning_rate": 2.791248483357779e-06, "loss": 1.4607, "step": 205900 }, { "epoch": 7.81013042159539, "grad_norm": 12.694316864013672, "learning_rate": 2.7820826232973875e-06, "loss": 1.4743, "step": 206000 }, { "epoch": 7.813921747042766, "grad_norm": 12.337864875793457, "learning_rate": 2.7729294050307685e-06, "loss": 1.4592, "step": 206100 }, { "epoch": 7.817713072490142, "grad_norm": 12.94625186920166, "learning_rate": 2.7637888445892913e-06, "loss": 1.4595, "step": 206200 }, { "epoch": 7.8215043979375185, "grad_norm": 12.249018669128418, "learning_rate": 2.7546609579821493e-06, "loss": 1.4623, "step": 206300 }, { "epoch": 7.825295723384896, "grad_norm": 12.474944114685059, "learning_rate": 2.7455457611963487e-06, "loss": 1.4479, "step": 206400 }, { "epoch": 7.829087048832272, "grad_norm": 12.30129337310791, "learning_rate": 2.7364432701966648e-06, "loss": 1.4527, "step": 206500 }, { "epoch": 7.832878374279648, "grad_norm": 12.303889274597168, "learning_rate": 2.7273535009256213e-06, "loss": 1.4389, "step": 206600 }, { "epoch": 7.836669699727024, "grad_norm": 12.05512523651123, "learning_rate": 2.7182764693034593e-06, "loss": 1.4758, "step": 206700 }, { "epoch": 7.840461025174401, "grad_norm": 11.939696311950684, "learning_rate": 2.709212191228112e-06, "loss": 1.45, "step": 206800 }, { "epoch": 7.844252350621778, "grad_norm": 11.555091857910156, "learning_rate": 2.700160682575174e-06, "loss": 1.4419, "step": 206900 }, { "epoch": 7.848043676069154, "grad_norm": 11.523796081542969, "learning_rate": 2.691121959197874e-06, "loss": 1.4435, "step": 207000 }, { "epoch": 7.85183500151653, "grad_norm": 11.20706844329834, "learning_rate": 2.682096036927053e-06, "loss": 1.459, "step": 207100 }, { "epoch": 7.855626326963907, "grad_norm": 11.983468055725098, "learning_rate": 2.6730829315711284e-06, "loss": 1.453, "step": 207200 }, { "epoch": 7.859417652411283, "grad_norm": 12.764911651611328, "learning_rate": 2.6640826589160628e-06, "loss": 1.4468, "step": 207300 }, { "epoch": 7.863208977858659, "grad_norm": 12.270650863647461, "learning_rate": 2.6550952347253512e-06, "loss": 1.4666, "step": 207400 }, { "epoch": 7.867000303306035, "grad_norm": 12.31965446472168, "learning_rate": 2.64612067473999e-06, "loss": 1.4498, "step": 207500 }, { "epoch": 7.8707916287534125, "grad_norm": 10.307977676391602, "learning_rate": 2.6371589946784315e-06, "loss": 1.4335, "step": 207600 }, { "epoch": 7.874582954200789, "grad_norm": 13.750533103942871, "learning_rate": 2.628210210236577e-06, "loss": 1.4767, "step": 207700 }, { "epoch": 7.878374279648165, "grad_norm": 11.502288818359375, "learning_rate": 2.6192743370877416e-06, "loss": 1.4417, "step": 207800 }, { "epoch": 7.882165605095541, "grad_norm": 11.2874174118042, "learning_rate": 2.610351390882626e-06, "loss": 1.4431, "step": 207900 }, { "epoch": 7.885956930542918, "grad_norm": 11.436494827270508, "learning_rate": 2.6014413872492904e-06, "loss": 1.467, "step": 208000 }, { "epoch": 7.889748255990294, "grad_norm": 12.1028470993042, "learning_rate": 2.592544341793126e-06, "loss": 1.4595, "step": 208100 }, { "epoch": 7.893539581437671, "grad_norm": 11.925895690917969, "learning_rate": 2.583660270096833e-06, "loss": 1.4617, "step": 208200 }, { "epoch": 7.897330906885047, "grad_norm": 12.269451141357422, "learning_rate": 2.5747891877203733e-06, "loss": 1.4509, "step": 208300 }, { "epoch": 7.9011222323324235, "grad_norm": 13.169232368469238, "learning_rate": 2.565931110200983e-06, "loss": 1.4349, "step": 208400 }, { "epoch": 7.9049135577798, "grad_norm": 12.367587089538574, "learning_rate": 2.557086053053105e-06, "loss": 1.4588, "step": 208500 }, { "epoch": 7.908704883227176, "grad_norm": 11.855305671691895, "learning_rate": 2.5482540317683766e-06, "loss": 1.4636, "step": 208600 }, { "epoch": 7.912496208674552, "grad_norm": 12.490200996398926, "learning_rate": 2.5394350618156095e-06, "loss": 1.4363, "step": 208700 }, { "epoch": 7.916287534121929, "grad_norm": 12.725021362304688, "learning_rate": 2.530629158640755e-06, "loss": 1.4633, "step": 208800 }, { "epoch": 7.920078859569306, "grad_norm": 12.82268238067627, "learning_rate": 2.5218363376668798e-06, "loss": 1.4509, "step": 208900 }, { "epoch": 7.923870185016682, "grad_norm": 11.277029037475586, "learning_rate": 2.513056614294135e-06, "loss": 1.4636, "step": 209000 }, { "epoch": 7.927661510464058, "grad_norm": 13.22074031829834, "learning_rate": 2.5042900038997342e-06, "loss": 1.4483, "step": 209100 }, { "epoch": 7.9314528359114345, "grad_norm": 13.798327445983887, "learning_rate": 2.4955365218379234e-06, "loss": 1.4509, "step": 209200 }, { "epoch": 7.935244161358811, "grad_norm": 11.378777503967285, "learning_rate": 2.4867961834399556e-06, "loss": 1.4517, "step": 209300 }, { "epoch": 7.939035486806187, "grad_norm": 11.003828048706055, "learning_rate": 2.478069004014063e-06, "loss": 1.4477, "step": 209400 }, { "epoch": 7.942826812253564, "grad_norm": 12.217031478881836, "learning_rate": 2.4693549988454334e-06, "loss": 1.4802, "step": 209500 }, { "epoch": 7.94661813770094, "grad_norm": 11.81205940246582, "learning_rate": 2.4606541831961715e-06, "loss": 1.4536, "step": 209600 }, { "epoch": 7.950409463148317, "grad_norm": 11.06438159942627, "learning_rate": 2.4519665723052875e-06, "loss": 1.4421, "step": 209700 }, { "epoch": 7.954200788595693, "grad_norm": 12.169476509094238, "learning_rate": 2.4432921813886745e-06, "loss": 1.471, "step": 209800 }, { "epoch": 7.957992114043069, "grad_norm": 12.946663856506348, "learning_rate": 2.4346310256390514e-06, "loss": 1.4934, "step": 209900 }, { "epoch": 7.961783439490446, "grad_norm": 13.840192794799805, "learning_rate": 2.4259831202259697e-06, "loss": 1.4781, "step": 210000 }, { "epoch": 7.965574764937823, "grad_norm": 11.183977127075195, "learning_rate": 2.4173484802957693e-06, "loss": 1.4515, "step": 210100 }, { "epoch": 7.969366090385199, "grad_norm": 12.332466125488281, "learning_rate": 2.408727120971558e-06, "loss": 1.4381, "step": 210200 }, { "epoch": 7.973157415832575, "grad_norm": 13.181252479553223, "learning_rate": 2.4001190573531828e-06, "loss": 1.4542, "step": 210300 }, { "epoch": 7.976948741279951, "grad_norm": 13.109928131103516, "learning_rate": 2.3915243045172034e-06, "loss": 1.441, "step": 210400 }, { "epoch": 7.980740066727328, "grad_norm": 12.629493713378906, "learning_rate": 2.382942877516866e-06, "loss": 1.4917, "step": 210500 }, { "epoch": 7.984531392174704, "grad_norm": 12.907447814941406, "learning_rate": 2.374374791382079e-06, "loss": 1.4658, "step": 210600 }, { "epoch": 7.988322717622081, "grad_norm": 13.988858222961426, "learning_rate": 2.3658200611193847e-06, "loss": 1.4997, "step": 210700 }, { "epoch": 7.992114043069457, "grad_norm": 11.742740631103516, "learning_rate": 2.3572787017119346e-06, "loss": 1.4501, "step": 210800 }, { "epoch": 7.9959053685168335, "grad_norm": 12.537259101867676, "learning_rate": 2.3487507281194544e-06, "loss": 1.467, "step": 210900 }, { "epoch": 7.99969669396421, "grad_norm": 11.705291748046875, "learning_rate": 2.340236155278234e-06, "loss": 1.4574, "step": 211000 }, { "epoch": 8.0, "eval_accuracy": 0.5281188686577633, "eval_loss": 2.1245110034942627, "eval_runtime": 946.4545, "eval_samples_per_second": 891.775, "eval_steps_per_second": 6.967, "step": 211008 }, { "epoch": 8.003488019411586, "grad_norm": 13.449438095092773, "learning_rate": 2.3317349981010863e-06, "loss": 1.3835, "step": 211100 }, { "epoch": 8.007279344858963, "grad_norm": 11.56676197052002, "learning_rate": 2.323247271477339e-06, "loss": 1.411, "step": 211200 }, { "epoch": 8.011070670306339, "grad_norm": 12.162264823913574, "learning_rate": 2.31477299027278e-06, "loss": 1.3833, "step": 211300 }, { "epoch": 8.014861995753716, "grad_norm": 13.021665573120117, "learning_rate": 2.3063121693296587e-06, "loss": 1.3864, "step": 211400 }, { "epoch": 8.018653321201091, "grad_norm": 12.135092735290527, "learning_rate": 2.297864823466649e-06, "loss": 1.3864, "step": 211500 }, { "epoch": 8.022444646648468, "grad_norm": 13.025875091552734, "learning_rate": 2.2894309674788216e-06, "loss": 1.3855, "step": 211600 }, { "epoch": 8.026235972095845, "grad_norm": 12.053659439086914, "learning_rate": 2.2810106161376223e-06, "loss": 1.3834, "step": 211700 }, { "epoch": 8.03002729754322, "grad_norm": 12.010221481323242, "learning_rate": 2.2726037841908444e-06, "loss": 1.3783, "step": 211800 }, { "epoch": 8.033818622990598, "grad_norm": 12.158656120300293, "learning_rate": 2.2642104863625945e-06, "loss": 1.3899, "step": 211900 }, { "epoch": 8.037609948437973, "grad_norm": 11.787154197692871, "learning_rate": 2.2558307373532895e-06, "loss": 1.3802, "step": 212000 }, { "epoch": 8.04140127388535, "grad_norm": 11.579230308532715, "learning_rate": 2.2474645518396065e-06, "loss": 1.3892, "step": 212100 }, { "epoch": 8.045192599332728, "grad_norm": 10.911691665649414, "learning_rate": 2.2391119444744714e-06, "loss": 1.4067, "step": 212200 }, { "epoch": 8.048983924780103, "grad_norm": 12.764824867248535, "learning_rate": 2.2307729298870208e-06, "loss": 1.3917, "step": 212300 }, { "epoch": 8.05277525022748, "grad_norm": 11.58056640625, "learning_rate": 2.2224475226825903e-06, "loss": 1.3638, "step": 212400 }, { "epoch": 8.056566575674855, "grad_norm": 12.390571594238281, "learning_rate": 2.214135737442692e-06, "loss": 1.4143, "step": 212500 }, { "epoch": 8.060357901122233, "grad_norm": 11.440682411193848, "learning_rate": 2.2058375887249595e-06, "loss": 1.3952, "step": 212600 }, { "epoch": 8.064149226569608, "grad_norm": 11.579509735107422, "learning_rate": 2.1975530910631583e-06, "loss": 1.4069, "step": 212700 }, { "epoch": 8.067940552016985, "grad_norm": 10.284637451171875, "learning_rate": 2.189282258967139e-06, "loss": 1.371, "step": 212800 }, { "epoch": 8.071731877464362, "grad_norm": 12.285242080688477, "learning_rate": 2.1810251069228193e-06, "loss": 1.3479, "step": 212900 }, { "epoch": 8.075523202911738, "grad_norm": 12.233055114746094, "learning_rate": 2.172781649392157e-06, "loss": 1.4043, "step": 213000 }, { "epoch": 8.079314528359115, "grad_norm": 13.59484577178955, "learning_rate": 2.1645519008131245e-06, "loss": 1.3935, "step": 213100 }, { "epoch": 8.08310585380649, "grad_norm": 12.624910354614258, "learning_rate": 2.1563358755996843e-06, "loss": 1.3926, "step": 213200 }, { "epoch": 8.086897179253867, "grad_norm": 11.246472358703613, "learning_rate": 2.1481335881417553e-06, "loss": 1.3932, "step": 213300 }, { "epoch": 8.090688504701243, "grad_norm": 11.74521255493164, "learning_rate": 2.139945052805211e-06, "loss": 1.3915, "step": 213400 }, { "epoch": 8.09447983014862, "grad_norm": 11.420463562011719, "learning_rate": 2.13177028393183e-06, "loss": 1.3864, "step": 213500 }, { "epoch": 8.098271155595997, "grad_norm": 12.733565330505371, "learning_rate": 2.1236092958392752e-06, "loss": 1.3927, "step": 213600 }, { "epoch": 8.102062481043372, "grad_norm": 11.340044975280762, "learning_rate": 2.1154621028210797e-06, "loss": 1.3866, "step": 213700 }, { "epoch": 8.10585380649075, "grad_norm": 11.650097846984863, "learning_rate": 2.107328719146613e-06, "loss": 1.4016, "step": 213800 }, { "epoch": 8.109645131938125, "grad_norm": 11.602895736694336, "learning_rate": 2.099209159061061e-06, "loss": 1.3689, "step": 213900 }, { "epoch": 8.113436457385502, "grad_norm": 12.877341270446777, "learning_rate": 2.0911034367853935e-06, "loss": 1.3857, "step": 214000 }, { "epoch": 8.11722778283288, "grad_norm": 13.356156349182129, "learning_rate": 2.0830115665163488e-06, "loss": 1.398, "step": 214100 }, { "epoch": 8.121019108280255, "grad_norm": 11.071208000183105, "learning_rate": 2.0749335624264e-06, "loss": 1.369, "step": 214200 }, { "epoch": 8.124810433727632, "grad_norm": 12.332612037658691, "learning_rate": 2.0668694386637388e-06, "loss": 1.4116, "step": 214300 }, { "epoch": 8.128601759175007, "grad_norm": 12.739418983459473, "learning_rate": 2.058819209352243e-06, "loss": 1.3959, "step": 214400 }, { "epoch": 8.132393084622384, "grad_norm": 12.981528282165527, "learning_rate": 2.0507828885914595e-06, "loss": 1.338, "step": 214500 }, { "epoch": 8.13618441006976, "grad_norm": 11.981734275817871, "learning_rate": 2.0427604904565632e-06, "loss": 1.3809, "step": 214600 }, { "epoch": 8.139975735517137, "grad_norm": 13.717453956604004, "learning_rate": 2.034752028998356e-06, "loss": 1.4141, "step": 214700 }, { "epoch": 8.143767060964514, "grad_norm": 11.75163745880127, "learning_rate": 2.026757518243234e-06, "loss": 1.3956, "step": 214800 }, { "epoch": 8.14755838641189, "grad_norm": 11.8756685256958, "learning_rate": 2.0187769721931425e-06, "loss": 1.4011, "step": 214900 }, { "epoch": 8.151349711859266, "grad_norm": 12.344826698303223, "learning_rate": 2.010810404825585e-06, "loss": 1.3849, "step": 215000 }, { "epoch": 8.155141037306642, "grad_norm": 11.921911239624023, "learning_rate": 2.0028578300935718e-06, "loss": 1.3617, "step": 215100 }, { "epoch": 8.158932362754019, "grad_norm": 10.301591873168945, "learning_rate": 1.994919261925611e-06, "loss": 1.4046, "step": 215200 }, { "epoch": 8.162723688201396, "grad_norm": 12.922263145446777, "learning_rate": 1.986994714225676e-06, "loss": 1.3716, "step": 215300 }, { "epoch": 8.166515013648771, "grad_norm": 11.925300598144531, "learning_rate": 1.979084200873189e-06, "loss": 1.3896, "step": 215400 }, { "epoch": 8.170306339096149, "grad_norm": 12.997511863708496, "learning_rate": 1.971187735722987e-06, "loss": 1.3814, "step": 215500 }, { "epoch": 8.174097664543524, "grad_norm": 13.62182331085205, "learning_rate": 1.963305332605299e-06, "loss": 1.3906, "step": 215600 }, { "epoch": 8.177888989990901, "grad_norm": 12.422194480895996, "learning_rate": 1.9554370053257352e-06, "loss": 1.4075, "step": 215700 }, { "epoch": 8.181680315438276, "grad_norm": 11.846393585205078, "learning_rate": 1.9475827676652483e-06, "loss": 1.3953, "step": 215800 }, { "epoch": 8.185471640885654, "grad_norm": 12.962081909179688, "learning_rate": 1.9397426333801084e-06, "loss": 1.3964, "step": 215900 }, { "epoch": 8.18926296633303, "grad_norm": 12.63536548614502, "learning_rate": 1.9319166162018897e-06, "loss": 1.3829, "step": 216000 }, { "epoch": 8.193054291780406, "grad_norm": 11.416707992553711, "learning_rate": 1.9241047298374382e-06, "loss": 1.3877, "step": 216100 }, { "epoch": 8.196845617227783, "grad_norm": 11.846955299377441, "learning_rate": 1.916306987968861e-06, "loss": 1.3836, "step": 216200 }, { "epoch": 8.200636942675159, "grad_norm": 13.573320388793945, "learning_rate": 1.908523404253474e-06, "loss": 1.3982, "step": 216300 }, { "epoch": 8.204428268122536, "grad_norm": 11.605878829956055, "learning_rate": 1.9007539923238083e-06, "loss": 1.4133, "step": 216400 }, { "epoch": 8.208219593569911, "grad_norm": 12.870721817016602, "learning_rate": 1.8929987657875714e-06, "loss": 1.3958, "step": 216500 }, { "epoch": 8.212010919017288, "grad_norm": 12.52823257446289, "learning_rate": 1.8852577382276238e-06, "loss": 1.3869, "step": 216600 }, { "epoch": 8.215802244464665, "grad_norm": 13.688454627990723, "learning_rate": 1.87753092320196e-06, "loss": 1.4014, "step": 216700 }, { "epoch": 8.21959356991204, "grad_norm": 13.289541244506836, "learning_rate": 1.8698183342436815e-06, "loss": 1.3823, "step": 216800 }, { "epoch": 8.223384895359418, "grad_norm": 13.140172004699707, "learning_rate": 1.8621199848609694e-06, "loss": 1.3822, "step": 216900 }, { "epoch": 8.227176220806793, "grad_norm": 13.169207572937012, "learning_rate": 1.8544358885370673e-06, "loss": 1.4034, "step": 217000 }, { "epoch": 8.23096754625417, "grad_norm": 11.883615493774414, "learning_rate": 1.8467660587302616e-06, "loss": 1.3847, "step": 217100 }, { "epoch": 8.234758871701548, "grad_norm": 12.150272369384766, "learning_rate": 1.8391105088738459e-06, "loss": 1.3845, "step": 217200 }, { "epoch": 8.238550197148923, "grad_norm": 11.365819931030273, "learning_rate": 1.8314692523761002e-06, "loss": 1.3872, "step": 217300 }, { "epoch": 8.2423415225963, "grad_norm": 12.42846393585205, "learning_rate": 1.8238423026202745e-06, "loss": 1.4009, "step": 217400 }, { "epoch": 8.246132848043676, "grad_norm": 13.724477767944336, "learning_rate": 1.8162296729645612e-06, "loss": 1.4044, "step": 217500 }, { "epoch": 8.249924173491053, "grad_norm": 13.594022750854492, "learning_rate": 1.8086313767420716e-06, "loss": 1.4056, "step": 217600 }, { "epoch": 8.253715498938428, "grad_norm": 11.546835899353027, "learning_rate": 1.8010474272608137e-06, "loss": 1.3738, "step": 217700 }, { "epoch": 8.257506824385805, "grad_norm": 11.825241088867188, "learning_rate": 1.7934778378036676e-06, "loss": 1.376, "step": 217800 }, { "epoch": 8.261298149833182, "grad_norm": 11.22656536102295, "learning_rate": 1.7859226216283554e-06, "loss": 1.397, "step": 217900 }, { "epoch": 8.265089475280558, "grad_norm": 11.86167049407959, "learning_rate": 1.7783817919674384e-06, "loss": 1.4032, "step": 218000 }, { "epoch": 8.268880800727935, "grad_norm": 12.390405654907227, "learning_rate": 1.770855362028272e-06, "loss": 1.388, "step": 218100 }, { "epoch": 8.27267212617531, "grad_norm": 11.554635047912598, "learning_rate": 1.763343344992995e-06, "loss": 1.3748, "step": 218200 }, { "epoch": 8.276463451622687, "grad_norm": 12.308452606201172, "learning_rate": 1.755845754018497e-06, "loss": 1.3995, "step": 218300 }, { "epoch": 8.280254777070065, "grad_norm": 10.610359191894531, "learning_rate": 1.748362602236403e-06, "loss": 1.3579, "step": 218400 }, { "epoch": 8.28404610251744, "grad_norm": 12.95811653137207, "learning_rate": 1.7408939027530591e-06, "loss": 1.3973, "step": 218500 }, { "epoch": 8.287837427964817, "grad_norm": 12.46086597442627, "learning_rate": 1.7334396686494836e-06, "loss": 1.39, "step": 218600 }, { "epoch": 8.291628753412192, "grad_norm": 11.843360900878906, "learning_rate": 1.7259999129813687e-06, "loss": 1.3845, "step": 218700 }, { "epoch": 8.29542007885957, "grad_norm": 13.333578109741211, "learning_rate": 1.7185746487790445e-06, "loss": 1.3944, "step": 218800 }, { "epoch": 8.299211404306945, "grad_norm": 12.631851196289062, "learning_rate": 1.7111638890474635e-06, "loss": 1.3893, "step": 218900 }, { "epoch": 8.303002729754322, "grad_norm": 11.40541934967041, "learning_rate": 1.7037676467661712e-06, "loss": 1.3923, "step": 219000 }, { "epoch": 8.3067940552017, "grad_norm": 11.727877616882324, "learning_rate": 1.6963859348892876e-06, "loss": 1.3699, "step": 219100 }, { "epoch": 8.310585380649075, "grad_norm": 13.006036758422852, "learning_rate": 1.689018766345485e-06, "loss": 1.3959, "step": 219200 }, { "epoch": 8.314376706096452, "grad_norm": 11.573909759521484, "learning_rate": 1.6816661540379619e-06, "loss": 1.376, "step": 219300 }, { "epoch": 8.318168031543827, "grad_norm": 10.86030387878418, "learning_rate": 1.6743281108444231e-06, "loss": 1.3896, "step": 219400 }, { "epoch": 8.321959356991204, "grad_norm": 13.467279434204102, "learning_rate": 1.6670046496170577e-06, "loss": 1.4186, "step": 219500 }, { "epoch": 8.325750682438581, "grad_norm": 12.737801551818848, "learning_rate": 1.659695783182511e-06, "loss": 1.4056, "step": 219600 }, { "epoch": 8.329542007885957, "grad_norm": 11.6494779586792, "learning_rate": 1.65240152434187e-06, "loss": 1.3859, "step": 219700 }, { "epoch": 8.333333333333334, "grad_norm": 12.805058479309082, "learning_rate": 1.6451218858706374e-06, "loss": 1.3835, "step": 219800 }, { "epoch": 8.33712465878071, "grad_norm": 12.958781242370605, "learning_rate": 1.6378568805187068e-06, "loss": 1.4038, "step": 219900 }, { "epoch": 8.340915984228086, "grad_norm": 12.255399703979492, "learning_rate": 1.630606521010345e-06, "loss": 1.3779, "step": 220000 }, { "epoch": 8.344707309675462, "grad_norm": 12.412662506103516, "learning_rate": 1.6233708200441666e-06, "loss": 1.4042, "step": 220100 }, { "epoch": 8.348498635122839, "grad_norm": 12.455756187438965, "learning_rate": 1.6161497902931122e-06, "loss": 1.3986, "step": 220200 }, { "epoch": 8.352289960570216, "grad_norm": 12.365410804748535, "learning_rate": 1.608943444404426e-06, "loss": 1.3938, "step": 220300 }, { "epoch": 8.356081286017591, "grad_norm": 11.344766616821289, "learning_rate": 1.6017517949996354e-06, "loss": 1.3976, "step": 220400 }, { "epoch": 8.359872611464969, "grad_norm": 10.713216781616211, "learning_rate": 1.594574854674531e-06, "loss": 1.3992, "step": 220500 }, { "epoch": 8.363663936912344, "grad_norm": 12.936990737915039, "learning_rate": 1.5874126359991281e-06, "loss": 1.3643, "step": 220600 }, { "epoch": 8.367455262359721, "grad_norm": 13.998220443725586, "learning_rate": 1.5802651515176748e-06, "loss": 1.4041, "step": 220700 }, { "epoch": 8.371246587807097, "grad_norm": 11.270366668701172, "learning_rate": 1.573132413748607e-06, "loss": 1.4089, "step": 220800 }, { "epoch": 8.375037913254474, "grad_norm": 13.056965827941895, "learning_rate": 1.566014435184524e-06, "loss": 1.3971, "step": 220900 }, { "epoch": 8.37882923870185, "grad_norm": 12.062751770019531, "learning_rate": 1.558911228292187e-06, "loss": 1.3839, "step": 221000 }, { "epoch": 8.382620564149226, "grad_norm": 12.753608703613281, "learning_rate": 1.5518228055124773e-06, "loss": 1.3913, "step": 221100 }, { "epoch": 8.386411889596603, "grad_norm": 12.396443367004395, "learning_rate": 1.5447491792603898e-06, "loss": 1.391, "step": 221200 }, { "epoch": 8.390203215043979, "grad_norm": 9.809871673583984, "learning_rate": 1.5376903619249962e-06, "loss": 1.3932, "step": 221300 }, { "epoch": 8.393994540491356, "grad_norm": 12.30370807647705, "learning_rate": 1.5306463658694382e-06, "loss": 1.3815, "step": 221400 }, { "epoch": 8.397785865938733, "grad_norm": 11.358133316040039, "learning_rate": 1.5236172034308939e-06, "loss": 1.4144, "step": 221500 }, { "epoch": 8.401577191386108, "grad_norm": 12.571556091308594, "learning_rate": 1.516602886920564e-06, "loss": 1.4063, "step": 221600 }, { "epoch": 8.405368516833486, "grad_norm": 10.727476119995117, "learning_rate": 1.509603428623646e-06, "loss": 1.3777, "step": 221700 }, { "epoch": 8.40915984228086, "grad_norm": 12.778519630432129, "learning_rate": 1.502618840799316e-06, "loss": 1.39, "step": 221800 }, { "epoch": 8.412951167728238, "grad_norm": 11.02316951751709, "learning_rate": 1.495649135680699e-06, "loss": 1.3761, "step": 221900 }, { "epoch": 8.416742493175613, "grad_norm": 14.20637035369873, "learning_rate": 1.48869432547486e-06, "loss": 1.3912, "step": 222000 }, { "epoch": 8.42053381862299, "grad_norm": 14.043320655822754, "learning_rate": 1.48175442236278e-06, "loss": 1.3928, "step": 222100 }, { "epoch": 8.424325144070368, "grad_norm": 11.602519989013672, "learning_rate": 1.47482943849932e-06, "loss": 1.4081, "step": 222200 }, { "epoch": 8.428116469517743, "grad_norm": 11.00360107421875, "learning_rate": 1.4679193860132169e-06, "loss": 1.3958, "step": 222300 }, { "epoch": 8.43190779496512, "grad_norm": 13.816750526428223, "learning_rate": 1.4610242770070583e-06, "loss": 1.404, "step": 222400 }, { "epoch": 8.435699120412496, "grad_norm": 10.262680053710938, "learning_rate": 1.454144123557254e-06, "loss": 1.3717, "step": 222500 }, { "epoch": 8.439490445859873, "grad_norm": 12.771817207336426, "learning_rate": 1.447278937714024e-06, "loss": 1.3859, "step": 222600 }, { "epoch": 8.443281771307248, "grad_norm": 11.128013610839844, "learning_rate": 1.440428731501372e-06, "loss": 1.3999, "step": 222700 }, { "epoch": 8.447073096754625, "grad_norm": 12.910786628723145, "learning_rate": 1.4335935169170655e-06, "loss": 1.3823, "step": 222800 }, { "epoch": 8.450864422202002, "grad_norm": 11.943010330200195, "learning_rate": 1.4267733059326117e-06, "loss": 1.4076, "step": 222900 }, { "epoch": 8.454655747649378, "grad_norm": 11.622629165649414, "learning_rate": 1.419968110493246e-06, "loss": 1.4006, "step": 223000 }, { "epoch": 8.458447073096755, "grad_norm": 11.6835355758667, "learning_rate": 1.4131779425179049e-06, "loss": 1.3934, "step": 223100 }, { "epoch": 8.46223839854413, "grad_norm": 11.741151809692383, "learning_rate": 1.406402813899196e-06, "loss": 1.3762, "step": 223200 }, { "epoch": 8.466029723991507, "grad_norm": 14.218157768249512, "learning_rate": 1.3996427365033948e-06, "loss": 1.403, "step": 223300 }, { "epoch": 8.469821049438885, "grad_norm": 11.961411476135254, "learning_rate": 1.392897722170411e-06, "loss": 1.3848, "step": 223400 }, { "epoch": 8.47361237488626, "grad_norm": 12.336662292480469, "learning_rate": 1.3861677827137809e-06, "loss": 1.3711, "step": 223500 }, { "epoch": 8.477403700333637, "grad_norm": 10.948334693908691, "learning_rate": 1.379452929920625e-06, "loss": 1.3793, "step": 223600 }, { "epoch": 8.481195025781012, "grad_norm": 12.13304328918457, "learning_rate": 1.3727531755516477e-06, "loss": 1.3995, "step": 223700 }, { "epoch": 8.48498635122839, "grad_norm": 12.741959571838379, "learning_rate": 1.3660685313411093e-06, "loss": 1.3858, "step": 223800 }, { "epoch": 8.488777676675765, "grad_norm": 13.26450252532959, "learning_rate": 1.3593990089968035e-06, "loss": 1.4111, "step": 223900 }, { "epoch": 8.492569002123142, "grad_norm": 12.409791946411133, "learning_rate": 1.3527446202000394e-06, "loss": 1.4012, "step": 224000 }, { "epoch": 8.49636032757052, "grad_norm": 11.896472930908203, "learning_rate": 1.3461053766056243e-06, "loss": 1.4105, "step": 224100 }, { "epoch": 8.500151653017895, "grad_norm": 12.160500526428223, "learning_rate": 1.3394812898418307e-06, "loss": 1.3764, "step": 224200 }, { "epoch": 8.503942978465272, "grad_norm": 10.460376739501953, "learning_rate": 1.3328723715103897e-06, "loss": 1.4079, "step": 224300 }, { "epoch": 8.507734303912647, "grad_norm": 10.243362426757812, "learning_rate": 1.3262786331864707e-06, "loss": 1.3652, "step": 224400 }, { "epoch": 8.511525629360024, "grad_norm": 11.483405113220215, "learning_rate": 1.3197000864186515e-06, "loss": 1.3762, "step": 224500 }, { "epoch": 8.515316954807401, "grad_norm": 12.814970970153809, "learning_rate": 1.3131367427288988e-06, "loss": 1.3949, "step": 224600 }, { "epoch": 8.519108280254777, "grad_norm": 11.390839576721191, "learning_rate": 1.306588613612557e-06, "loss": 1.3819, "step": 224700 }, { "epoch": 8.522899605702154, "grad_norm": 12.9599027633667, "learning_rate": 1.3000557105383239e-06, "loss": 1.3775, "step": 224800 }, { "epoch": 8.52669093114953, "grad_norm": 10.822602272033691, "learning_rate": 1.2935380449482239e-06, "loss": 1.3668, "step": 224900 }, { "epoch": 8.530482256596907, "grad_norm": 12.791426658630371, "learning_rate": 1.2870356282576013e-06, "loss": 1.3851, "step": 225000 }, { "epoch": 8.534273582044282, "grad_norm": 12.803642272949219, "learning_rate": 1.2805484718550887e-06, "loss": 1.3826, "step": 225100 }, { "epoch": 8.538064907491659, "grad_norm": 11.238692283630371, "learning_rate": 1.274076587102585e-06, "loss": 1.392, "step": 225200 }, { "epoch": 8.541856232939036, "grad_norm": 12.19664478302002, "learning_rate": 1.2676199853352556e-06, "loss": 1.3971, "step": 225300 }, { "epoch": 8.545647558386412, "grad_norm": 10.57886028289795, "learning_rate": 1.2611786778614877e-06, "loss": 1.3759, "step": 225400 }, { "epoch": 8.549438883833789, "grad_norm": 12.765009880065918, "learning_rate": 1.254752675962888e-06, "loss": 1.3707, "step": 225500 }, { "epoch": 8.553230209281164, "grad_norm": 10.90847110748291, "learning_rate": 1.2483419908942474e-06, "loss": 1.3753, "step": 225600 }, { "epoch": 8.557021534728541, "grad_norm": 12.015028953552246, "learning_rate": 1.2419466338835362e-06, "loss": 1.3918, "step": 225700 }, { "epoch": 8.560812860175918, "grad_norm": 10.991117477416992, "learning_rate": 1.2355666161318846e-06, "loss": 1.4016, "step": 225800 }, { "epoch": 8.564604185623294, "grad_norm": 13.125874519348145, "learning_rate": 1.2292019488135443e-06, "loss": 1.3857, "step": 225900 }, { "epoch": 8.56839551107067, "grad_norm": 11.569889068603516, "learning_rate": 1.222852643075888e-06, "loss": 1.3912, "step": 226000 }, { "epoch": 8.572186836518046, "grad_norm": 11.563084602355957, "learning_rate": 1.216518710039385e-06, "loss": 1.3927, "step": 226100 }, { "epoch": 8.575978161965423, "grad_norm": 12.586675643920898, "learning_rate": 1.2102001607975766e-06, "loss": 1.4167, "step": 226200 }, { "epoch": 8.579769487412799, "grad_norm": 11.207196235656738, "learning_rate": 1.2038970064170618e-06, "loss": 1.3756, "step": 226300 }, { "epoch": 8.583560812860176, "grad_norm": 12.826082229614258, "learning_rate": 1.1976092579374776e-06, "loss": 1.3978, "step": 226400 }, { "epoch": 8.587352138307553, "grad_norm": 12.134796142578125, "learning_rate": 1.191336926371477e-06, "loss": 1.3809, "step": 226500 }, { "epoch": 8.591143463754928, "grad_norm": 12.718849182128906, "learning_rate": 1.1850800227047065e-06, "loss": 1.4012, "step": 226600 }, { "epoch": 8.594934789202306, "grad_norm": 13.182507514953613, "learning_rate": 1.1788385578958018e-06, "loss": 1.3708, "step": 226700 }, { "epoch": 8.598726114649681, "grad_norm": 11.157575607299805, "learning_rate": 1.1726125428763523e-06, "loss": 1.3827, "step": 226800 }, { "epoch": 8.602517440097058, "grad_norm": 11.58060359954834, "learning_rate": 1.1664019885508848e-06, "loss": 1.3749, "step": 226900 }, { "epoch": 8.606308765544433, "grad_norm": 12.750163078308105, "learning_rate": 1.1602069057968523e-06, "loss": 1.3813, "step": 227000 }, { "epoch": 8.61010009099181, "grad_norm": 12.556397438049316, "learning_rate": 1.1540273054646079e-06, "loss": 1.3858, "step": 227100 }, { "epoch": 8.613891416439188, "grad_norm": 10.654935836791992, "learning_rate": 1.1478631983773913e-06, "loss": 1.3852, "step": 227200 }, { "epoch": 8.617682741886563, "grad_norm": 10.90953540802002, "learning_rate": 1.1417145953313024e-06, "loss": 1.386, "step": 227300 }, { "epoch": 8.62147406733394, "grad_norm": 11.21645450592041, "learning_rate": 1.1355815070952892e-06, "loss": 1.3743, "step": 227400 }, { "epoch": 8.625265392781316, "grad_norm": 11.556816101074219, "learning_rate": 1.1294639444111255e-06, "loss": 1.3766, "step": 227500 }, { "epoch": 8.629056718228693, "grad_norm": 12.615433692932129, "learning_rate": 1.123361917993393e-06, "loss": 1.3813, "step": 227600 }, { "epoch": 8.63284804367607, "grad_norm": 11.458868026733398, "learning_rate": 1.1172754385294626e-06, "loss": 1.403, "step": 227700 }, { "epoch": 8.636639369123445, "grad_norm": 12.897326469421387, "learning_rate": 1.1112045166794773e-06, "loss": 1.3693, "step": 227800 }, { "epoch": 8.640430694570822, "grad_norm": 11.25291633605957, "learning_rate": 1.1051491630763244e-06, "loss": 1.3739, "step": 227900 }, { "epoch": 8.644222020018198, "grad_norm": 12.779375076293945, "learning_rate": 1.0991093883256354e-06, "loss": 1.3731, "step": 228000 }, { "epoch": 8.648013345465575, "grad_norm": 11.747629165649414, "learning_rate": 1.0930852030057505e-06, "loss": 1.4088, "step": 228100 }, { "epoch": 8.65180467091295, "grad_norm": 12.90594482421875, "learning_rate": 1.0870766176677016e-06, "loss": 1.3896, "step": 228200 }, { "epoch": 8.655595996360328, "grad_norm": 12.541007995605469, "learning_rate": 1.0810836428352057e-06, "loss": 1.4185, "step": 228300 }, { "epoch": 8.659387321807705, "grad_norm": 11.728806495666504, "learning_rate": 1.0751062890046338e-06, "loss": 1.3887, "step": 228400 }, { "epoch": 8.66317864725508, "grad_norm": 11.755396842956543, "learning_rate": 1.0691445666450039e-06, "loss": 1.3783, "step": 228500 }, { "epoch": 8.666969972702457, "grad_norm": 13.73670482635498, "learning_rate": 1.0631984861979472e-06, "loss": 1.3818, "step": 228600 }, { "epoch": 8.670761298149833, "grad_norm": 13.203643798828125, "learning_rate": 1.057268058077704e-06, "loss": 1.3898, "step": 228700 }, { "epoch": 8.67455262359721, "grad_norm": 11.712393760681152, "learning_rate": 1.0513532926711012e-06, "loss": 1.3947, "step": 228800 }, { "epoch": 8.678343949044585, "grad_norm": 12.164974212646484, "learning_rate": 1.0454542003375323e-06, "loss": 1.3875, "step": 228900 }, { "epoch": 8.682135274491962, "grad_norm": 11.37363338470459, "learning_rate": 1.0395707914089392e-06, "loss": 1.3713, "step": 229000 }, { "epoch": 8.68592659993934, "grad_norm": 12.851922035217285, "learning_rate": 1.0337030761897982e-06, "loss": 1.4049, "step": 229100 }, { "epoch": 8.689717925386715, "grad_norm": 13.741557121276855, "learning_rate": 1.0278510649570916e-06, "loss": 1.4015, "step": 229200 }, { "epoch": 8.693509250834092, "grad_norm": 12.29905891418457, "learning_rate": 1.0220147679603043e-06, "loss": 1.39, "step": 229300 }, { "epoch": 8.697300576281467, "grad_norm": 10.566752433776855, "learning_rate": 1.0161941954213994e-06, "loss": 1.4212, "step": 229400 }, { "epoch": 8.701091901728844, "grad_norm": 13.152318000793457, "learning_rate": 1.0103893575347956e-06, "loss": 1.3884, "step": 229500 }, { "epoch": 8.704883227176222, "grad_norm": 12.769410133361816, "learning_rate": 1.0046002644673502e-06, "loss": 1.3857, "step": 229600 }, { "epoch": 8.708674552623597, "grad_norm": 11.144576072692871, "learning_rate": 9.988269263583505e-07, "loss": 1.3827, "step": 229700 }, { "epoch": 8.712465878070974, "grad_norm": 12.85888957977295, "learning_rate": 9.930693533194879e-07, "loss": 1.3755, "step": 229800 }, { "epoch": 8.71625720351835, "grad_norm": 11.614727973937988, "learning_rate": 9.873275554348417e-07, "loss": 1.3842, "step": 229900 }, { "epoch": 8.720048528965727, "grad_norm": 11.733217239379883, "learning_rate": 9.816015427608604e-07, "loss": 1.3951, "step": 230000 }, { "epoch": 8.723839854413104, "grad_norm": 11.730627059936523, "learning_rate": 9.758913253263502e-07, "loss": 1.3772, "step": 230100 }, { "epoch": 8.727631179860479, "grad_norm": 11.596705436706543, "learning_rate": 9.701969131324429e-07, "loss": 1.3844, "step": 230200 }, { "epoch": 8.731422505307856, "grad_norm": 10.85795783996582, "learning_rate": 9.645183161526017e-07, "loss": 1.3965, "step": 230300 }, { "epoch": 8.735213830755232, "grad_norm": 12.451567649841309, "learning_rate": 9.588555443325808e-07, "loss": 1.3701, "step": 230400 }, { "epoch": 8.739005156202609, "grad_norm": 11.363391876220703, "learning_rate": 9.532086075904234e-07, "loss": 1.39, "step": 230500 }, { "epoch": 8.742796481649984, "grad_norm": 11.850738525390625, "learning_rate": 9.475775158164291e-07, "loss": 1.374, "step": 230600 }, { "epoch": 8.746587807097361, "grad_norm": 10.727429389953613, "learning_rate": 9.419622788731552e-07, "loss": 1.3954, "step": 230700 }, { "epoch": 8.750379132544738, "grad_norm": 12.555522918701172, "learning_rate": 9.363629065953895e-07, "loss": 1.3719, "step": 230800 }, { "epoch": 8.754170457992114, "grad_norm": 12.374374389648438, "learning_rate": 9.307794087901279e-07, "loss": 1.3814, "step": 230900 }, { "epoch": 8.757961783439491, "grad_norm": 13.343976974487305, "learning_rate": 9.252117952365669e-07, "loss": 1.3726, "step": 231000 }, { "epoch": 8.761753108886866, "grad_norm": 11.541436195373535, "learning_rate": 9.196600756860818e-07, "loss": 1.39, "step": 231100 }, { "epoch": 8.765544434334243, "grad_norm": 10.646956443786621, "learning_rate": 9.141242598622113e-07, "loss": 1.3958, "step": 231200 }, { "epoch": 8.769335759781619, "grad_norm": 11.475406646728516, "learning_rate": 9.086043574606384e-07, "loss": 1.4085, "step": 231300 }, { "epoch": 8.773127085228996, "grad_norm": 11.397378921508789, "learning_rate": 9.031003781491754e-07, "loss": 1.3946, "step": 231400 }, { "epoch": 8.776918410676373, "grad_norm": 11.92272663116455, "learning_rate": 8.976123315677476e-07, "loss": 1.3477, "step": 231500 }, { "epoch": 8.780709736123748, "grad_norm": 11.574936866760254, "learning_rate": 8.921402273283663e-07, "loss": 1.3978, "step": 231600 }, { "epoch": 8.784501061571126, "grad_norm": 12.372962951660156, "learning_rate": 8.866840750151351e-07, "loss": 1.3861, "step": 231700 }, { "epoch": 8.788292387018501, "grad_norm": 11.662347793579102, "learning_rate": 8.812438841842108e-07, "loss": 1.4038, "step": 231800 }, { "epoch": 8.792083712465878, "grad_norm": 11.177085876464844, "learning_rate": 8.758196643637895e-07, "loss": 1.4113, "step": 231900 }, { "epoch": 8.795875037913255, "grad_norm": 11.250405311584473, "learning_rate": 8.70411425054104e-07, "loss": 1.4245, "step": 232000 }, { "epoch": 8.79966636336063, "grad_norm": 11.20543384552002, "learning_rate": 8.650191757273929e-07, "loss": 1.3924, "step": 232100 }, { "epoch": 8.803457688808008, "grad_norm": 13.849470138549805, "learning_rate": 8.596429258278915e-07, "loss": 1.398, "step": 232200 }, { "epoch": 8.807249014255383, "grad_norm": 12.192399978637695, "learning_rate": 8.542826847718111e-07, "loss": 1.3805, "step": 232300 }, { "epoch": 8.81104033970276, "grad_norm": 11.504365921020508, "learning_rate": 8.489384619473274e-07, "loss": 1.3873, "step": 232400 }, { "epoch": 8.814831665150136, "grad_norm": 12.45508861541748, "learning_rate": 8.436102667145573e-07, "loss": 1.3563, "step": 232500 }, { "epoch": 8.818622990597513, "grad_norm": 13.641050338745117, "learning_rate": 8.382981084055475e-07, "loss": 1.3893, "step": 232600 }, { "epoch": 8.82241431604489, "grad_norm": 12.708812713623047, "learning_rate": 8.330019963242597e-07, "loss": 1.3883, "step": 232700 }, { "epoch": 8.826205641492265, "grad_norm": 12.641253471374512, "learning_rate": 8.277219397465486e-07, "loss": 1.3858, "step": 232800 }, { "epoch": 8.829996966939643, "grad_norm": 12.014039993286133, "learning_rate": 8.224579479201467e-07, "loss": 1.4032, "step": 232900 }, { "epoch": 8.833788292387018, "grad_norm": 12.888080596923828, "learning_rate": 8.172100300646513e-07, "loss": 1.4014, "step": 233000 }, { "epoch": 8.837579617834395, "grad_norm": 11.833473205566406, "learning_rate": 8.119781953715145e-07, "loss": 1.3896, "step": 233100 }, { "epoch": 8.84137094328177, "grad_norm": 11.78145980834961, "learning_rate": 8.067624530040086e-07, "loss": 1.3707, "step": 233200 }, { "epoch": 8.845162268729148, "grad_norm": 11.8798246383667, "learning_rate": 8.015628120972252e-07, "loss": 1.4117, "step": 233300 }, { "epoch": 8.848953594176525, "grad_norm": 12.29345989227295, "learning_rate": 7.963792817580562e-07, "loss": 1.383, "step": 233400 }, { "epoch": 8.8527449196239, "grad_norm": 12.847376823425293, "learning_rate": 7.912118710651761e-07, "loss": 1.3681, "step": 233500 }, { "epoch": 8.856536245071277, "grad_norm": 11.909847259521484, "learning_rate": 7.860605890690276e-07, "loss": 1.4051, "step": 233600 }, { "epoch": 8.860327570518653, "grad_norm": 12.414546012878418, "learning_rate": 7.809254447918025e-07, "loss": 1.3962, "step": 233700 }, { "epoch": 8.86411889596603, "grad_norm": 13.922514915466309, "learning_rate": 7.758064472274318e-07, "loss": 1.3721, "step": 233800 }, { "epoch": 8.867910221413407, "grad_norm": 12.309815406799316, "learning_rate": 7.707036053415595e-07, "loss": 1.4117, "step": 233900 }, { "epoch": 8.871701546860782, "grad_norm": 11.946707725524902, "learning_rate": 7.656169280715431e-07, "loss": 1.3859, "step": 234000 }, { "epoch": 8.87549287230816, "grad_norm": 12.14351749420166, "learning_rate": 7.605464243264237e-07, "loss": 1.3608, "step": 234100 }, { "epoch": 8.879284197755535, "grad_norm": 10.600497245788574, "learning_rate": 7.554921029869133e-07, "loss": 1.3921, "step": 234200 }, { "epoch": 8.883075523202912, "grad_norm": 12.62093448638916, "learning_rate": 7.504539729053839e-07, "loss": 1.4016, "step": 234300 }, { "epoch": 8.886866848650287, "grad_norm": 11.294477462768555, "learning_rate": 7.454320429058504e-07, "loss": 1.3751, "step": 234400 }, { "epoch": 8.890658174097664, "grad_norm": 12.757590293884277, "learning_rate": 7.404263217839536e-07, "loss": 1.4287, "step": 234500 }, { "epoch": 8.894449499545042, "grad_norm": 12.8955078125, "learning_rate": 7.354368183069416e-07, "loss": 1.3801, "step": 234600 }, { "epoch": 8.898240824992417, "grad_norm": 12.603814125061035, "learning_rate": 7.30463541213664e-07, "loss": 1.3811, "step": 234700 }, { "epoch": 8.902032150439794, "grad_norm": 12.135048866271973, "learning_rate": 7.255064992145478e-07, "loss": 1.399, "step": 234800 }, { "epoch": 8.90582347588717, "grad_norm": 11.659422874450684, "learning_rate": 7.205657009915834e-07, "loss": 1.3659, "step": 234900 }, { "epoch": 8.909614801334547, "grad_norm": 11.14718246459961, "learning_rate": 7.156411551983144e-07, "loss": 1.3899, "step": 235000 }, { "epoch": 8.913406126781924, "grad_norm": 11.911691665649414, "learning_rate": 7.107328704598193e-07, "loss": 1.4006, "step": 235100 }, { "epoch": 8.9171974522293, "grad_norm": 11.250821113586426, "learning_rate": 7.058408553726881e-07, "loss": 1.392, "step": 235200 }, { "epoch": 8.920988777676676, "grad_norm": 11.389333724975586, "learning_rate": 7.009651185050292e-07, "loss": 1.3775, "step": 235300 }, { "epoch": 8.924780103124052, "grad_norm": 12.124297142028809, "learning_rate": 6.961056683964318e-07, "loss": 1.3683, "step": 235400 }, { "epoch": 8.928571428571429, "grad_norm": 12.216679573059082, "learning_rate": 6.912625135579587e-07, "loss": 1.3853, "step": 235500 }, { "epoch": 8.932362754018804, "grad_norm": 9.668691635131836, "learning_rate": 6.864356624721357e-07, "loss": 1.3873, "step": 235600 }, { "epoch": 8.936154079466181, "grad_norm": 11.131156921386719, "learning_rate": 6.816251235929327e-07, "loss": 1.4193, "step": 235700 }, { "epoch": 8.939945404913558, "grad_norm": 12.117420196533203, "learning_rate": 6.768309053457501e-07, "loss": 1.3787, "step": 235800 }, { "epoch": 8.943736730360934, "grad_norm": 12.317082405090332, "learning_rate": 6.72053016127403e-07, "loss": 1.3788, "step": 235900 }, { "epoch": 8.947528055808311, "grad_norm": 12.81973648071289, "learning_rate": 6.672914643061079e-07, "loss": 1.3962, "step": 236000 }, { "epoch": 8.951319381255686, "grad_norm": 11.743191719055176, "learning_rate": 6.625462582214648e-07, "loss": 1.3867, "step": 236100 }, { "epoch": 8.955110706703064, "grad_norm": 13.014280319213867, "learning_rate": 6.578174061844488e-07, "loss": 1.379, "step": 236200 }, { "epoch": 8.95890203215044, "grad_norm": 11.90609359741211, "learning_rate": 6.531049164773872e-07, "loss": 1.3832, "step": 236300 }, { "epoch": 8.962693357597816, "grad_norm": 12.521207809448242, "learning_rate": 6.484087973539566e-07, "loss": 1.3865, "step": 236400 }, { "epoch": 8.966484683045193, "grad_norm": 11.93526840209961, "learning_rate": 6.437290570391519e-07, "loss": 1.3967, "step": 236500 }, { "epoch": 8.970276008492569, "grad_norm": 10.73900032043457, "learning_rate": 6.390657037292858e-07, "loss": 1.3811, "step": 236600 }, { "epoch": 8.974067333939946, "grad_norm": 12.341087341308594, "learning_rate": 6.344187455919748e-07, "loss": 1.3889, "step": 236700 }, { "epoch": 8.977858659387321, "grad_norm": 10.15781307220459, "learning_rate": 6.297881907661163e-07, "loss": 1.4094, "step": 236800 }, { "epoch": 8.981649984834698, "grad_norm": 14.200922012329102, "learning_rate": 6.251740473618739e-07, "loss": 1.3994, "step": 236900 }, { "epoch": 8.985441310282075, "grad_norm": 11.757405281066895, "learning_rate": 6.205763234606732e-07, "loss": 1.3835, "step": 237000 }, { "epoch": 8.98923263572945, "grad_norm": 12.768462181091309, "learning_rate": 6.159950271151805e-07, "loss": 1.3799, "step": 237100 }, { "epoch": 8.993023961176828, "grad_norm": 11.726208686828613, "learning_rate": 6.114301663492883e-07, "loss": 1.3919, "step": 237200 }, { "epoch": 8.996815286624203, "grad_norm": 11.764986991882324, "learning_rate": 6.068817491581069e-07, "loss": 1.3737, "step": 237300 }, { "epoch": 9.0, "eval_accuracy": 0.5285406576116319, "eval_loss": 2.1277427673339844, "eval_runtime": 932.9165, "eval_samples_per_second": 904.716, "eval_steps_per_second": 7.068, "step": 237384 }, { "epoch": 9.00060661207158, "grad_norm": 11.773358345031738, "learning_rate": 6.02349783507944e-07, "loss": 1.3758, "step": 237400 }, { "epoch": 9.004397937518956, "grad_norm": 11.612360000610352, "learning_rate": 5.978342773362899e-07, "loss": 1.3587, "step": 237500 }, { "epoch": 9.008189262966333, "grad_norm": 13.370427131652832, "learning_rate": 5.933352385518132e-07, "loss": 1.3804, "step": 237600 }, { "epoch": 9.01198058841371, "grad_norm": 13.460092544555664, "learning_rate": 5.888526750343393e-07, "loss": 1.3557, "step": 237700 }, { "epoch": 9.015771913861085, "grad_norm": 11.15639591217041, "learning_rate": 5.84386594634837e-07, "loss": 1.3559, "step": 237800 }, { "epoch": 9.019563239308463, "grad_norm": 11.935317993164062, "learning_rate": 5.799370051754028e-07, "loss": 1.3675, "step": 237900 }, { "epoch": 9.023354564755838, "grad_norm": 11.77146053314209, "learning_rate": 5.755039144492536e-07, "loss": 1.3679, "step": 238000 }, { "epoch": 9.027145890203215, "grad_norm": 11.471354484558105, "learning_rate": 5.710873302207132e-07, "loss": 1.3499, "step": 238100 }, { "epoch": 9.030937215650592, "grad_norm": 11.644347190856934, "learning_rate": 5.666872602251872e-07, "loss": 1.3747, "step": 238200 }, { "epoch": 9.034728541097968, "grad_norm": 12.236509323120117, "learning_rate": 5.62303712169161e-07, "loss": 1.3908, "step": 238300 }, { "epoch": 9.038519866545345, "grad_norm": 12.988731384277344, "learning_rate": 5.579366937301856e-07, "loss": 1.3438, "step": 238400 }, { "epoch": 9.04231119199272, "grad_norm": 10.854636192321777, "learning_rate": 5.53586212556858e-07, "loss": 1.3436, "step": 238500 }, { "epoch": 9.046102517440097, "grad_norm": 10.496390342712402, "learning_rate": 5.492522762688113e-07, "loss": 1.3702, "step": 238600 }, { "epoch": 9.049893842887473, "grad_norm": 11.242095947265625, "learning_rate": 5.44934892456701e-07, "loss": 1.3585, "step": 238700 }, { "epoch": 9.05368516833485, "grad_norm": 11.209606170654297, "learning_rate": 5.406340686821953e-07, "loss": 1.3479, "step": 238800 }, { "epoch": 9.057476493782227, "grad_norm": 14.081796646118164, "learning_rate": 5.36349812477951e-07, "loss": 1.3481, "step": 238900 }, { "epoch": 9.061267819229602, "grad_norm": 12.11839485168457, "learning_rate": 5.320821313476155e-07, "loss": 1.3367, "step": 239000 }, { "epoch": 9.06505914467698, "grad_norm": 13.206459045410156, "learning_rate": 5.278310327658032e-07, "loss": 1.3331, "step": 239100 }, { "epoch": 9.068850470124355, "grad_norm": 12.889446258544922, "learning_rate": 5.235965241780827e-07, "loss": 1.3494, "step": 239200 }, { "epoch": 9.072641795571732, "grad_norm": 12.415318489074707, "learning_rate": 5.193786130009671e-07, "loss": 1.3495, "step": 239300 }, { "epoch": 9.07643312101911, "grad_norm": 13.173932075500488, "learning_rate": 5.151773066219024e-07, "loss": 1.3621, "step": 239400 }, { "epoch": 9.080224446466485, "grad_norm": 11.70231819152832, "learning_rate": 5.109926123992504e-07, "loss": 1.3339, "step": 239500 }, { "epoch": 9.084015771913862, "grad_norm": 13.487547874450684, "learning_rate": 5.06824537662276e-07, "loss": 1.3836, "step": 239600 }, { "epoch": 9.087807097361237, "grad_norm": 13.816155433654785, "learning_rate": 5.026730897111409e-07, "loss": 1.3741, "step": 239700 }, { "epoch": 9.091598422808614, "grad_norm": 11.978727340698242, "learning_rate": 4.9853827581688e-07, "loss": 1.3494, "step": 239800 }, { "epoch": 9.09538974825599, "grad_norm": 13.294464111328125, "learning_rate": 4.944201032213991e-07, "loss": 1.3676, "step": 239900 }, { "epoch": 9.099181073703367, "grad_norm": 10.779894828796387, "learning_rate": 4.903185791374543e-07, "loss": 1.3815, "step": 240000 }, { "epoch": 9.102972399150744, "grad_norm": 11.391387939453125, "learning_rate": 4.86233710748647e-07, "loss": 1.3271, "step": 240100 }, { "epoch": 9.10676372459812, "grad_norm": 13.06803035736084, "learning_rate": 4.82165505209401e-07, "loss": 1.3572, "step": 240200 }, { "epoch": 9.110555050045496, "grad_norm": 12.108728408813477, "learning_rate": 4.781139696449588e-07, "loss": 1.3299, "step": 240300 }, { "epoch": 9.114346375492872, "grad_norm": 11.120113372802734, "learning_rate": 4.7407911115137207e-07, "loss": 1.3612, "step": 240400 }, { "epoch": 9.118137700940249, "grad_norm": 11.985166549682617, "learning_rate": 4.700609367954734e-07, "loss": 1.3542, "step": 240500 }, { "epoch": 9.121929026387624, "grad_norm": 11.158717155456543, "learning_rate": 4.660594536148799e-07, "loss": 1.3728, "step": 240600 }, { "epoch": 9.125720351835001, "grad_norm": 11.921896934509277, "learning_rate": 4.6207466861797554e-07, "loss": 1.3435, "step": 240700 }, { "epoch": 9.129511677282379, "grad_norm": 12.004598617553711, "learning_rate": 4.5810658878389424e-07, "loss": 1.3474, "step": 240800 }, { "epoch": 9.133303002729754, "grad_norm": 12.204935073852539, "learning_rate": 4.5415522106251666e-07, "loss": 1.353, "step": 240900 }, { "epoch": 9.137094328177131, "grad_norm": 13.859965324401855, "learning_rate": 4.5022057237445016e-07, "loss": 1.3631, "step": 241000 }, { "epoch": 9.140885653624506, "grad_norm": 12.730277061462402, "learning_rate": 4.463026496110201e-07, "loss": 1.3628, "step": 241100 }, { "epoch": 9.144676979071884, "grad_norm": 11.987946510314941, "learning_rate": 4.424014596342541e-07, "loss": 1.3537, "step": 241200 }, { "epoch": 9.14846830451926, "grad_norm": 12.134753227233887, "learning_rate": 4.385170092768809e-07, "loss": 1.3867, "step": 241300 }, { "epoch": 9.152259629966636, "grad_norm": 12.702415466308594, "learning_rate": 4.346493053423062e-07, "loss": 1.3664, "step": 241400 }, { "epoch": 9.156050955414013, "grad_norm": 12.732091903686523, "learning_rate": 4.307983546046024e-07, "loss": 1.3614, "step": 241500 }, { "epoch": 9.159842280861389, "grad_norm": 11.617935180664062, "learning_rate": 4.2696416380850313e-07, "loss": 1.3446, "step": 241600 }, { "epoch": 9.163633606308766, "grad_norm": 13.135943412780762, "learning_rate": 4.231467396693867e-07, "loss": 1.3576, "step": 241700 }, { "epoch": 9.167424931756141, "grad_norm": 12.336230278015137, "learning_rate": 4.193460888732692e-07, "loss": 1.3489, "step": 241800 }, { "epoch": 9.171216257203518, "grad_norm": 13.185897827148438, "learning_rate": 4.1556221807678263e-07, "loss": 1.364, "step": 241900 }, { "epoch": 9.175007582650895, "grad_norm": 12.380581855773926, "learning_rate": 4.117951339071746e-07, "loss": 1.3436, "step": 242000 }, { "epoch": 9.17879890809827, "grad_norm": 13.203110694885254, "learning_rate": 4.0804484296228965e-07, "loss": 1.3668, "step": 242100 }, { "epoch": 9.182590233545648, "grad_norm": 11.405385971069336, "learning_rate": 4.0431135181056016e-07, "loss": 1.3771, "step": 242200 }, { "epoch": 9.186381558993023, "grad_norm": 12.155765533447266, "learning_rate": 4.0059466699099546e-07, "loss": 1.3673, "step": 242300 }, { "epoch": 9.1901728844404, "grad_norm": 12.081136703491211, "learning_rate": 3.9689479501316965e-07, "loss": 1.3607, "step": 242400 }, { "epoch": 9.193964209887778, "grad_norm": 12.475247383117676, "learning_rate": 3.9321174235720796e-07, "loss": 1.3607, "step": 242500 }, { "epoch": 9.197755535335153, "grad_norm": 13.014174461364746, "learning_rate": 3.8954551547377817e-07, "loss": 1.3534, "step": 242600 }, { "epoch": 9.20154686078253, "grad_norm": 11.007379531860352, "learning_rate": 3.8589612078408277e-07, "loss": 1.3444, "step": 242700 }, { "epoch": 9.205338186229906, "grad_norm": 12.15437126159668, "learning_rate": 3.8226356467983785e-07, "loss": 1.3132, "step": 242800 }, { "epoch": 9.209129511677283, "grad_norm": 12.152040481567383, "learning_rate": 3.786478535232696e-07, "loss": 1.3766, "step": 242900 }, { "epoch": 9.212920837124658, "grad_norm": 11.822118759155273, "learning_rate": 3.7504899364710137e-07, "loss": 1.3502, "step": 243000 }, { "epoch": 9.216712162572035, "grad_norm": 11.338650703430176, "learning_rate": 3.7146699135454324e-07, "loss": 1.3203, "step": 243100 }, { "epoch": 9.220503488019412, "grad_norm": 11.903205871582031, "learning_rate": 3.6790185291928015e-07, "loss": 1.3405, "step": 243200 }, { "epoch": 9.224294813466788, "grad_norm": 11.894763946533203, "learning_rate": 3.6435358458545846e-07, "loss": 1.3581, "step": 243300 }, { "epoch": 9.228086138914165, "grad_norm": 11.834014892578125, "learning_rate": 3.6082219256768025e-07, "loss": 1.3571, "step": 243400 }, { "epoch": 9.23187746436154, "grad_norm": 11.740354537963867, "learning_rate": 3.573076830509881e-07, "loss": 1.3045, "step": 243500 }, { "epoch": 9.235668789808917, "grad_norm": 12.908510208129883, "learning_rate": 3.538100621908569e-07, "loss": 1.379, "step": 243600 }, { "epoch": 9.239460115256293, "grad_norm": 12.06467342376709, "learning_rate": 3.5032933611317987e-07, "loss": 1.3655, "step": 243700 }, { "epoch": 9.24325144070367, "grad_norm": 10.683778762817383, "learning_rate": 3.468655109142649e-07, "loss": 1.3358, "step": 243800 }, { "epoch": 9.247042766151047, "grad_norm": 12.14107608795166, "learning_rate": 3.434185926608091e-07, "loss": 1.377, "step": 243900 }, { "epoch": 9.250834091598422, "grad_norm": 11.577484130859375, "learning_rate": 3.3998858738991004e-07, "loss": 1.3414, "step": 244000 }, { "epoch": 9.2546254170458, "grad_norm": 13.198162078857422, "learning_rate": 3.365755011090366e-07, "loss": 1.3797, "step": 244100 }, { "epoch": 9.258416742493175, "grad_norm": 11.683913230895996, "learning_rate": 3.331793397960237e-07, "loss": 1.3677, "step": 244200 }, { "epoch": 9.262208067940552, "grad_norm": 11.068657875061035, "learning_rate": 3.2980010939906435e-07, "loss": 1.3598, "step": 244300 }, { "epoch": 9.26599939338793, "grad_norm": 12.268835067749023, "learning_rate": 3.2643781583670096e-07, "loss": 1.38, "step": 244400 }, { "epoch": 9.269790718835305, "grad_norm": 12.091774940490723, "learning_rate": 3.2309246499780953e-07, "loss": 1.3542, "step": 244500 }, { "epoch": 9.273582044282682, "grad_norm": 13.10727596282959, "learning_rate": 3.197640627415899e-07, "loss": 1.345, "step": 244600 }, { "epoch": 9.277373369730057, "grad_norm": 12.954718589782715, "learning_rate": 3.164526148975622e-07, "loss": 1.3518, "step": 244700 }, { "epoch": 9.281164695177434, "grad_norm": 14.055831909179688, "learning_rate": 3.1315812726554707e-07, "loss": 1.3415, "step": 244800 }, { "epoch": 9.28495602062481, "grad_norm": 13.444005966186523, "learning_rate": 3.0988060561566335e-07, "loss": 1.3584, "step": 244900 }, { "epoch": 9.288747346072187, "grad_norm": 12.843548774719238, "learning_rate": 3.0662005568831254e-07, "loss": 1.3963, "step": 245000 }, { "epoch": 9.292538671519564, "grad_norm": 12.494268417358398, "learning_rate": 3.0337648319417543e-07, "loss": 1.3439, "step": 245100 }, { "epoch": 9.29632999696694, "grad_norm": 11.615918159484863, "learning_rate": 3.0014989381419113e-07, "loss": 1.3709, "step": 245200 }, { "epoch": 9.300121322414316, "grad_norm": 12.33356761932373, "learning_rate": 2.9694029319955685e-07, "loss": 1.3762, "step": 245300 }, { "epoch": 9.303912647861692, "grad_norm": 12.230978012084961, "learning_rate": 2.937476869717193e-07, "loss": 1.3336, "step": 245400 }, { "epoch": 9.307703973309069, "grad_norm": 12.333333015441895, "learning_rate": 2.905720807223544e-07, "loss": 1.3773, "step": 245500 }, { "epoch": 9.311495298756446, "grad_norm": 12.07331371307373, "learning_rate": 2.8741348001336436e-07, "loss": 1.3562, "step": 245600 }, { "epoch": 9.315286624203821, "grad_norm": 10.544958114624023, "learning_rate": 2.8427189037686933e-07, "loss": 1.342, "step": 245700 }, { "epoch": 9.319077949651199, "grad_norm": 12.38078784942627, "learning_rate": 2.811473173151935e-07, "loss": 1.3641, "step": 245800 }, { "epoch": 9.322869275098574, "grad_norm": 11.659907341003418, "learning_rate": 2.780397663008605e-07, "loss": 1.3684, "step": 245900 }, { "epoch": 9.326660600545951, "grad_norm": 12.453530311584473, "learning_rate": 2.7494924277657766e-07, "loss": 1.3603, "step": 246000 }, { "epoch": 9.330451925993327, "grad_norm": 12.124307632446289, "learning_rate": 2.7187575215523067e-07, "loss": 1.3679, "step": 246100 }, { "epoch": 9.334243251440704, "grad_norm": 10.759623527526855, "learning_rate": 2.688192998198713e-07, "loss": 1.3657, "step": 246200 }, { "epoch": 9.33803457688808, "grad_norm": 12.0177583694458, "learning_rate": 2.6577989112371304e-07, "loss": 1.3453, "step": 246300 }, { "epoch": 9.341825902335456, "grad_norm": 11.817933082580566, "learning_rate": 2.627575313901176e-07, "loss": 1.3335, "step": 246400 }, { "epoch": 9.345617227782833, "grad_norm": 12.042447090148926, "learning_rate": 2.5975222591258287e-07, "loss": 1.3507, "step": 246500 }, { "epoch": 9.349408553230209, "grad_norm": 12.917941093444824, "learning_rate": 2.5676397995474056e-07, "loss": 1.378, "step": 246600 }, { "epoch": 9.353199878677586, "grad_norm": 13.921685218811035, "learning_rate": 2.537927987503419e-07, "loss": 1.3457, "step": 246700 }, { "epoch": 9.356991204124963, "grad_norm": 11.27734375, "learning_rate": 2.5083868750325424e-07, "loss": 1.3479, "step": 246800 }, { "epoch": 9.360782529572338, "grad_norm": 12.427639961242676, "learning_rate": 2.4790165138743996e-07, "loss": 1.3631, "step": 246900 }, { "epoch": 9.364573855019716, "grad_norm": 11.944756507873535, "learning_rate": 2.4498169554696197e-07, "loss": 1.3737, "step": 247000 }, { "epoch": 9.36836518046709, "grad_norm": 13.016319274902344, "learning_rate": 2.42078825095966e-07, "loss": 1.373, "step": 247100 }, { "epoch": 9.372156505914468, "grad_norm": 11.824865341186523, "learning_rate": 2.39193045118673e-07, "loss": 1.3613, "step": 247200 }, { "epoch": 9.375947831361843, "grad_norm": 12.41222858428955, "learning_rate": 2.3632436066937104e-07, "loss": 1.331, "step": 247300 }, { "epoch": 9.37973915680922, "grad_norm": 11.010222434997559, "learning_rate": 2.3347277677240654e-07, "loss": 1.3456, "step": 247400 }, { "epoch": 9.383530482256598, "grad_norm": 12.733603477478027, "learning_rate": 2.3063829842217557e-07, "loss": 1.3372, "step": 247500 }, { "epoch": 9.387321807703973, "grad_norm": 12.40368938446045, "learning_rate": 2.2782093058311249e-07, "loss": 1.357, "step": 247600 }, { "epoch": 9.39111313315135, "grad_norm": 12.589859008789062, "learning_rate": 2.250206781896891e-07, "loss": 1.3371, "step": 247700 }, { "epoch": 9.394904458598726, "grad_norm": 13.365696907043457, "learning_rate": 2.2223754614639437e-07, "loss": 1.3448, "step": 247800 }, { "epoch": 9.398695784046103, "grad_norm": 12.049545288085938, "learning_rate": 2.1947153932773358e-07, "loss": 1.352, "step": 247900 }, { "epoch": 9.402487109493478, "grad_norm": 11.439803123474121, "learning_rate": 2.167226625782204e-07, "loss": 1.3616, "step": 248000 }, { "epoch": 9.406278434940855, "grad_norm": 11.749093055725098, "learning_rate": 2.139909207123636e-07, "loss": 1.348, "step": 248100 }, { "epoch": 9.410069760388232, "grad_norm": 12.662898063659668, "learning_rate": 2.1127631851466268e-07, "loss": 1.3296, "step": 248200 }, { "epoch": 9.413861085835608, "grad_norm": 12.465304374694824, "learning_rate": 2.0857886073959666e-07, "loss": 1.3893, "step": 248300 }, { "epoch": 9.417652411282985, "grad_norm": 11.842853546142578, "learning_rate": 2.0589855211161859e-07, "loss": 1.3614, "step": 248400 }, { "epoch": 9.42144373673036, "grad_norm": 12.417558670043945, "learning_rate": 2.032353973251422e-07, "loss": 1.3732, "step": 248500 }, { "epoch": 9.425235062177737, "grad_norm": 12.443554878234863, "learning_rate": 2.0058940104454305e-07, "loss": 1.3574, "step": 248600 }, { "epoch": 9.429026387625115, "grad_norm": 11.214204788208008, "learning_rate": 1.9796056790414075e-07, "loss": 1.3553, "step": 248700 }, { "epoch": 9.43281771307249, "grad_norm": 12.169547080993652, "learning_rate": 1.9534890250819227e-07, "loss": 1.3456, "step": 248800 }, { "epoch": 9.436609038519867, "grad_norm": 11.959320068359375, "learning_rate": 1.9275440943089086e-07, "loss": 1.365, "step": 248900 }, { "epoch": 9.440400363967242, "grad_norm": 12.322478294372559, "learning_rate": 1.9017709321635158e-07, "loss": 1.3579, "step": 249000 }, { "epoch": 9.44419168941462, "grad_norm": 11.994853019714355, "learning_rate": 1.8761695837860582e-07, "loss": 1.3549, "step": 249100 }, { "epoch": 9.447983014861995, "grad_norm": 12.763020515441895, "learning_rate": 1.8507400940159014e-07, "loss": 1.3732, "step": 249200 }, { "epoch": 9.451774340309372, "grad_norm": 11.718242645263672, "learning_rate": 1.8254825073914295e-07, "loss": 1.3855, "step": 249300 }, { "epoch": 9.45556566575675, "grad_norm": 11.259212493896484, "learning_rate": 1.800396868149956e-07, "loss": 1.3497, "step": 249400 }, { "epoch": 9.459356991204125, "grad_norm": 13.890626907348633, "learning_rate": 1.775483220227636e-07, "loss": 1.354, "step": 249500 }, { "epoch": 9.463148316651502, "grad_norm": 11.976237297058105, "learning_rate": 1.7507416072593653e-07, "loss": 1.3389, "step": 249600 }, { "epoch": 9.466939642098877, "grad_norm": 12.011896133422852, "learning_rate": 1.726172072578758e-07, "loss": 1.3269, "step": 249700 }, { "epoch": 9.470730967546254, "grad_norm": 12.871967315673828, "learning_rate": 1.701774659218014e-07, "loss": 1.3579, "step": 249800 }, { "epoch": 9.47452229299363, "grad_norm": 11.391215324401855, "learning_rate": 1.6775494099078971e-07, "loss": 1.3368, "step": 249900 }, { "epoch": 9.478313618441007, "grad_norm": 11.781603813171387, "learning_rate": 1.653496367077645e-07, "loss": 1.3672, "step": 250000 }, { "epoch": 9.482104943888384, "grad_norm": 11.652870178222656, "learning_rate": 1.629615572854859e-07, "loss": 1.3598, "step": 250100 }, { "epoch": 9.48589626933576, "grad_norm": 11.833457946777344, "learning_rate": 1.60590706906546e-07, "loss": 1.3488, "step": 250200 }, { "epoch": 9.489687594783137, "grad_norm": 10.907110214233398, "learning_rate": 1.5823708972336206e-07, "loss": 1.357, "step": 250300 }, { "epoch": 9.493478920230512, "grad_norm": 13.253046989440918, "learning_rate": 1.5590070985816663e-07, "loss": 1.3807, "step": 250400 }, { "epoch": 9.497270245677889, "grad_norm": 12.510509490966797, "learning_rate": 1.5358157140300423e-07, "loss": 1.3362, "step": 250500 }, { "epoch": 9.501061571125266, "grad_norm": 12.2095308303833, "learning_rate": 1.5127967841972125e-07, "loss": 1.3575, "step": 250600 }, { "epoch": 9.504852896572642, "grad_norm": 11.592772483825684, "learning_rate": 1.4899503493995938e-07, "loss": 1.3371, "step": 250700 }, { "epoch": 9.508644222020019, "grad_norm": 12.52585220336914, "learning_rate": 1.467276449651478e-07, "loss": 1.3463, "step": 250800 }, { "epoch": 9.512435547467394, "grad_norm": 10.477096557617188, "learning_rate": 1.4447751246649877e-07, "loss": 1.3418, "step": 250900 }, { "epoch": 9.516226872914771, "grad_norm": 11.485027313232422, "learning_rate": 1.4224464138499872e-07, "loss": 1.3499, "step": 251000 }, { "epoch": 9.520018198362147, "grad_norm": 11.98810863494873, "learning_rate": 1.4002903563140047e-07, "loss": 1.3364, "step": 251100 }, { "epoch": 9.523809523809524, "grad_norm": 11.184930801391602, "learning_rate": 1.3783069908621772e-07, "loss": 1.3863, "step": 251200 }, { "epoch": 9.5276008492569, "grad_norm": 12.607863426208496, "learning_rate": 1.3564963559971944e-07, "loss": 1.3507, "step": 251300 }, { "epoch": 9.531392174704276, "grad_norm": 11.500112533569336, "learning_rate": 1.3348584899192107e-07, "loss": 1.3578, "step": 251400 }, { "epoch": 9.535183500151653, "grad_norm": 13.704484939575195, "learning_rate": 1.3133934305257778e-07, "loss": 1.335, "step": 251500 }, { "epoch": 9.538974825599029, "grad_norm": 11.886691093444824, "learning_rate": 1.2921012154118006e-07, "loss": 1.3454, "step": 251600 }, { "epoch": 9.542766151046406, "grad_norm": 12.547369003295898, "learning_rate": 1.2709818818694264e-07, "loss": 1.3311, "step": 251700 }, { "epoch": 9.546557476493783, "grad_norm": 13.166351318359375, "learning_rate": 1.2500354668880554e-07, "loss": 1.3405, "step": 251800 }, { "epoch": 9.550348801941158, "grad_norm": 12.186589241027832, "learning_rate": 1.229262007154197e-07, "loss": 1.3701, "step": 251900 }, { "epoch": 9.554140127388536, "grad_norm": 13.21019458770752, "learning_rate": 1.2086615390514477e-07, "loss": 1.3456, "step": 252000 }, { "epoch": 9.557931452835911, "grad_norm": 12.132020950317383, "learning_rate": 1.1882340986604123e-07, "loss": 1.3672, "step": 252100 }, { "epoch": 9.561722778283288, "grad_norm": 11.118840217590332, "learning_rate": 1.1679797217586719e-07, "loss": 1.3652, "step": 252200 }, { "epoch": 9.565514103730663, "grad_norm": 12.081765174865723, "learning_rate": 1.1478984438206497e-07, "loss": 1.3531, "step": 252300 }, { "epoch": 9.56930542917804, "grad_norm": 11.9998197555542, "learning_rate": 1.1279903000176562e-07, "loss": 1.3619, "step": 252400 }, { "epoch": 9.573096754625418, "grad_norm": 11.66572093963623, "learning_rate": 1.1082553252176998e-07, "loss": 1.3562, "step": 252500 }, { "epoch": 9.576888080072793, "grad_norm": 11.024676322937012, "learning_rate": 1.0886935539855425e-07, "loss": 1.367, "step": 252600 }, { "epoch": 9.58067940552017, "grad_norm": 12.373724937438965, "learning_rate": 1.0693050205825784e-07, "loss": 1.3714, "step": 252700 }, { "epoch": 9.584470730967546, "grad_norm": 14.374722480773926, "learning_rate": 1.0500897589667769e-07, "loss": 1.3479, "step": 252800 }, { "epoch": 9.588262056414923, "grad_norm": 11.23594856262207, "learning_rate": 1.0310478027926285e-07, "loss": 1.358, "step": 252900 }, { "epoch": 9.5920533818623, "grad_norm": 13.274566650390625, "learning_rate": 1.0121791854110996e-07, "loss": 1.3481, "step": 253000 }, { "epoch": 9.595844707309675, "grad_norm": 12.90688705444336, "learning_rate": 9.93483939869555e-08, "loss": 1.3315, "step": 253100 }, { "epoch": 9.599636032757052, "grad_norm": 12.44372844696045, "learning_rate": 9.749620989117136e-08, "loss": 1.3619, "step": 253200 }, { "epoch": 9.603427358204428, "grad_norm": 12.390811920166016, "learning_rate": 9.566136949775817e-08, "loss": 1.3724, "step": 253300 }, { "epoch": 9.607218683651805, "grad_norm": 13.405505180358887, "learning_rate": 9.384387602034084e-08, "loss": 1.3472, "step": 253400 }, { "epoch": 9.61101000909918, "grad_norm": 12.47612190246582, "learning_rate": 9.20437326421597e-08, "loss": 1.3601, "step": 253500 }, { "epoch": 9.614801334546557, "grad_norm": 11.974573135375977, "learning_rate": 9.026094251607054e-08, "loss": 1.3737, "step": 253600 }, { "epoch": 9.618592659993935, "grad_norm": 11.978750228881836, "learning_rate": 8.84955087645345e-08, "loss": 1.3655, "step": 253700 }, { "epoch": 9.62238398544131, "grad_norm": 12.345575332641602, "learning_rate": 8.674743447961154e-08, "loss": 1.3616, "step": 253800 }, { "epoch": 9.626175310888687, "grad_norm": 11.176697731018066, "learning_rate": 8.501672272296146e-08, "loss": 1.3575, "step": 253900 }, { "epoch": 9.629966636336063, "grad_norm": 10.914751052856445, "learning_rate": 8.330337652583287e-08, "loss": 1.391, "step": 254000 }, { "epoch": 9.63375796178344, "grad_norm": 12.183568954467773, "learning_rate": 8.160739888905867e-08, "loss": 1.3432, "step": 254100 }, { "epoch": 9.637549287230815, "grad_norm": 10.939971923828125, "learning_rate": 7.992879278305276e-08, "loss": 1.3783, "step": 254200 }, { "epoch": 9.641340612678192, "grad_norm": 11.264266967773438, "learning_rate": 7.826756114780343e-08, "loss": 1.3359, "step": 254300 }, { "epoch": 9.64513193812557, "grad_norm": 12.182567596435547, "learning_rate": 7.66237068928688e-08, "loss": 1.3349, "step": 254400 }, { "epoch": 9.648923263572945, "grad_norm": 14.312700271606445, "learning_rate": 7.499723289737026e-08, "loss": 1.3385, "step": 254500 }, { "epoch": 9.652714589020322, "grad_norm": 11.032988548278809, "learning_rate": 7.338814200998912e-08, "loss": 1.3668, "step": 254600 }, { "epoch": 9.656505914467697, "grad_norm": 11.53009033203125, "learning_rate": 7.179643704896433e-08, "loss": 1.3592, "step": 254700 }, { "epoch": 9.660297239915074, "grad_norm": 12.249180793762207, "learning_rate": 7.022212080207702e-08, "loss": 1.331, "step": 254800 }, { "epoch": 9.664088565362452, "grad_norm": 11.950242042541504, "learning_rate": 6.866519602666044e-08, "loss": 1.356, "step": 254900 }, { "epoch": 9.667879890809827, "grad_norm": 11.610028266906738, "learning_rate": 6.712566544958443e-08, "loss": 1.3443, "step": 255000 }, { "epoch": 9.671671216257204, "grad_norm": 12.945029258728027, "learning_rate": 6.56035317672532e-08, "loss": 1.3552, "step": 255100 }, { "epoch": 9.67546254170458, "grad_norm": 12.27959156036377, "learning_rate": 6.409879764560089e-08, "loss": 1.3519, "step": 255200 }, { "epoch": 9.679253867151957, "grad_norm": 12.127484321594238, "learning_rate": 6.261146572009047e-08, "loss": 1.351, "step": 255300 }, { "epoch": 9.683045192599332, "grad_norm": 12.983355522155762, "learning_rate": 6.11415385957026e-08, "loss": 1.3595, "step": 255400 }, { "epoch": 9.686836518046709, "grad_norm": 11.549211502075195, "learning_rate": 5.968901884693567e-08, "loss": 1.3376, "step": 255500 }, { "epoch": 9.690627843494086, "grad_norm": 12.067475318908691, "learning_rate": 5.825390901779915e-08, "loss": 1.3725, "step": 255600 }, { "epoch": 9.694419168941462, "grad_norm": 11.431747436523438, "learning_rate": 5.683621162181241e-08, "loss": 1.336, "step": 255700 }, { "epoch": 9.698210494388839, "grad_norm": 12.497295379638672, "learning_rate": 5.543592914199369e-08, "loss": 1.3697, "step": 255800 }, { "epoch": 9.702001819836214, "grad_norm": 10.706396102905273, "learning_rate": 5.405306403086452e-08, "loss": 1.3453, "step": 255900 }, { "epoch": 9.705793145283591, "grad_norm": 11.963174819946289, "learning_rate": 5.2687618710438593e-08, "loss": 1.3648, "step": 256000 }, { "epoch": 9.709584470730967, "grad_norm": 14.023811340332031, "learning_rate": 5.1339595572218456e-08, "loss": 1.3493, "step": 256100 }, { "epoch": 9.713375796178344, "grad_norm": 11.60668659210205, "learning_rate": 5.000899697719552e-08, "loss": 1.3468, "step": 256200 }, { "epoch": 9.717167121625721, "grad_norm": 12.333610534667969, "learning_rate": 4.8695825255840043e-08, "loss": 1.3757, "step": 256300 }, { "epoch": 9.720958447073096, "grad_norm": 12.220203399658203, "learning_rate": 4.740008270810226e-08, "loss": 1.3511, "step": 256400 }, { "epoch": 9.724749772520473, "grad_norm": 12.462227821350098, "learning_rate": 4.612177160340681e-08, "loss": 1.3653, "step": 256500 }, { "epoch": 9.728541097967849, "grad_norm": 13.274474143981934, "learning_rate": 4.486089418064499e-08, "loss": 1.3758, "step": 256600 }, { "epoch": 9.732332423415226, "grad_norm": 12.773098945617676, "learning_rate": 4.361745264817696e-08, "loss": 1.3665, "step": 256700 }, { "epoch": 9.736123748862603, "grad_norm": 11.732959747314453, "learning_rate": 4.239144918382287e-08, "loss": 1.3601, "step": 256800 }, { "epoch": 9.739915074309978, "grad_norm": 11.212507247924805, "learning_rate": 4.118288593486175e-08, "loss": 1.3355, "step": 256900 }, { "epoch": 9.743706399757356, "grad_norm": 11.269777297973633, "learning_rate": 3.999176501802815e-08, "loss": 1.3343, "step": 257000 }, { "epoch": 9.747497725204731, "grad_norm": 12.410697937011719, "learning_rate": 3.881808851950553e-08, "loss": 1.3671, "step": 257100 }, { "epoch": 9.751289050652108, "grad_norm": 12.207523345947266, "learning_rate": 3.766185849492399e-08, "loss": 1.3612, "step": 257200 }, { "epoch": 9.755080376099485, "grad_norm": 11.212423324584961, "learning_rate": 3.6523076969359196e-08, "loss": 1.384, "step": 257300 }, { "epoch": 9.75887170154686, "grad_norm": 11.700383186340332, "learning_rate": 3.5401745937326815e-08, "loss": 1.3619, "step": 257400 }, { "epoch": 9.762663026994238, "grad_norm": 13.542241096496582, "learning_rate": 3.429786736277585e-08, "loss": 1.3668, "step": 257500 }, { "epoch": 9.766454352441613, "grad_norm": 12.420721054077148, "learning_rate": 3.321144317909197e-08, "loss": 1.3351, "step": 257600 }, { "epoch": 9.77024567788899, "grad_norm": 12.954392433166504, "learning_rate": 3.214247528908754e-08, "loss": 1.3591, "step": 257700 }, { "epoch": 9.774037003336366, "grad_norm": 10.561363220214844, "learning_rate": 3.109096556500158e-08, "loss": 1.3368, "step": 257800 }, { "epoch": 9.777828328783743, "grad_norm": 13.146068572998047, "learning_rate": 3.005691584849868e-08, "loss": 1.3384, "step": 257900 }, { "epoch": 9.78161965423112, "grad_norm": 12.308815956115723, "learning_rate": 2.904032795066014e-08, "loss": 1.3572, "step": 258000 }, { "epoch": 9.785410979678495, "grad_norm": 12.615276336669922, "learning_rate": 2.8041203651985037e-08, "loss": 1.3663, "step": 258100 }, { "epoch": 9.789202305125873, "grad_norm": 12.344051361083984, "learning_rate": 2.70595447023847e-08, "loss": 1.3622, "step": 258200 }, { "epoch": 9.792993630573248, "grad_norm": 13.285365104675293, "learning_rate": 2.6095352821184916e-08, "loss": 1.3498, "step": 258300 }, { "epoch": 9.796784956020625, "grad_norm": 12.593259811401367, "learning_rate": 2.5148629697112625e-08, "loss": 1.3248, "step": 258400 }, { "epoch": 9.800576281468, "grad_norm": 11.824785232543945, "learning_rate": 2.421937698830479e-08, "loss": 1.3499, "step": 258500 }, { "epoch": 9.804367606915378, "grad_norm": 14.515942573547363, "learning_rate": 2.3307596322296178e-08, "loss": 1.3467, "step": 258600 }, { "epoch": 9.808158932362755, "grad_norm": 11.712870597839355, "learning_rate": 2.24132892960216e-08, "loss": 1.3983, "step": 258700 }, { "epoch": 9.81195025781013, "grad_norm": 11.883424758911133, "learning_rate": 2.1536457475812568e-08, "loss": 1.3582, "step": 258800 }, { "epoch": 9.815741583257507, "grad_norm": 11.924446105957031, "learning_rate": 2.0677102397391734e-08, "loss": 1.3784, "step": 258900 }, { "epoch": 9.819532908704883, "grad_norm": 11.502227783203125, "learning_rate": 1.9835225565874026e-08, "loss": 1.3436, "step": 259000 }, { "epoch": 9.82332423415226, "grad_norm": 11.912199020385742, "learning_rate": 1.9010828455761078e-08, "loss": 1.3636, "step": 259100 }, { "epoch": 9.827115559599637, "grad_norm": 12.016336441040039, "learning_rate": 1.8203912510940114e-08, "loss": 1.3826, "step": 259200 }, { "epoch": 9.830906885047012, "grad_norm": 12.02882194519043, "learning_rate": 1.741447914468064e-08, "loss": 1.3638, "step": 259300 }, { "epoch": 9.83469821049439, "grad_norm": 11.961102485656738, "learning_rate": 1.6642529739633316e-08, "loss": 1.3526, "step": 259400 }, { "epoch": 9.838489535941765, "grad_norm": 11.629354476928711, "learning_rate": 1.5888065647825522e-08, "loss": 1.3572, "step": 259500 }, { "epoch": 9.842280861389142, "grad_norm": 11.616954803466797, "learning_rate": 1.5151088190661355e-08, "loss": 1.3344, "step": 259600 }, { "epoch": 9.846072186836517, "grad_norm": 12.569297790527344, "learning_rate": 1.4431598658916079e-08, "loss": 1.3566, "step": 259700 }, { "epoch": 9.849863512283894, "grad_norm": 11.228477478027344, "learning_rate": 1.3729598312737235e-08, "loss": 1.3352, "step": 259800 }, { "epoch": 9.853654837731272, "grad_norm": 12.68139362335205, "learning_rate": 1.3045088381641314e-08, "loss": 1.355, "step": 259900 }, { "epoch": 9.857446163178647, "grad_norm": 12.226519584655762, "learning_rate": 1.2378070064509307e-08, "loss": 1.3165, "step": 260000 }, { "epoch": 9.861237488626024, "grad_norm": 12.641012191772461, "learning_rate": 1.1728544529588936e-08, "loss": 1.3671, "step": 260100 }, { "epoch": 9.8650288140734, "grad_norm": 11.181568145751953, "learning_rate": 1.109651291448799e-08, "loss": 1.3364, "step": 260200 }, { "epoch": 9.868820139520777, "grad_norm": 11.605782508850098, "learning_rate": 1.0481976326177646e-08, "loss": 1.3516, "step": 260300 }, { "epoch": 9.872611464968152, "grad_norm": 12.681472778320312, "learning_rate": 9.884935840984711e-09, "loss": 1.3502, "step": 260400 }, { "epoch": 9.87640279041553, "grad_norm": 12.962564468383789, "learning_rate": 9.305392504592726e-09, "loss": 1.3421, "step": 260500 }, { "epoch": 9.880194115862906, "grad_norm": 12.582594871520996, "learning_rate": 8.743347332041962e-09, "loss": 1.357, "step": 260600 }, { "epoch": 9.883985441310282, "grad_norm": 12.529866218566895, "learning_rate": 8.19880130772388e-09, "loss": 1.3444, "step": 260700 }, { "epoch": 9.887776766757659, "grad_norm": 12.147127151489258, "learning_rate": 7.671755385381119e-09, "loss": 1.3563, "step": 260800 }, { "epoch": 9.891568092205034, "grad_norm": 11.384781837463379, "learning_rate": 7.162210488106392e-09, "loss": 1.3316, "step": 260900 }, { "epoch": 9.895359417652411, "grad_norm": 11.740744590759277, "learning_rate": 6.6701675083402636e-09, "loss": 1.3496, "step": 261000 }, { "epoch": 9.899150743099788, "grad_norm": 13.272279739379883, "learning_rate": 6.1956273078700445e-09, "loss": 1.3678, "step": 261100 }, { "epoch": 9.902942068547164, "grad_norm": 12.729235649108887, "learning_rate": 5.738590717826453e-09, "loss": 1.3314, "step": 261200 }, { "epoch": 9.906733393994541, "grad_norm": 13.740254402160645, "learning_rate": 5.299058538683621e-09, "loss": 1.3849, "step": 261300 }, { "epoch": 9.910524719441916, "grad_norm": 13.462955474853516, "learning_rate": 4.877031540261312e-09, "loss": 1.3533, "step": 261400 }, { "epoch": 9.914316044889294, "grad_norm": 11.414237976074219, "learning_rate": 4.47251046171493e-09, "loss": 1.36, "step": 261500 }, { "epoch": 9.918107370336669, "grad_norm": 14.275734901428223, "learning_rate": 4.085496011542178e-09, "loss": 1.3517, "step": 261600 }, { "epoch": 9.921898695784046, "grad_norm": 13.304903030395508, "learning_rate": 3.7159888675775137e-09, "loss": 1.3755, "step": 261700 }, { "epoch": 9.925690021231423, "grad_norm": 12.437847137451172, "learning_rate": 3.3639896769932513e-09, "loss": 1.3331, "step": 261800 }, { "epoch": 9.929481346678799, "grad_norm": 11.855758666992188, "learning_rate": 3.029499056297347e-09, "loss": 1.3519, "step": 261900 }, { "epoch": 9.933272672126176, "grad_norm": 11.797375679016113, "learning_rate": 2.7125175913311763e-09, "loss": 1.3425, "step": 262000 }, { "epoch": 9.937063997573551, "grad_norm": 12.923712730407715, "learning_rate": 2.4130458372717546e-09, "loss": 1.3706, "step": 262100 }, { "epoch": 9.940855323020928, "grad_norm": 11.669929504394531, "learning_rate": 2.1310843186261864e-09, "loss": 1.3558, "step": 262200 }, { "epoch": 9.944646648468304, "grad_norm": 11.238569259643555, "learning_rate": 1.866633529236106e-09, "loss": 1.3635, "step": 262300 }, { "epoch": 9.94843797391568, "grad_norm": 12.64110279083252, "learning_rate": 1.6196939322732364e-09, "loss": 1.3701, "step": 262400 }, { "epoch": 9.952229299363058, "grad_norm": 10.803077697753906, "learning_rate": 1.3902659602382795e-09, "loss": 1.3654, "step": 262500 }, { "epoch": 9.956020624810433, "grad_norm": 11.54616928100586, "learning_rate": 1.1783500149620263e-09, "loss": 1.3227, "step": 262600 }, { "epoch": 9.95981195025781, "grad_norm": 11.042337417602539, "learning_rate": 9.839464676031363e-10, "loss": 1.3638, "step": 262700 }, { "epoch": 9.963603275705186, "grad_norm": 13.462218284606934, "learning_rate": 8.070556586503575e-10, "loss": 1.3641, "step": 262800 }, { "epoch": 9.967394601152563, "grad_norm": 13.954395294189453, "learning_rate": 6.476778979180864e-10, "loss": 1.362, "step": 262900 }, { "epoch": 9.97118592659994, "grad_norm": 11.63084602355957, "learning_rate": 5.058134645463675e-10, "loss": 1.3601, "step": 263000 }, { "epoch": 9.974977252047315, "grad_norm": 12.435700416564941, "learning_rate": 3.81462607005334e-10, "loss": 1.3359, "step": 263100 }, { "epoch": 9.978768577494693, "grad_norm": 10.39577865600586, "learning_rate": 2.7462554308743674e-10, "loss": 1.3463, "step": 263200 }, { "epoch": 9.982559902942068, "grad_norm": 12.28831958770752, "learning_rate": 1.853024599129949e-10, "loss": 1.3552, "step": 263300 }, { "epoch": 9.986351228389445, "grad_norm": 11.906715393066406, "learning_rate": 1.1349351392575536e-10, "loss": 1.3341, "step": 263400 }, { "epoch": 9.990142553836822, "grad_norm": 10.182751655578613, "learning_rate": 5.919883089511302e-11, "loss": 1.3601, "step": 263500 }, { "epoch": 9.993933879284198, "grad_norm": 11.972005844116211, "learning_rate": 2.2418505916110834e-11, "loss": 1.3549, "step": 263600 }, { "epoch": 9.997725204731575, "grad_norm": 11.785057067871094, "learning_rate": 3.1526034072193457e-12, "loss": 1.3748, "step": 263700 }, { "epoch": 10.0, "eval_accuracy": 0.5286828336634978, "eval_loss": 2.1286187171936035, "eval_runtime": 936.3883, "eval_samples_per_second": 901.361, "eval_steps_per_second": 7.042, "step": 263760 }, { "epoch": 10.0, "step": 263760, "total_flos": 9.242888597926871e+18, "train_loss": 2.4459246747957275, "train_runtime": 102710.4528, "train_samples_per_second": 328.7, "train_steps_per_second": 2.568 } ], "logging_steps": 100, "max_steps": 263760, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 5000.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.242888597926871e+18, "train_batch_size": 128, "trial_name": null, "trial_params": null }